Fix the first submission for the task case and tidy up code a bit
Browse files- src/submission_uploader.py +33 -38
src/submission_uploader.py
CHANGED
@@ -3,7 +3,7 @@ import logging
|
|
3 |
import os
|
4 |
import time
|
5 |
from tempfile import TemporaryDirectory
|
6 |
-
from typing import
|
7 |
|
8 |
import jsonlines
|
9 |
from huggingface_hub import CommitOperationAdd # type: ignore[import]
|
@@ -30,13 +30,13 @@ class SubmissionUploader:
|
|
30 |
def __init__(self, dataset_id: str, private_dataset_id: str):
|
31 |
self._api = HfApi(token=os.environ["HF_TOKEN"])
|
32 |
self._fs = HfFileSystem(token=os.environ["HF_TOKEN"])
|
33 |
-
self.
|
34 |
-
self.
|
35 |
|
36 |
def _get_previous_pr(self, pr_title: str) -> Optional[Discussion]:
|
37 |
-
"""Searches among discussions of dataset
|
38 |
try:
|
39 |
-
discussions = self._api.get_repo_discussions(repo_id=self.
|
40 |
except Exception:
|
41 |
return None
|
42 |
for discussion in discussions:
|
@@ -44,22 +44,6 @@ class SubmissionUploader:
|
|
44 |
return discussion
|
45 |
return None
|
46 |
|
47 |
-
def _get_metadata(
|
48 |
-
self,
|
49 |
-
model_name_pretty: str,
|
50 |
-
model_availability: str,
|
51 |
-
urls: Optional[str],
|
52 |
-
context_size: str,
|
53 |
-
submitted_by: str,
|
54 |
-
) -> Dict[str, Optional[str]]:
|
55 |
-
return {
|
56 |
-
"model_name": model_name_pretty,
|
57 |
-
"model_availability": model_availability,
|
58 |
-
"urls": urls,
|
59 |
-
"context_size": context_size,
|
60 |
-
"submitted_by": submitted_by,
|
61 |
-
}
|
62 |
-
|
63 |
def _upload_request(
|
64 |
self,
|
65 |
task_id: str,
|
@@ -74,6 +58,7 @@ class SubmissionUploader:
|
|
74 |
pr_url: str,
|
75 |
temp_directory: str,
|
76 |
) -> List[CommitOperationAdd]:
|
|
|
77 |
request_metadata = {
|
78 |
"model_folder": model_folder,
|
79 |
"model_name_pretty": model_name_pretty,
|
@@ -90,7 +75,11 @@ class SubmissionUploader:
|
|
90 |
with open(os.path.join(temp_directory, "request_metadata.json"), "w") as f:
|
91 |
json.dump(request_metadata, f)
|
92 |
|
93 |
-
num_requests_already_present =
|
|
|
|
|
|
|
|
|
94 |
commit_operations = [
|
95 |
CommitOperationAdd(
|
96 |
path_in_repo=f"{task_id}/{num_requests_already_present}_{model_folder}.json",
|
@@ -105,6 +94,7 @@ class SubmissionUploader:
|
|
105 |
model_folder: str,
|
106 |
filenames: List[str],
|
107 |
) -> List[CommitOperationAdd]:
|
|
|
108 |
commit_operations = [
|
109 |
CommitOperationAdd(
|
110 |
path_in_repo=f"{task_id}/predictions/{model_folder}/{os.path.basename(filename)}",
|
@@ -115,6 +105,7 @@ class SubmissionUploader:
|
|
115 |
return commit_operations
|
116 |
|
117 |
def _compute_metrics_for_predictions(self, task_id: str, filenames: List[str], temp_directory: str) -> None:
|
|
|
118 |
metrics_module = METRICS[task_id]
|
119 |
assert metrics_module is not None, f"Computing metrics for {task_id} is not supported."
|
120 |
metrics_module.reset()
|
@@ -153,18 +144,20 @@ class SubmissionUploader:
|
|
153 |
submitted_by: str,
|
154 |
temp_directory: str,
|
155 |
) -> List[CommitOperationAdd]:
|
|
|
156 |
final_results = {}
|
157 |
with open(os.path.join(temp_directory, "final_metrics.json"), "r") as f:
|
158 |
metrics = json.load(f)
|
159 |
final_results.update(metrics)
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
|
|
|
|
166 |
)
|
167 |
-
final_results.update(metadata_dict)
|
168 |
|
169 |
with jsonlines.open(os.path.join(temp_directory, "final_results.jsonl"), "w") as writer:
|
170 |
writer.write(final_results)
|
@@ -189,6 +182,7 @@ class SubmissionUploader:
|
|
189 |
comment: Optional[str],
|
190 |
filenames: Optional[List[str]],
|
191 |
):
|
|
|
192 |
assert task_pretty and task_pretty in TASKS_PRETTY_REVERSE, "Please, select one of the supported tasks."
|
193 |
assert model_folder, "Please, specify non-empty name for a directory with a model's results."
|
194 |
assert model_name_pretty, "Please, specify non-empty name for a model."
|
@@ -238,15 +232,17 @@ class SubmissionUploader:
|
|
238 |
|
239 |
logging.info("Checking if this request has already been submitted...")
|
240 |
if not force:
|
241 |
-
if
|
242 |
return styled_warning(
|
243 |
-
f"{model_folder} is already present in {self.
|
244 |
)
|
245 |
|
246 |
prev_pr = self._get_previous_pr(pr_title)
|
247 |
if prev_pr is not None:
|
248 |
-
url = f"https://huggingface.co/datasets/{self.
|
249 |
-
return styled_warning(
|
|
|
|
|
250 |
|
251 |
logging.info("Processing predictions...")
|
252 |
predictions_commit_operations = self._upload_predictions(
|
@@ -271,9 +267,9 @@ class SubmissionUploader:
|
|
271 |
temp_directory=str(d),
|
272 |
)
|
273 |
|
274 |
-
logging.info(
|
275 |
new_pr = self._api.create_commit(
|
276 |
-
repo_id=self.
|
277 |
operations=predictions_commit_operations + results_commit_operations,
|
278 |
commit_message=pr_title,
|
279 |
commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!\n* Model name: {model_name_pretty}\n* Model availability: {model_availability}\n* Context Size: {context_size}\n* Relevant URLs: {urls}\n* Submitted By: {submitted_by}""",
|
@@ -281,7 +277,7 @@ class SubmissionUploader:
|
|
281 |
repo_type="dataset",
|
282 |
)
|
283 |
|
284 |
-
logging.info(
|
285 |
request_commit_operations = self._upload_request(
|
286 |
task_id=task_id,
|
287 |
model_folder=model_folder,
|
@@ -296,7 +292,7 @@ class SubmissionUploader:
|
|
296 |
pr_url=new_pr.pr_url,
|
297 |
)
|
298 |
self._api.create_commit(
|
299 |
-
repo_id=self.
|
300 |
operations=request_commit_operations,
|
301 |
commit_message=pr_title,
|
302 |
commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!\n* Model name: {model_name_pretty}\n* Model availability: {model_availability}\n* Context Size: {context_size}\n* Relevant URLs: {urls}\n* Submitted By: {submitted_by}\n* PR: {new_pr.pr_url}\n* Contact information: {contact_information}\n* Comment: {comment}""",
|
@@ -307,7 +303,6 @@ class SubmissionUploader:
|
|
307 |
return styled_message(f"🎉 PR created at {new_pr.pr_url}.")
|
308 |
|
309 |
except Exception as e:
|
310 |
-
logging.exception(e)
|
311 |
exception_msg = str(e)
|
312 |
if exception_msg and os.environ["PRIVATE_DATASET_ID"] in exception_msg:
|
313 |
exception_msg = exception_msg.replace(os.environ["PRIVATE_DATASET_ID"], "{private_dataset}")
|
|
|
3 |
import os
|
4 |
import time
|
5 |
from tempfile import TemporaryDirectory
|
6 |
+
from typing import List, Optional
|
7 |
|
8 |
import jsonlines
|
9 |
from huggingface_hub import CommitOperationAdd # type: ignore[import]
|
|
|
30 |
def __init__(self, dataset_id: str, private_dataset_id: str):
|
31 |
self._api = HfApi(token=os.environ["HF_TOKEN"])
|
32 |
self._fs = HfFileSystem(token=os.environ["HF_TOKEN"])
|
33 |
+
self._results_dataset_id = dataset_id
|
34 |
+
self._requests_dataset_id = private_dataset_id
|
35 |
|
36 |
def _get_previous_pr(self, pr_title: str) -> Optional[Discussion]:
|
37 |
+
"""Searches among discussions of the results dataset for a PR with the given title."""
|
38 |
try:
|
39 |
+
discussions = self._api.get_repo_discussions(repo_id=self._results_dataset_id, repo_type="dataset")
|
40 |
except Exception:
|
41 |
return None
|
42 |
for discussion in discussions:
|
|
|
44 |
return discussion
|
45 |
return None
|
46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
def _upload_request(
|
48 |
self,
|
49 |
task_id: str,
|
|
|
58 |
pr_url: str,
|
59 |
temp_directory: str,
|
60 |
) -> List[CommitOperationAdd]:
|
61 |
+
"""Adds a file with metadata about the current request to the requests dataset."""
|
62 |
request_metadata = {
|
63 |
"model_folder": model_folder,
|
64 |
"model_name_pretty": model_name_pretty,
|
|
|
75 |
with open(os.path.join(temp_directory, "request_metadata.json"), "w") as f:
|
76 |
json.dump(request_metadata, f)
|
77 |
|
78 |
+
num_requests_already_present = (
|
79 |
+
len(self._fs.ls(f"datasets/{self._requests_dataset_id}/{task_id}/"))
|
80 |
+
if self._fs.isdir(f"datasets/{self._requests_dataset_id}/{task_id}/")
|
81 |
+
else 0
|
82 |
+
)
|
83 |
commit_operations = [
|
84 |
CommitOperationAdd(
|
85 |
path_in_repo=f"{task_id}/{num_requests_already_present}_{model_folder}.json",
|
|
|
94 |
model_folder: str,
|
95 |
filenames: List[str],
|
96 |
) -> List[CommitOperationAdd]:
|
97 |
+
"""Adds all files with current model's predictions to the results dataset."""
|
98 |
commit_operations = [
|
99 |
CommitOperationAdd(
|
100 |
path_in_repo=f"{task_id}/predictions/{model_folder}/{os.path.basename(filename)}",
|
|
|
105 |
return commit_operations
|
106 |
|
107 |
def _compute_metrics_for_predictions(self, task_id: str, filenames: List[str], temp_directory: str) -> None:
|
108 |
+
"""Computes metrics for each submitted file with the current model's predictions."""
|
109 |
metrics_module = METRICS[task_id]
|
110 |
assert metrics_module is not None, f"Computing metrics for {task_id} is not supported."
|
111 |
metrics_module.reset()
|
|
|
144 |
submitted_by: str,
|
145 |
temp_directory: str,
|
146 |
) -> List[CommitOperationAdd]:
|
147 |
+
"""Adds files with the current model's metrics values to the results dataset."""
|
148 |
final_results = {}
|
149 |
with open(os.path.join(temp_directory, "final_metrics.json"), "r") as f:
|
150 |
metrics = json.load(f)
|
151 |
final_results.update(metrics)
|
152 |
+
final_results.update(
|
153 |
+
{
|
154 |
+
"model_name": model_name_pretty,
|
155 |
+
"model_availability": model_availability,
|
156 |
+
"urls": urls,
|
157 |
+
"context_size": context_size,
|
158 |
+
"submitted_by": submitted_by,
|
159 |
+
}
|
160 |
)
|
|
|
161 |
|
162 |
with jsonlines.open(os.path.join(temp_directory, "final_results.jsonl"), "w") as writer:
|
163 |
writer.write(final_results)
|
|
|
182 |
comment: Optional[str],
|
183 |
filenames: Optional[List[str]],
|
184 |
):
|
185 |
+
"""Verifies that all necessary arguments are not None (and also runs other sanity checks)."""
|
186 |
assert task_pretty and task_pretty in TASKS_PRETTY_REVERSE, "Please, select one of the supported tasks."
|
187 |
assert model_folder, "Please, specify non-empty name for a directory with a model's results."
|
188 |
assert model_name_pretty, "Please, specify non-empty name for a model."
|
|
|
232 |
|
233 |
logging.info("Checking if this request has already been submitted...")
|
234 |
if not force:
|
235 |
+
if self._fs.isdir(f"datasets/{self._results_dataset_id}/{task_id}/predictions/{model_folder}"):
|
236 |
return styled_warning(
|
237 |
+
f"{model_folder} is already present in {self._results_dataset_id}, please, select another folder name."
|
238 |
)
|
239 |
|
240 |
prev_pr = self._get_previous_pr(pr_title)
|
241 |
if prev_pr is not None:
|
242 |
+
url = f"https://huggingface.co/datasets/{self._results_dataset_id}/discussions/{prev_pr.num}"
|
243 |
+
return styled_warning(
|
244 |
+
f"{self._results_dataset_id} already has an open PR for this submission: {url}."
|
245 |
+
)
|
246 |
|
247 |
logging.info("Processing predictions...")
|
248 |
predictions_commit_operations = self._upload_predictions(
|
|
|
267 |
temp_directory=str(d),
|
268 |
)
|
269 |
|
270 |
+
logging.info("Creating commit to the results dataset...")
|
271 |
new_pr = self._api.create_commit(
|
272 |
+
repo_id=self._results_dataset_id,
|
273 |
operations=predictions_commit_operations + results_commit_operations,
|
274 |
commit_message=pr_title,
|
275 |
commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!\n* Model name: {model_name_pretty}\n* Model availability: {model_availability}\n* Context Size: {context_size}\n* Relevant URLs: {urls}\n* Submitted By: {submitted_by}""",
|
|
|
277 |
repo_type="dataset",
|
278 |
)
|
279 |
|
280 |
+
logging.info("Creating commit to the requests dataset...")
|
281 |
request_commit_operations = self._upload_request(
|
282 |
task_id=task_id,
|
283 |
model_folder=model_folder,
|
|
|
292 |
pr_url=new_pr.pr_url,
|
293 |
)
|
294 |
self._api.create_commit(
|
295 |
+
repo_id=self._requests_dataset_id,
|
296 |
operations=request_commit_operations,
|
297 |
commit_message=pr_title,
|
298 |
commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!\n* Model name: {model_name_pretty}\n* Model availability: {model_availability}\n* Context Size: {context_size}\n* Relevant URLs: {urls}\n* Submitted By: {submitted_by}\n* PR: {new_pr.pr_url}\n* Contact information: {contact_information}\n* Comment: {comment}""",
|
|
|
303 |
return styled_message(f"🎉 PR created at {new_pr.pr_url}.")
|
304 |
|
305 |
except Exception as e:
|
|
|
306 |
exception_msg = str(e)
|
307 |
if exception_msg and os.environ["PRIVATE_DATASET_ID"] in exception_msg:
|
308 |
exception_msg = exception_msg.replace(os.environ["PRIVATE_DATASET_ID"], "{private_dataset}")
|