Add a separate dataset for aggregating requests metadata
- app.py: +6 -2
- src/submission_uploader.py: +83 -12
app.py CHANGED

@@ -30,7 +30,9 @@ logging.basicConfig(
     handlers=[logging.StreamHandler()],
 )
 
-submission_uploader = SubmissionUploader(...)
+submission_uploader = SubmissionUploader(
+    dataset_id=os.environ["DATASET_ID"], private_dataset_id=os.environ["PRIVATE_DATASET_ID"]
+)
 
 
 with gr.Blocks() as demo:
@@ -61,7 +63,7 @@ with gr.Blocks() as demo:
         with gr.Column():
             model_folder_textbox = gr.Textbox(
                 label="Model Folder",
-                placeholder="How to call a folder related to this submission in our results dataset.",
+                placeholder="How to call a folder related to this submission in our results dataset (should be unique).",
             )
             model_name_textbox = gr.Textbox(
                 label="Model Name",
@@ -111,6 +113,8 @@ with gr.Blocks() as demo:
             url_textbox,
             context_size_textbox,
             submitted_by_textbox,
+            contact_textbox,
+            comment_textbox,
             file_output,
         ],
         submission_result,
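For context, a minimal sketch of how the Space would be configured after this change. The secret and variable names (HF_TOKEN, DATASET_ID, PRIVATE_DATASET_ID) come from the diff above; the concrete values are placeholders:

import os

from src.submission_uploader import SubmissionUploader

# Placeholder values; the real Space provides these as repository secrets/variables.
os.environ.setdefault("HF_TOKEN", "hf_xxx")
os.environ.setdefault("DATASET_ID", "some-org/results")           # public results dataset
os.environ.setdefault("PRIVATE_DATASET_ID", "some-org/requests")  # private requests dataset

submission_uploader = SubmissionUploader(
    dataset_id=os.environ["DATASET_ID"],
    private_dataset_id=os.environ["PRIVATE_DATASET_ID"],
)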
src/submission_uploader.py CHANGED

@@ -1,6 +1,7 @@
 import json
 import logging
 import os
+import time
 from tempfile import TemporaryDirectory
 from typing import Dict, List, Optional
 
@@ -26,10 +27,11 @@ class SubmissionUploader:
     * https://huggingface.co/spaces/gaia-benchmark/leaderboard
     """
 
-    def __init__(self, dataset_id: str):
+    def __init__(self, dataset_id: str, private_dataset_id: str):
         self._api = HfApi(token=os.environ["HF_TOKEN"])
         self._fs = HfFileSystem(token=os.environ["HF_TOKEN"])
         self._dataset_id = dataset_id
+        self._private_dataset_id = private_dataset_id
 
     def _get_previous_pr(self, pr_title: str) -> Optional[Discussion]:
         """Searches among discussions of dataset repo for a PR with the given title."""
@@ -46,10 +48,10 @@ class SubmissionUploader:
         self,
         model_name_pretty: str,
         model_availability: str,
-        urls: str,
+        urls: Optional[str],
         context_size: str,
         submitted_by: str,
-    ) -> Dict[str, str]:
+    ) -> Dict[str, Optional[str]]:
         return {
             "model_name": model_name_pretty,
             "model_availability": model_availability,
@@ -58,6 +60,45 @@ class SubmissionUploader:
             "submitted_by": submitted_by,
         }
 
+    def _upload_request(
+        self,
+        task_id: str,
+        model_folder: str,
+        model_name_pretty: str,
+        model_availability: str,
+        urls: Optional[str],
+        context_size: str,
+        submitted_by: str,
+        contact_information: str,
+        comment: Optional[str],
+        pr_url: str,
+        temp_directory: str,
+    ) -> List[CommitOperationAdd]:
+        request_metadata = {
+            "model_folder": model_folder,
+            "model_name_pretty": model_name_pretty,
+            "model_availability": model_availability,
+            "urls": urls,
+            "context_size": context_size,
+            "submitted_by": submitted_by,
+            "contact_information": contact_information,
+            "comment": comment,
+            "timestamp": time.time(),
+            "pr_url": pr_url,
+        }
+
+        with open(os.path.join(temp_directory, "request_metadata.json"), "w") as f:
+            json.dump(request_metadata, f)
+
+        num_requests_already_present = len(self._fs.ls(f"datasets/{self._private_dataset_id}/{task_id}/"))
+        commit_operations = [
+            CommitOperationAdd(
+                path_in_repo=f"{task_id}/{num_requests_already_present}_{model_folder}.json",
+                path_or_fileobj=os.path.join(temp_directory, "request_metadata.json"),
+            )
+        ]
+        return commit_operations
+
     def _upload_predictions(
         self,
         task_id: str,
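The new _upload_request helper serializes all form fields to request_metadata.json and stages it as {task_id}/{index}_{model_folder}.json in the private requests dataset, where the index is the number of requests already stored for that task. A sketch of what one stored file could contain (every value below is hypothetical):

import json
import time

# Keys mirror the request_metadata dict built in _upload_request; values are made up.
request_metadata = {
    "model_folder": "my-model-16k",
    "model_name_pretty": "My Model",
    "model_availability": "Publicly available",
    "urls": "https://example.com/my-model",
    "context_size": "16000",
    "submitted_by": "Jane Doe",
    "contact_information": "jane.doe@example.com",
    "comment": None,
    "timestamp": time.time(),  # Unix timestamp, as recorded by the helper
    "pr_url": "https://huggingface.co/datasets/<results-dataset>/discussions/<N>",
}

print(json.dumps(request_metadata, indent=2))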
@@ -107,7 +148,7 @@
         model_folder: str,
         model_name_pretty: str,
         model_availability: str,
-        urls: str,
+        urls: Optional[str],
         context_size: str,
         submitted_by: str,
         temp_directory: str,
@@ -141,9 +182,11 @@
         model_folder: str,
         model_name_pretty: str,
         model_availability: str,
-        urls: str,
+        urls: Optional[str],
         context_size: str,
         submitted_by: str,
+        contact_information: str,
+        comment: Optional[str],
         filenames: Optional[List[str]],
     ):
         assert task_pretty and task_pretty in TASKS_PRETTY_REVERSE, "Please, select one of the supported tasks."
@@ -158,6 +201,7 @@
 
         assert submitted_by, "Please, specify non-empty information about a submission's author(s)."
         assert filenames, "Please, attach at least one file with predictions."
+        assert contact_information, "Please, fill in the field with contact information."
 
     def upload_files(
         self,
@@ -165,9 +209,11 @@
         model_folder: str,
         model_name_pretty: str,
         model_availability: str,
-        urls: str,
+        urls: Optional[str],
         context_size: str,
         submitted_by: str,
+        contact_information: str,
+        comment: Optional[str],
         filenames: Optional[List[str]],
         force: bool = False,
     ) -> str:
@@ -180,6 +226,8 @@
             urls=urls,
             context_size=context_size,
             submitted_by=submitted_by,
+            contact_information=contact_information,
+            comment=comment,
             filenames=filenames,
         )
         pr_title = f"🚀 New submission to {task_pretty} task: {model_name_pretty} with {context_size} context size from {submitted_by}"
@@ -190,11 +238,10 @@
 
         logging.info("Checking if this request has already been submitted...")
         if not force:
-            if (
-                ...
-                ...
-            ):
-                return styled_warning(f"{model_name_pretty} is already present in {self._dataset_id}.")
+            if model_folder in self._fs.ls(f"datasets/{self._dataset_id}/{task_id}/predictions"):
+                return styled_warning(
+                    f"{model_folder} is already present in {self._dataset_id}, please, select another folder name."
+                )
 
         prev_pr = self._get_previous_pr(pr_title)
         if prev_pr is not None:
@@ -224,7 +271,7 @@
                     temp_directory=str(d),
                 )
 
-                logging.info("Creating commit...")
+                logging.info(f"Creating commit to results dataset...")
                 new_pr = self._api.create_commit(
                     repo_id=self._dataset_id,
                     operations=predictions_commit_operations + results_commit_operations,
@@ -233,6 +280,30 @@
                     create_pr=True,
                     repo_type="dataset",
                 )
+
+                logging.info(f"Creating commit to requests dataset...")
+                request_commit_operations = self._upload_request(
+                    task_id=task_id,
+                    model_folder=model_folder,
+                    temp_directory=str(d),
+                    model_name_pretty=model_name_pretty,
+                    model_availability=model_availability,
+                    urls=urls,
+                    context_size=context_size,
+                    submitted_by=submitted_by,
+                    contact_information=contact_information,
+                    comment=comment,
+                    pr_url=new_pr.pr_url,
+                )
+                self._api.create_commit(
+                    repo_id=self._private_dataset_id,
+                    operations=request_commit_operations,
+                    commit_message=pr_title,
+                    commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!\n* Model name: {model_name_pretty}\n* Model availability: {model_availability}\n* Context Size: {context_size}\n* Relevant URLs: {urls}\n* Submitted By: {submitted_by}\n* PR: {new_pr.pr_url}\n* Contact information: {contact_information}\n* Comment: {comment}""",
+                    create_pr=True,
+                    repo_type="dataset",
+                )
+
                 return styled_message(f"🎉 PR created at {new_pr.pr_url}.")
 
         except Exception as e:
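End to end, one submission now yields two PRs: a public one with predictions and results, and a private one with the request metadata, so contact information stays out of the public dataset. A hedged usage sketch: the leading task_pretty parameter and all argument values are assumptions inferred from the surrounding code:

uploader = SubmissionUploader(
    dataset_id="some-org/results",           # hypothetical repo ids
    private_dataset_id="some-org/requests",
)
message = uploader.upload_files(
    task_pretty="Library-based code generation",  # assumed to be a key of TASKS_PRETTY_REVERSE
    model_folder="my-model-16k",
    model_name_pretty="My Model",
    model_availability="Publicly available",
    urls="https://example.com/my-model",
    context_size="16000",
    submitted_by="Jane Doe",
    contact_information="jane.doe@example.com",
    comment="First submission.",
    filenames=["/tmp/predictions.jsonl"],
)
print(message)  # styled HTML message: PR link on success, warning otherwise

Note that the per-task file index comes from HfFileSystem.ls at commit time, so uniqueness of {index}_{model_folder}.json effectively rests on the earlier folder-name check against the public dataset.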