lewtun HF staff committed on
Commit
1b95f45
•
1 Parent(s): 0ef1d60

Refactor to match new AutoTrain API

Browse files
Files changed (5) hide show
  1. .gitignore +4 -1
  2. Makefile +8 -0
  3. app.py +84 -30
  4. requirements.txt +1 -1
  5. utils.py +14 -6
.gitignore CHANGED
@@ -143,4 +143,7 @@ cython_debug/
143
  # Submissions
144
  submission_repo/
145
  GEM-outputs/
146
- sample-submissions/
 
 
 
 
143
  # Submissions
144
  submission_repo/
145
  GEM-outputs/
146
+ sample-submissions/
147
+
148
+ # mac OS
149
+ .DS_Store
Makefile ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ style:
2
+ python -m black --line-length 119 --target-version py39 .
3
+ python -m isort .
4
+
5
+ quality:
6
+ python -m black --check --line-length 119 --target-version py39 .
7
+ python -m isort --check-only .
8
+ python -m flake8 --max-line-length 119
app.py CHANGED
@@ -1,26 +1,27 @@
1
  import json
2
  import os
3
  import shutil
 
4
  from datetime import datetime
5
  from pathlib import Path
6
 
7
  import jsonlines
8
  import streamlit as st
9
  from dotenv import load_dotenv
10
- from huggingface_hub import HfApi, Repository, cached_download, hf_hub_url
11
 
12
- from utils import http_post, validate_json
13
 
14
  if Path(".env").is_file():
15
  load_dotenv(".env")
16
 
17
  HF_TOKEN = os.getenv("HF_TOKEN")
18
- AUTONLP_USERNAME = os.getenv("AUTONLP_USERNAME")
19
- HF_AUTONLP_BACKEND_API = os.getenv("HF_AUTONLP_BACKEND_API")
20
  LOCAL_REPO = "submission_repo"
21
  LOGS_REPO = "submission-logs"
22
 
23
- ## TODO ##
24
  # 1. Add check that fields are nested under `tasks` field correctly
25
  # 2. Add check that names of tasks and datasets are valid
26
 
@@ -68,9 +69,9 @@ def get_submission_names():
68
  return [score["submission_name"] for score in scores_data]
69
 
70
 
71
- ###########
72
- ### APP ###
73
- ###########
74
  st.title("GEM Submissions")
75
  st.markdown(
76
  """
@@ -144,8 +145,7 @@ with st.form(key="form"):
144
  example_submission = json.load(f)
145
  st.json(example_submission)
146
 
147
- user_name = st.text_input("Enter your 🤗 Hub username.")
148
-
149
  submit_button = st.form_submit_button("Make Submission")
150
 
151
  if submit_button and submission_errors == 0:
@@ -155,8 +155,8 @@ if submit_button and submission_errors == 0:
155
  submission_time = str(int(datetime.now().timestamp()))
156
 
157
  # Create submission dataset under benchmarks ORG
158
- submission_repo_id = f"{user_name}__{submission_name_formatted}__{submission_time}"
159
- dataset_repo_url = f"https://huggingface.co/datasets/GEM-submissions/{submission_repo_id}"
160
  repo = Repository(
161
  local_dir=LOCAL_REPO,
162
  clone_from=dataset_repo_url,
@@ -176,22 +176,72 @@ if submit_button and submission_errors == 0:
176
  else:
177
  commit_sha = repo.git_head_commit_url().split("/")[-1]
178
 
179
- submission_id = submission_name + "__" + commit_sha + "__" + submission_time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
 
181
  payload = {
182
- "username": AUTONLP_USERNAME,
183
- "dataset": "GEM/references",
184
- "task": 1,
185
- "model": "gem",
186
- "submission_dataset": f"GEM-submissions/{submission_repo_id}",
187
- "submission_id": submission_id,
188
- "col_mapping": {},
189
- "split": "test",
190
- "config": None,
191
  }
192
- json_resp = http_post(
193
- path="/evaluate/create", payload=payload, token=HF_TOKEN, domain=HF_AUTONLP_BACKEND_API
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  ).json()
 
195
 
196
  logs_repo_url = f"https://huggingface.co/datasets/GEM-submissions/{LOGS_REPO}"
197
  logs_repo = Repository(
@@ -201,25 +251,29 @@ if submit_button and submission_errors == 0:
201
  private=True,
202
  use_auth_token=HF_TOKEN,
203
  )
204
- json_resp["submission_name"] = submission_name
 
 
 
 
205
  with jsonlines.open(f"{LOGS_REPO}/logs.jsonl") as r:
206
  lines = []
207
  for obj in r:
208
  lines.append(obj)
209
 
210
- lines.append(json_resp)
211
  with jsonlines.open(f"{LOGS_REPO}/logs.jsonl", mode="w") as writer:
212
  for job in lines:
213
  writer.write(job)
214
- logs_repo.push_to_hub(commit_message=f"Submission with job ID {json_resp['id']}")
215
 
216
- if json_resp["status"] == 1:
217
  st.success(
218
- f"✅ Submission {submission_name} was successfully submitted for evaluation with job ID {json_resp['id']}"
219
  )
220
  st.markdown(
221
  f"""
222
- Evaluation takes appoximately 1-2 hours to complete, so grab a ☕ or 🍵 while you wait:
223
 
224
  * 📊 Click [here](https://huggingface.co/spaces/GEM/results) to view the results from your submission
225
  * 💾 Click [here]({dataset_repo_url}) to view your submission file on the Hugging Face Hub
 
1
  import json
2
  import os
3
  import shutil
4
+ import uuid
5
  from datetime import datetime
6
  from pathlib import Path
7
 
8
  import jsonlines
9
  import streamlit as st
10
  from dotenv import load_dotenv
11
+ from huggingface_hub import Repository, cached_download, hf_hub_url
12
 
13
+ from utils import http_get, http_post, validate_json
14
 
15
  if Path(".env").is_file():
16
  load_dotenv(".env")
17
 
18
  HF_TOKEN = os.getenv("HF_TOKEN")
19
+ AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
20
+ AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
21
  LOCAL_REPO = "submission_repo"
22
  LOGS_REPO = "submission-logs"
23
 
24
+ # TODO
25
  # 1. Add check that fields are nested under `tasks` field correctly
26
  # 2. Add check that names of tasks and datasets are valid
27
 
 
69
  return [score["submission_name"] for score in scores_data]
70
 
71
 
72
+ #######
73
+ # APP #
74
+ #######
75
  st.title("GEM Submissions")
76
  st.markdown(
77
  """
 
145
  example_submission = json.load(f)
146
  st.json(example_submission)
147
 
148
+ user_name = st.text_input("Enter your 🤗 Hub username", help="This field is required to track your submission and cannot be empty")
 
149
  submit_button = st.form_submit_button("Make Submission")
150
 
151
  if submit_button and submission_errors == 0:
 
155
  submission_time = str(int(datetime.now().timestamp()))
156
 
157
  # Create submission dataset under benchmarks ORG
158
+ submission_repo_id = f"GEM-submissions/{user_name}__{submission_name_formatted}__{submission_time}"
159
+ dataset_repo_url = f"https://huggingface.co/datasets/{submission_repo_id}"
160
  repo = Repository(
161
  local_dir=LOCAL_REPO,
162
  clone_from=dataset_repo_url,
 
176
  else:
177
  commit_sha = repo.git_head_commit_url().split("/")[-1]
178
 
179
+ submission_id = submission_name + "__" + str(uuid.uuid4())[:6] + "__" + submission_time
180
+
181
+ # Define AutoTrain payload
182
+ project_config = {}
183
+ # Need a dummy dataset to use the dataset loader in AutoTrain
184
+ project_config["dataset_name"] = "lewtun/imdb-dummy"
185
+ project_config["dataset_config"] = "lewtun--imdb-dummy"
186
+ project_config["dataset_split"] = "train"
187
+ project_config["col_mapping"] = {"text": "text", "label": "target"}
188
+ # Specify benchmark parameters
189
+ project_config["model"] = "gem"
190
+ project_config["dataset"] = "GEM/references"
191
+ project_config["submission_dataset"] = submission_repo_id
192
+ project_id = str(uuid.uuid4()).split("-")[0]
193
+ project_payload = {
194
+ "username": AUTOTRAIN_USERNAME,
195
+ "proj_name": f"benchmark-gem-{project_id}",
196
+ "task": 1,
197
+ "config": {
198
+ "language": "en",
199
+ "max_models": 5,
200
+ "instance": {
201
+ "provider": "aws",
202
+ "instance_type": "ml.g4dn.4xlarge",
203
+ "max_runtime_seconds": 172800,
204
+ "num_instances": 1,
205
+ "disk_size_gb": 150,
206
+ },
207
+ "benchmark": {
208
+ "dataset": project_config["dataset"],
209
+ "model": project_config["model"],
210
+ "submission_dataset": project_config["submission_dataset"],
211
+ },
212
+ },
213
+ }
214
+ project_json_resp = http_post(
215
+ path="/projects/create", payload=project_payload, token=HF_TOKEN, domain=AUTOTRAIN_BACKEND_API
216
+ ).json()
217
+ print(f"Project creation: {project_json_resp}")
218
 
219
+ # Upload data
220
  payload = {
221
+ "split": 4,
222
+ "col_mapping": project_config["col_mapping"],
223
+ "load_config": {"max_size_bytes": 0, "shuffle": False},
 
 
 
 
 
 
224
  }
225
+ data_json_resp = http_post(
226
+ path=f"/projects/{project_json_resp['id']}/data/{project_config['dataset_name']}",
227
+ payload=payload,
228
+ token=HF_TOKEN,
229
+ domain=AUTOTRAIN_BACKEND_API,
230
+ params={
231
+ "type": "dataset",
232
+ "config_name": project_config["dataset_config"],
233
+ "split_name": project_config["dataset_split"],
234
+ },
235
+ ).json()
236
+ print(f"Dataset creation: {data_json_resp}")
237
+
238
+ # Run training
239
+ train_json_resp = http_get(
240
+ path=f"/projects/{project_json_resp['id']}/data/start_process",
241
+ token=HF_TOKEN,
242
+ domain=AUTOTRAIN_BACKEND_API,
243
  ).json()
244
+ print(f"Training job response: {train_json_resp}")
245
 
246
  logs_repo_url = f"https://huggingface.co/datasets/GEM-submissions/{LOGS_REPO}"
247
  logs_repo = Repository(
 
251
  private=True,
252
  use_auth_token=HF_TOKEN,
253
  )
254
+ evaluation_log = {}
255
+ evaluation_log["payload"] = project_payload
256
+ evaluation_log["project_creation_response"] = project_json_resp
257
+ evaluation_log["dataset_creation_response"] = data_json_resp
258
+ evaluation_log["autotrain_job_response"] = train_json_resp
259
  with jsonlines.open(f"{LOGS_REPO}/logs.jsonl") as r:
260
  lines = []
261
  for obj in r:
262
  lines.append(obj)
263
 
264
+ lines.append(evaluation_log)
265
  with jsonlines.open(f"{LOGS_REPO}/logs.jsonl", mode="w") as writer:
266
  for job in lines:
267
  writer.write(job)
268
+ logs_repo.push_to_hub(commit_message=f"Submission with job ID {project_json_resp['id']}")
269
 
270
+ if train_json_resp["success"] == 1:
271
  st.success(
272
+ f"✅ Submission {submission_name} was successfully submitted for evaluation!"
273
  )
274
  st.markdown(
275
  f"""
276
+ Evaluation can take up to 1 hour to complete, so grab a ☕ or 🍵 while you wait:
277
 
278
  * 📊 Click [here](https://huggingface.co/spaces/GEM/results) to view the results from your submission
279
  * 💾 Click [here]({dataset_repo_url}) to view your submission file on the Hugging Face Hub
requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
  python-dotenv
2
- huggingface-hub==0.2.1
3
  jsonlines
 
1
  python-dotenv
2
+ huggingface-hub==0.8.1
3
  jsonlines
utils.py CHANGED
@@ -2,7 +2,6 @@ import json
2
 
3
  import jsonschema
4
  import requests
5
- import streamlit as st
6
 
7
 
8
  def load_schema():
@@ -28,17 +27,26 @@ def get_auth_headers(token: str, prefix: str = "autonlp"):
28
  return {"Authorization": f"{prefix} {token}"}
29
 
30
 
31
- def http_post(
 
 
 
 
 
 
 
 
 
 
 
 
32
  path: str,
33
  token: str,
34
- payload=None,
35
  domain: str = None,
36
  ) -> requests.Response:
37
  """HTTP POST request to the AutoNLP API, raises UnreachableAPIError if the API cannot be reached"""
38
  try:
39
- response = requests.post(
40
- url=domain + path, json=payload, headers=get_auth_headers(token=token), allow_redirects=True
41
- )
42
  except requests.exceptions.ConnectionError:
43
  print("❌ Failed to reach AutoNLP API, check your internet connection")
44
  response.raise_for_status()
 
2
 
3
  import jsonschema
4
  import requests
 
5
 
6
 
7
  def load_schema():
 
27
  return {"Authorization": f"{prefix} {token}"}
28
 
29
 
30
+ def http_post(path: str, token: str, payload=None, domain: str = None, params=None) -> requests.Response:
31
+ """HTTP POST request to the AutoNLP API, raises UnreachableAPIError if the API cannot be reached"""
32
+ try:
33
+ response = requests.post(
34
+ url=domain + path, json=payload, headers=get_auth_headers(token=token), allow_redirects=True, params=params
35
+ )
36
+ except requests.exceptions.ConnectionError:
37
+ print("❌ Failed to reach AutoNLP API, check your internet connection")
38
+ response.raise_for_status()
39
+ return response
40
+
41
+
42
+ def http_get(
43
  path: str,
44
  token: str,
 
45
  domain: str = None,
46
  ) -> requests.Response:
47
  """HTTP POST request to the AutoNLP API, raises UnreachableAPIError if the API cannot be reached"""
48
  try:
49
+ response = requests.get(url=domain + path, headers=get_auth_headers(token=token), allow_redirects=True)
 
 
50
  except requests.exceptions.ConnectionError:
51
  print("❌ Failed to reach AutoNLP API, check your internet connection")
52
  response.raise_for_status()