inoki-giskard committed on
Commit
4641c89
2 Parent(s): 35be7f4 ed3fe33

Merge branch 'feature/gsk-2457-secure-scanner-running-based-on-virtualenv' into giskard-main

Browse files
app_env.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+
2
# Names of the environment variables read by the evaluator app.
# Each constant holds the *variable name* looked up via os.environ, not a value.
HF_REPO_ID = "HF_REPO_ID"
HF_SPACE_ID = "SPACE_ID"
HF_WRITE_TOKEN = "HF_WRITE_TOKEN"
HF_GSK_HUB_URL = "GSK_HUB_URL"
HF_GSK_HUB_PROJECT_KEY = "GSK_HUB_PROJECT_KEY"
HF_GSK_HUB_KEY = "GSK_API_KEY"
HF_GSK_HUB_HF_TOKEN = "GSK_HF_TOKEN"
HF_GSK_HUB_UNLOCK_TOKEN = "GSK_HUB_UNLOCK_TOKEN"
app_leaderboard.py CHANGED
@@ -5,7 +5,6 @@ import gradio as gr
5
 
6
  from fetch_utils import (check_dataset_and_get_config,
7
  check_dataset_and_get_split)
8
- from text_classification_ui_helpers import LEADERBOARD
9
 
10
  import leaderboard
11
 
@@ -75,7 +74,7 @@ def get_display_df(df):
75
 
76
 
77
  def get_demo():
78
- leaderboard.records = get_records_from_dataset_repo(LEADERBOARD)
79
  records = leaderboard.records
80
 
81
  model_ids = get_model_ids(records)
 
5
 
6
  from fetch_utils import (check_dataset_and_get_config,
7
  check_dataset_and_get_split)
 
8
 
9
  import leaderboard
10
 
 
74
 
75
 
76
  def get_demo():
77
+ leaderboard.records = get_records_from_dataset_repo(leaderboard.LEADERBOARD)
78
  records = leaderboard.records
79
 
80
  model_ids = get_model_ids(records)
io_utils.py CHANGED
@@ -1,11 +1,7 @@
1
  import os
2
- from pathlib import Path
3
- import subprocess
4
 
5
  import yaml
6
 
7
- import pipe
8
-
9
  YAML_PATH = "./cicd/configs"
10
  LOG_FILE = "temp_log"
11
 
@@ -104,6 +100,15 @@ def convert_column_mapping_to_json(df, label=""):
104
  return column_mapping
105
 
106
 
 
 
 
 
 
 
 
 
 
107
  def get_logs_file():
108
  try:
109
  with open(LOG_FILE, "r") as file:
@@ -115,29 +120,3 @@ def get_logs_file():
115
  def write_log_to_user_file(task_id, log):
116
  with open(f"./tmp/{task_id}.log", "a") as f:
117
  f.write(log)
118
-
119
-
120
- def save_job_to_pipe(task_id, job, description, lock):
121
- with lock:
122
- pipe.jobs.append((task_id, job, description))
123
-
124
-
125
- def pop_job_from_pipe():
126
- if len(pipe.jobs) == 0:
127
- return
128
- job_info = pipe.jobs.pop()
129
- pipe.current = job_info[2]
130
- task_id = job_info[0]
131
- write_log_to_user_file(task_id, f"Running job id {task_id}\n")
132
- command = job_info[1]
133
-
134
- # Link to LOG_FILE
135
- log_file_path = Path(LOG_FILE)
136
- if log_file_path.exists():
137
- log_file_path.unlink()
138
- os.symlink(f"./tmp/{task_id}.log", LOG_FILE)
139
-
140
- with open(f"./tmp/{task_id}.log", "a") as log_file:
141
- p = subprocess.Popen(command, stdout=log_file, stderr=subprocess.STDOUT)
142
- p.wait()
143
- pipe.current = None
 
1
  import os
 
 
2
 
3
  import yaml
4
 
 
 
5
  YAML_PATH = "./cicd/configs"
6
  LOG_FILE = "temp_log"
7
 
 
100
  return column_mapping
101
 
102
 
103
def get_log_file_with_uid(uid):
    """Return the contents of ./tmp/<uid>.log for the given execution uid.

    Returns a placeholder message when the file is missing or unreadable.
    """
    try:
        print(f"Loading {uid}.log")
        # BUG FIX: the file must be opened for reading. Mode "a" is
        # write-only, so file.read() always raised io.UnsupportedOperation
        # and every call fell through to the except branch.
        with open(f"./tmp/{uid}.log", "r") as file:
            return file.read()
    except Exception:
        return "Log file does not exist"
110
+
111
+
112
  def get_logs_file():
113
  try:
114
  with open(LOG_FILE, "r") as file:
 
120
def write_log_to_user_file(task_id, log):
    """Append a log message to the per-task log file under ./tmp."""
    log_path = f"./tmp/{task_id}.log"
    with open(log_path, "a") as handle:
        handle.write(log)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
isolated_env.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+
4
+ from io_utils import write_log_to_user_file
5
+
6
+
7
def prepare_venv(execution_id, deps):
    """Create an isolated virtualenv for one scan execution and install deps.

    Args:
        execution_id: id used both for the venv directory and the user log.
        deps: requirements-file content, as a single string.

    Returns:
        Path to the ``giskard_scanner`` executable inside the new venv.

    Raises:
        RuntimeError: if any of the underlying commands exits non-zero.
    """
    # NOTE(review): relies on a plain "python" being on PATH — presumably the
    # Space's interpreter; confirm this matches the deployment image.
    python_executable = "python"
    venv_base = f"tmp/venvs/{execution_id}"
    pip_executable = os.path.join(venv_base, "bin", "pip")

    def _run_and_log(step_message, command):
        # Run one setup command, mirror its output to the user log, and
        # fail fast on a non-zero exit code (same pattern repeated three
        # times in the original).
        write_log_to_user_file(execution_id, step_message)
        p = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        write_log_to_user_file(execution_id, p.stdout.decode())
        if p.returncode != 0:
            raise RuntimeError(f"{p.args} ended with {p.returncode}")

    # Check pyver
    _run_and_log("Checking Python version\n", [python_executable, "--version"])
    # Create venv (--clear wipes any leftover venv for this id)
    _run_and_log(
        "Creating virtual environment\n",
        [python_executable, "-m", "venv", venv_base, "--clear"],
    )

    # Output requirements.txt inside the venv directory
    requirement_file = os.path.join(venv_base, "requirements.txt")
    with open(requirement_file, "w") as f:
        # deps is one string: write() it verbatim instead of writelines(),
        # which iterates a str character by character.
        f.write(deps)

    # Install deps
    _run_and_log("Installing dependencies\n", [pip_executable, "install", "-r", requirement_file])
    return os.path.join(venv_base, "bin", "giskard_scanner")
leaderboard.py CHANGED
@@ -1,3 +1,5 @@
1
  import pandas as pd
2
 
3
- records = pd.DataFrame()
 
 
 
1
import pandas as pd

# In-memory leaderboard records; starts empty and is populated elsewhere
# (e.g. at app start-up via get_records_from_dataset_repo).
records = pd.DataFrame()

# Dataset repo where evaluation results are published for ranking.
LEADERBOARD = "giskard-bot/evaluator-leaderboard"
run_jobs.py CHANGED
@@ -1,11 +1,30 @@
 
1
  import logging
 
 
2
  import threading
3
  import time
 
4
 
5
- from io_utils import pop_job_from_pipe
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  is_running = False
8
 
 
 
9
 
10
  def start_process_run_job():
11
  try:
@@ -26,6 +45,134 @@ def stop_thread():
26
  is_running = False
27
 
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def run_job():
30
  global is_running
31
  while is_running:
 
1
+ import json
2
  import logging
3
+ import os
4
+ import subprocess
5
  import threading
6
  import time
7
+ from pathlib import Path
8
 
9
+ import pipe
10
+ from app_env import (
11
+ HF_GSK_HUB_HF_TOKEN,
12
+ HF_GSK_HUB_KEY,
13
+ HF_GSK_HUB_PROJECT_KEY,
14
+ HF_GSK_HUB_UNLOCK_TOKEN,
15
+ HF_GSK_HUB_URL,
16
+ HF_REPO_ID,
17
+ HF_SPACE_ID,
18
+ HF_WRITE_TOKEN,
19
+ )
20
+ from io_utils import LOG_FILE, get_yaml_path, write_log_to_user_file
21
+ from isolated_env import prepare_venv
22
+ from leaderboard import LEADERBOARD
23
 
24
  is_running = False
25
 
26
+ logger = logging.getLogger(__file__)
27
+
28
 
29
  def start_process_run_job():
30
  try:
 
45
  is_running = False
46
 
47
 
48
def prepare_env_and_get_command(
    m_id,
    d_id,
    config,
    split,
    inference,
    inference_token,
    uid,
    label_mapping,
    feature_mapping,
):
    """Prepare an isolated venv and build the scanner command line.

    Returns the argv list for the ``giskard_scanner`` run. Optional flags
    are appended only when the matching environment variable is set. Falls
    back to the current environment's ``giskard_scanner`` when the venv
    cannot be created.
    """
    leaderboard_dataset = None
    if os.environ.get("SPACE_ID") == "giskardai/giskard-evaluator":
        # Only the official evaluator Space publishes to the leaderboard.
        leaderboard_dataset = LEADERBOARD

    inference_type = "hf_pipeline"
    if inference and inference_token:
        inference_type = "hf_inference_api"

    executable = "giskard_scanner"
    try:
        # Copy the current requirements (might be changed)
        with open("requirements.txt", "r") as f:
            # BUG FIX: f.read() preserves the file verbatim. The original
            # "\n".join(f.readlines()) doubled every newline, because
            # readlines() already keeps the trailing "\n" on each line.
            executable = prepare_venv(uid, f.read())
        logger.info(f"Using {executable} as executable")
    except Exception as e:
        # logger.warn() is deprecated; warning() is the supported spelling.
        logger.warning(f"Create env failed due to {e}, using the current env as fallback.")
        executable = "giskard_scanner"

    command = [
        executable,
        "--loader",
        "huggingface",
        "--model",
        m_id,
        "--dataset",
        d_id,
        "--dataset_config",
        config,
        "--dataset_split",
        split,
        "--output_format",
        "markdown",
        "--output_portal",
        "huggingface",
        "--feature_mapping",
        json.dumps(feature_mapping),
        "--label_mapping",
        json.dumps(label_mapping),
        "--scan_config",
        get_yaml_path(uid),
        "--inference_type",
        inference_type,
        "--inference_api_token",
        inference_token,
    ]
    # The token to publish post
    if os.environ.get(HF_WRITE_TOKEN):
        command.extend(["--hf_token", os.environ.get(HF_WRITE_TOKEN)])

    # The repo to publish post
    if os.environ.get(HF_REPO_ID) or os.environ.get(HF_SPACE_ID):
        # TODO: Replace by the model id
        command.extend(
            ["--discussion_repo", os.environ.get(HF_REPO_ID) or os.environ.get(HF_SPACE_ID)]
        )

    # The repo to publish for ranking
    if leaderboard_dataset:
        command.extend(["--leaderboard_dataset", leaderboard_dataset])

    # The info to upload to Giskard hub
    if os.environ.get(HF_GSK_HUB_KEY):
        command.extend(["--giskard_hub_api_key", os.environ.get(HF_GSK_HUB_KEY)])
    if os.environ.get(HF_GSK_HUB_URL):
        command.extend(["--giskard_hub_url", os.environ.get(HF_GSK_HUB_URL)])
    if os.environ.get(HF_GSK_HUB_PROJECT_KEY):
        command.extend(["--giskard_hub_project_key", os.environ.get(HF_GSK_HUB_PROJECT_KEY)])
    if os.environ.get(HF_GSK_HUB_HF_TOKEN):
        command.extend(["--giskard_hub_hf_token", os.environ.get(HF_GSK_HUB_HF_TOKEN)])
    if os.environ.get(HF_GSK_HUB_UNLOCK_TOKEN):
        command.extend(["--giskard_hub_unlock_token", os.environ.get(HF_GSK_HUB_UNLOCK_TOKEN)])

    eval_str = f"[{m_id}]<{d_id}({config}, {split} set)>"

    write_log_to_user_file(
        uid,
        f"Start local evaluation on {eval_str}. Please wait for your job to start...\n",
    )

    return command
+
149
+
150
+ def save_job_to_pipe(task_id, job, description, lock):
151
+ with lock:
152
+ pipe.jobs.append((task_id, job, description))
153
+
154
+
155
+ def pop_job_from_pipe():
156
+ if len(pipe.jobs) == 0:
157
+ return
158
+ job_info = pipe.jobs.pop()
159
+ pipe.current = job_info[2]
160
+ task_id = job_info[0]
161
+ write_log_to_user_file(task_id, f"Running job id {task_id}\n")
162
+ command = prepare_env_and_get_command(*job_info[1])
163
+
164
+ # Link to LOG_FILE
165
+ log_file_path = Path(LOG_FILE)
166
+ if log_file_path.exists():
167
+ log_file_path.unlink()
168
+ os.symlink(f"./tmp/{task_id}.log", LOG_FILE)
169
+
170
+ with open(f"./tmp/{task_id}.log", "a") as log_file:
171
+ p = subprocess.Popen(command, stdout=log_file, stderr=subprocess.STDOUT)
172
+ p.wait()
173
+ pipe.current = None
174
+
175
+
176
  def run_job():
177
  global is_running
178
  while is_running:
text_classification_ui_helpers.py CHANGED
@@ -1,22 +1,15 @@
1
  import collections
2
- import json
3
  import logging
4
- import os
5
  import threading
6
  import uuid
7
- import leaderboard
8
 
9
  import datasets
10
  import gradio as gr
11
  import pandas as pd
12
 
13
- from io_utils import (
14
- get_yaml_path,
15
- read_column_mapping,
16
- save_job_to_pipe,
17
- write_column_mapping,
18
- write_log_to_user_file,
19
- )
20
  from text_classification import (
21
  check_model_task,
22
  get_example_prediction,
@@ -32,21 +25,10 @@ from wordings import (
32
  MAX_LABELS = 40
33
  MAX_FEATURES = 20
34
 
35
- HF_REPO_ID = "HF_REPO_ID"
36
- HF_SPACE_ID = "SPACE_ID"
37
- HF_WRITE_TOKEN = "HF_WRITE_TOKEN"
38
- HF_GSK_HUB_URL = "GSK_HUB_URL"
39
- HF_GSK_HUB_PROJECT_KEY = "GSK_HUB_PROJECT_KEY"
40
- HF_GSK_HUB_KEY = "GSK_API_KEY"
41
- HF_GSK_HUB_HF_TOKEN = "GSK_HF_TOKEN"
42
- HF_GSK_HUB_UNLOCK_TOKEN = "GSK_HUB_UNLOCK_TOKEN"
43
-
44
- LEADERBOARD = "giskard-bot/evaluator-leaderboard"
45
-
46
- global ds_dict, ds_config
47
  ds_dict = None
48
  ds_config = None
49
 
 
50
  def get_related_datasets_from_leaderboard(model_id):
51
  records = leaderboard.records
52
  model_records = records[records["model_id"] == model_id]
@@ -203,7 +185,13 @@ def precheck_model_ds_enable_example_btn(
203
 
204
 
205
  def align_columns_and_show_prediction(
206
- model_id, dataset_id, dataset_config, dataset_split, uid, run_inference, inference_token
 
 
 
 
 
 
207
  ):
208
  model_task = check_model_task(model_id)
209
  if model_task is None or model_task != "text-classification":
@@ -303,85 +291,24 @@ def try_submit(m_id, d_id, config, split, inference, inference_token, uid):
303
  check_column_mapping_keys_validity(all_mappings)
304
  label_mapping, feature_mapping = construct_label_and_feature_mapping(all_mappings)
305
 
306
- leaderboard_dataset = None
307
- if os.environ.get("SPACE_ID") == "giskardai/giskard-evaluator":
308
- leaderboard_dataset = LEADERBOARD
309
-
310
- if inference:
311
- inference_type = "hf_inference_api"
312
-
313
-
314
- # TODO: Set column mapping for some dataset such as `amazon_polarity`
315
- command = [
316
- "giskard_scanner",
317
- "--loader",
318
- "huggingface",
319
- "--model",
320
- m_id,
321
- "--dataset",
322
- d_id,
323
- "--dataset_config",
324
- config,
325
- "--dataset_split",
326
- split,
327
- "--output_format",
328
- "markdown",
329
- "--output_portal",
330
- "huggingface",
331
- "--feature_mapping",
332
- json.dumps(feature_mapping),
333
- "--label_mapping",
334
- json.dumps(label_mapping),
335
- "--scan_config",
336
- get_yaml_path(uid),
337
- "--inference_type",
338
- inference_type,
339
- "--inference_api_token",
340
- inference_token,
341
- ]
342
-
343
- # The token to publish post
344
- if os.environ.get(HF_WRITE_TOKEN):
345
- command.append("--hf_token")
346
- command.append(os.environ.get(HF_WRITE_TOKEN))
347
-
348
- # The repo to publish post
349
- if os.environ.get(HF_REPO_ID) or os.environ.get(HF_SPACE_ID):
350
- command.append("--discussion_repo")
351
- # TODO: Replace by the model id
352
- command.append(os.environ.get(HF_REPO_ID) or os.environ.get(HF_SPACE_ID))
353
-
354
- # The repo to publish for ranking
355
- if leaderboard_dataset:
356
- command.append("--leaderboard_dataset")
357
- command.append(leaderboard_dataset)
358
-
359
- # The info to upload to Giskard hub
360
- if os.environ.get(HF_GSK_HUB_KEY):
361
- command.append("--giskard_hub_api_key")
362
- command.append(os.environ.get(HF_GSK_HUB_KEY))
363
- if os.environ.get(HF_GSK_HUB_URL):
364
- command.append("--giskard_hub_url")
365
- command.append(os.environ.get(HF_GSK_HUB_URL))
366
- if os.environ.get(HF_GSK_HUB_PROJECT_KEY):
367
- command.append("--giskard_hub_project_key")
368
- command.append(os.environ.get(HF_GSK_HUB_PROJECT_KEY))
369
- if os.environ.get(HF_GSK_HUB_HF_TOKEN):
370
- command.append("--giskard_hub_hf_token")
371
- command.append(os.environ.get(HF_GSK_HUB_HF_TOKEN))
372
- if os.environ.get(HF_GSK_HUB_UNLOCK_TOKEN):
373
- command.append("--giskard_hub_unlock_token")
374
- command.append(os.environ.get(HF_GSK_HUB_UNLOCK_TOKEN))
375
-
376
  eval_str = f"[{m_id}]<{d_id}({config}, {split} set)>"
377
- logging.info(f"Start local evaluation on {eval_str}")
378
- save_job_to_pipe(uid, command, eval_str, threading.Lock())
379
-
380
- write_log_to_user_file(
381
  uid,
382
- f"Start local evaluation on {eval_str}. Please wait for your job to start...\n",
 
 
 
 
 
 
 
 
 
 
 
 
383
  )
384
- gr.Info(f"Start local evaluation on {eval_str}")
385
 
386
  return (
387
  gr.update(interactive=False), # Submit button
 
1
  import collections
 
2
  import logging
 
3
  import threading
4
  import uuid
 
5
 
6
  import datasets
7
  import gradio as gr
8
  import pandas as pd
9
 
10
+ import leaderboard
11
+ from io_utils import read_column_mapping, write_column_mapping
12
+ from run_jobs import save_job_to_pipe
 
 
 
 
13
  from text_classification import (
14
  check_model_task,
15
  get_example_prediction,
 
25
  MAX_LABELS = 40
26
  MAX_FEATURES = 20
27
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  ds_dict = None
29
  ds_config = None
30
 
31
+
32
  def get_related_datasets_from_leaderboard(model_id):
33
  records = leaderboard.records
34
  model_records = records[records["model_id"] == model_id]
 
185
 
186
 
187
  def align_columns_and_show_prediction(
188
+ model_id,
189
+ dataset_id,
190
+ dataset_config,
191
+ dataset_split,
192
+ uid,
193
+ run_inference,
194
+ inference_token,
195
  ):
196
  model_task = check_model_task(model_id)
197
  if model_task is None or model_task != "text-classification":
 
291
  check_column_mapping_keys_validity(all_mappings)
292
  label_mapping, feature_mapping = construct_label_and_feature_mapping(all_mappings)
293
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  eval_str = f"[{m_id}]<{d_id}({config}, {split} set)>"
295
+ save_job_to_pipe(
 
 
 
296
  uid,
297
+ (
298
+ m_id,
299
+ d_id,
300
+ config,
301
+ split,
302
+ inference,
303
+ inference_token,
304
+ uid,
305
+ label_mapping,
306
+ feature_mapping,
307
+ ),
308
+ eval_str,
309
+ threading.Lock(),
310
  )
311
+ gr.Info("Your evaluation is submitted")
312
 
313
  return (
314
  gr.update(interactive=False), # Submit button
tmp/venvs/.gitkeep ADDED
File without changes