ZeroCommand committed on
Commit
63bdb5b
1 Parent(s): 5b24f7d

GSK-2396 allow edit feature mapping and scan config

Browse files
Files changed (4) hide show
  1. app.py +46 -31
  2. scan_config.yaml +8 -0
  3. text_classification.py +33 -23
  4. utils.py +24 -0
app.py CHANGED
@@ -11,13 +11,12 @@ import json
11
  from transformers.pipelines import TextClassificationPipeline
12
 
13
  from text_classification import check_column_mapping_keys_validity, text_classification_fix_column_mapping
14
-
15
 
16
  HF_REPO_ID = 'HF_REPO_ID'
17
  HF_SPACE_ID = 'SPACE_ID'
18
  HF_WRITE_TOKEN = 'HF_WRITE_TOKEN'
19
 
20
-
21
  theme = gr.themes.Soft(
22
  primary_hue="green",
23
  )
@@ -70,6 +69,7 @@ def try_validate(m_id, ppl, dataset_id, dataset_config, dataset_split, column_ma
70
  gr.update(visible=False), # Model prediction input
71
  gr.update(visible=False), # Model prediction preview
72
  gr.update(visible=False), # Label mapping preview
 
73
  )
74
  if isinstance(ppl, Exception):
75
  gr.Warning(f'Failed to load model": {ppl}')
@@ -80,6 +80,7 @@ def try_validate(m_id, ppl, dataset_id, dataset_config, dataset_split, column_ma
80
  gr.update(visible=False), # Model prediction input
81
  gr.update(visible=False), # Model prediction preview
82
  gr.update(visible=False), # Label mapping preview
 
83
  )
84
 
85
  # Validate dataset
@@ -105,7 +106,7 @@ def try_validate(m_id, ppl, dataset_id, dataset_config, dataset_split, column_ma
105
  gr.update(visible=False), # Model prediction input
106
  gr.update(visible=False), # Model prediction preview
107
  gr.update(visible=False), # Label mapping preview
108
- # gr.update(visible=True), # Column mapping
109
  )
110
 
111
  # TODO: Validate column mapping by running once
@@ -118,21 +119,21 @@ def try_validate(m_id, ppl, dataset_id, dataset_config, dataset_split, column_ma
118
  except Exception:
119
  column_mapping = {}
120
 
121
- column_mapping, prediction_input, prediction_result, id2label_df = \
122
  text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, split)
123
 
124
  column_mapping = json.dumps(column_mapping, indent=2)
125
 
126
- if prediction_result is None:
127
  gr.Warning('The model failed to predict with the first row in the dataset. Please provide column mappings in "Advance" settings.')
128
  return (
129
  gr.update(interactive=False), # Submit button
130
- gr.update(visible=True), # Loading row
131
- gr.update(visible=False), # Preview row
132
- gr.update(visible=False), # Model prediction input
133
  gr.update(visible=False), # Model prediction preview
134
- gr.update(visible=False), # Label mapping preview
135
- # gr.update(value=column_mapping, visible=True, interactive=True), # Column mapping
136
  )
137
  elif id2label_df is None:
138
  gr.Warning('The prediction result does not conform the labels in the dataset. Please provide label mappings in "Advance" settings.')
@@ -142,8 +143,8 @@ def try_validate(m_id, ppl, dataset_id, dataset_config, dataset_split, column_ma
142
  gr.update(visible=True), # Preview row
143
  gr.update(value=f'**Sample Input**: {prediction_input}', visible=True), # Model prediction input
144
  gr.update(value=prediction_result, visible=True), # Model prediction preview
145
- gr.update(visible=False), # Label mapping preview
146
- # gr.update(value=column_mapping, visible=True, interactive=True), # Column mapping
147
  )
148
 
149
  gr.Info("Model and dataset validations passed. Your can submit the evaluation task.")
@@ -155,6 +156,7 @@ def try_validate(m_id, ppl, dataset_id, dataset_config, dataset_split, column_ma
155
  gr.update(value=f'**Sample Input**: {prediction_input}', visible=True), # Model prediction input
156
  gr.update(value=prediction_result, visible=True), # Model prediction preview
157
  gr.update(value=id2label_df, visible=True, interactive=True), # Label mapping preview
 
158
  )
159
 
160
 
@@ -180,6 +182,7 @@ def try_submit(m_id, d_id, config, split, column_mappings, local):
180
  "--output_portal", "huggingface",
181
  # TODO: "--feature_mapping", json.dumps(column_mapping),
182
  "--label_mapping", json.dumps(label_mapping),
 
183
  ]
184
 
185
  eval_str = f"[{m_id}]<{d_id}({config}, {split} set)>"
@@ -221,12 +224,14 @@ with gr.Blocks(theme=theme) as iface:
221
  gr.Warning(f"Failed to load dataset {dataset_id} with config {dataset_config}: {e}")
222
  pass
223
 
224
- def gate_validate_btn(model_id, dataset_id, dataset_config, dataset_split, id2label_mapping_dataframe=None):
225
  column_mapping = '{}'
226
- m_id, ppl = check_model(model_id=model_id)
227
-
228
  if id2label_mapping_dataframe is not None:
229
- column_mapping = id2label_mapping_dataframe.to_json(orient="split")
 
 
 
230
  if check_column_mapping_keys_validity(column_mapping, ppl) is False:
231
  gr.Warning('Label mapping table has invalid contents. Please check again.')
232
  return (gr.update(interactive=False),
@@ -234,18 +239,18 @@ with gr.Blocks(theme=theme) as iface:
234
  gr.update(),
235
  gr.update(),
236
  gr.update(),
 
237
  gr.update())
238
  else:
239
  if model_id and dataset_id and dataset_config and dataset_split:
240
- return try_validate(m_id, ppl, dataset_id, dataset_config, dataset_split, column_mapping)
241
  else:
242
- del ppl
243
-
244
  return (gr.update(interactive=False),
245
  gr.update(visible=True),
246
  gr.update(visible=False),
247
  gr.update(visible=False),
248
  gr.update(visible=False),
 
249
  gr.update(visible=False))
250
  with gr.Row():
251
  gr.Markdown('''
@@ -256,6 +261,12 @@ with gr.Blocks(theme=theme) as iface:
256
  ''')
257
  with gr.Row():
258
  run_local = gr.Checkbox(value=True, label="Run in this Space")
 
 
 
 
 
 
259
 
260
  with gr.Row():
261
  model_id_input = gr.Textbox(
@@ -279,11 +290,11 @@ with gr.Blocks(theme=theme) as iface:
279
 
280
  with gr.Row(visible=True) as loading_row:
281
  gr.Markdown('''
282
- <h1 style="text-align: center;">
283
- Please validate your model and dataset first...
284
- </h1>
285
  ''')
286
-
287
  with gr.Row(visible=False) as preview_row:
288
  gr.Markdown('''
289
  <h1 style="text-align: center;">
@@ -294,7 +305,7 @@ with gr.Blocks(theme=theme) as iface:
294
 
295
  with gr.Row():
296
  id2label_mapping_dataframe = gr.DataFrame(label="Preview of label mapping", interactive=True, visible=False)
297
-
298
  with gr.Row():
299
  example_input = gr.Markdown('Sample Input: ', visible=False)
300
 
@@ -310,20 +321,24 @@ with gr.Blocks(theme=theme) as iface:
310
 
311
  model_id_input.change(gate_validate_btn,
312
  inputs=[model_id_input, dataset_id_input, dataset_config_input, dataset_split_input],
313
- outputs=[run_btn, loading_row, preview_row, example_input, example_labels, id2label_mapping_dataframe])
314
  dataset_id_input.change(gate_validate_btn,
315
  inputs=[model_id_input, dataset_id_input, dataset_config_input, dataset_split_input],
316
- outputs=[run_btn, loading_row, preview_row, example_input, example_labels, id2label_mapping_dataframe])
317
  dataset_config_input.change(gate_validate_btn,
318
  inputs=[model_id_input, dataset_id_input, dataset_config_input, dataset_split_input],
319
- outputs=[run_btn, loading_row, preview_row, example_input, example_labels, id2label_mapping_dataframe])
320
  dataset_split_input.change(gate_validate_btn,
321
  inputs=[model_id_input, dataset_id_input, dataset_config_input, dataset_split_input],
322
- outputs=[run_btn, loading_row, preview_row, example_input, example_labels, id2label_mapping_dataframe])
323
  id2label_mapping_dataframe.input(gate_validate_btn,
324
- inputs=[model_id_input, dataset_id_input, dataset_config_input, dataset_split_input, id2label_mapping_dataframe],
325
- outputs=[run_btn, loading_row, preview_row, example_input, example_labels, id2label_mapping_dataframe])
326
-
 
 
 
 
327
  run_btn.click(
328
  try_submit,
329
  inputs=[
 
11
  from transformers.pipelines import TextClassificationPipeline
12
 
13
  from text_classification import check_column_mapping_keys_validity, text_classification_fix_column_mapping
14
+ from utils import read_scanners, write_scanners, convert_column_mapping_to_json
15
 
16
  HF_REPO_ID = 'HF_REPO_ID'
17
  HF_SPACE_ID = 'SPACE_ID'
18
  HF_WRITE_TOKEN = 'HF_WRITE_TOKEN'
19
 
 
20
  theme = gr.themes.Soft(
21
  primary_hue="green",
22
  )
 
69
  gr.update(visible=False), # Model prediction input
70
  gr.update(visible=False), # Model prediction preview
71
  gr.update(visible=False), # Label mapping preview
72
+ gr.update(visible=False), # feature mapping preview
73
  )
74
  if isinstance(ppl, Exception):
75
  gr.Warning(f'Failed to load model": {ppl}')
 
80
  gr.update(visible=False), # Model prediction input
81
  gr.update(visible=False), # Model prediction preview
82
  gr.update(visible=False), # Label mapping preview
83
+ gr.update(visible=False), # feature mapping preview
84
  )
85
 
86
  # Validate dataset
 
106
  gr.update(visible=False), # Model prediction input
107
  gr.update(visible=False), # Model prediction preview
108
  gr.update(visible=False), # Label mapping preview
109
+ gr.update(visible=False), # feature mapping preview
110
  )
111
 
112
  # TODO: Validate column mapping by running once
 
119
  except Exception:
120
  column_mapping = {}
121
 
122
+ column_mapping, prediction_input, prediction_result, id2label_df, feature_df = \
123
  text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, split)
124
 
125
  column_mapping = json.dumps(column_mapping, indent=2)
126
 
127
+ if prediction_result is None and id2label_df is not None:
128
  gr.Warning('The model failed to predict with the first row in the dataset. Please provide column mappings in "Advance" settings.')
129
  return (
130
  gr.update(interactive=False), # Submit button
131
+ gr.update(visible=False), # Loading row
132
+ gr.update(visible=True), # Preview row
133
+ gr.update(value=f'**Sample Input**: {prediction_input}', visible=True), # Model prediction input
134
  gr.update(visible=False), # Model prediction preview
135
+ gr.update(value=id2label_df, visible=True, interactive=True), # Label mapping preview
136
+ gr.update(value=feature_df, visible=True, interactive=True), # feature mapping preview
137
  )
138
  elif id2label_df is None:
139
  gr.Warning('The prediction result does not conform the labels in the dataset. Please provide label mappings in "Advance" settings.')
 
143
  gr.update(visible=True), # Preview row
144
  gr.update(value=f'**Sample Input**: {prediction_input}', visible=True), # Model prediction input
145
  gr.update(value=prediction_result, visible=True), # Model prediction preview
146
+ gr.update(visible=True, interactive=True), # Label mapping preview
147
+ gr.update(visible=True, interactive=True), # feature mapping preview
148
  )
149
 
150
  gr.Info("Model and dataset validations passed. Your can submit the evaluation task.")
 
156
  gr.update(value=f'**Sample Input**: {prediction_input}', visible=True), # Model prediction input
157
  gr.update(value=prediction_result, visible=True), # Model prediction preview
158
  gr.update(value=id2label_df, visible=True, interactive=True), # Label mapping preview
159
+ gr.update(value=feature_df, visible=True, interactive=True), # feature mapping preview
160
  )
161
 
162
 
 
182
  "--output_portal", "huggingface",
183
  # TODO: "--feature_mapping", json.dumps(column_mapping),
184
  "--label_mapping", json.dumps(label_mapping),
185
+ "--scan_config", "./scan_config.yaml",
186
  ]
187
 
188
  eval_str = f"[{m_id}]<{d_id}({config}, {split} set)>"
 
224
  gr.Warning(f"Failed to load dataset {dataset_id} with config {dataset_config}: {e}")
225
  pass
226
 
227
+ def gate_validate_btn(model_id, dataset_id, dataset_config, dataset_split, id2label_mapping_dataframe=None, feature_mapping_dataframe=None):
228
  column_mapping = '{}'
229
+ _, ppl = check_model(model_id=model_id)
 
230
  if id2label_mapping_dataframe is not None:
231
+ labels = convert_column_mapping_to_json(id2label_mapping_dataframe.value, label="data")
232
+ features = convert_column_mapping_to_json(feature_mapping_dataframe.value, label="text")
233
+ column_mapping = json.dumps({**labels, **features}, indent=2)
234
+ print('229 >>>>> ', column_mapping)
235
  if check_column_mapping_keys_validity(column_mapping, ppl) is False:
236
  gr.Warning('Label mapping table has invalid contents. Please check again.')
237
  return (gr.update(interactive=False),
 
239
  gr.update(),
240
  gr.update(),
241
  gr.update(),
242
+ gr.update(),
243
  gr.update())
244
  else:
245
  if model_id and dataset_id and dataset_config and dataset_split:
246
+ return try_validate(model_id, ppl, dataset_id, dataset_config, dataset_split, column_mapping)
247
  else:
 
 
248
  return (gr.update(interactive=False),
249
  gr.update(visible=True),
250
  gr.update(visible=False),
251
  gr.update(visible=False),
252
  gr.update(visible=False),
253
+ gr.update(visible=False),
254
  gr.update(visible=False))
255
  with gr.Row():
256
  gr.Markdown('''
 
261
  ''')
262
  with gr.Row():
263
  run_local = gr.Checkbox(value=True, label="Run in this Space")
264
+ run_inference = gr.Checkbox(value=False, label="Run with Inference API")
265
+
266
+ with gr.Row() as advanced_row:
267
+ selected = read_scanners('./scan_config.yaml')
268
+ scan_config = selected + ['data_leakage']
269
+ scanners = gr.CheckboxGroup(choices=scan_config, value=selected, label='Scan Settings', visible=True)
270
 
271
  with gr.Row():
272
  model_id_input = gr.Textbox(
 
290
 
291
  with gr.Row(visible=True) as loading_row:
292
  gr.Markdown('''
293
+ <p style="text-align: center;">
294
+ 🚀🐢Please validate your model and dataset first...
295
+ </p>
296
  ''')
297
+
298
  with gr.Row(visible=False) as preview_row:
299
  gr.Markdown('''
300
  <h1 style="text-align: center;">
 
305
 
306
  with gr.Row():
307
  id2label_mapping_dataframe = gr.DataFrame(label="Preview of label mapping", interactive=True, visible=False)
308
+ feature_mapping_dataframe = gr.DataFrame(label="Preview of feature mapping", interactive=True, visible=False)
309
  with gr.Row():
310
  example_input = gr.Markdown('Sample Input: ', visible=False)
311
 
 
321
 
322
  model_id_input.change(gate_validate_btn,
323
  inputs=[model_id_input, dataset_id_input, dataset_config_input, dataset_split_input],
324
+ outputs=[run_btn, loading_row, preview_row, example_input, example_labels, id2label_mapping_dataframe, feature_mapping_dataframe])
325
  dataset_id_input.change(gate_validate_btn,
326
  inputs=[model_id_input, dataset_id_input, dataset_config_input, dataset_split_input],
327
+ outputs=[run_btn, loading_row, preview_row, example_input, example_labels, id2label_mapping_dataframe, feature_mapping_dataframe])
328
  dataset_config_input.change(gate_validate_btn,
329
  inputs=[model_id_input, dataset_id_input, dataset_config_input, dataset_split_input],
330
+ outputs=[run_btn, loading_row, preview_row, example_input, example_labels, id2label_mapping_dataframe, feature_mapping_dataframe])
331
  dataset_split_input.change(gate_validate_btn,
332
  inputs=[model_id_input, dataset_id_input, dataset_config_input, dataset_split_input],
333
+ outputs=[run_btn, loading_row, preview_row, example_input, example_labels, id2label_mapping_dataframe, feature_mapping_dataframe])
334
  id2label_mapping_dataframe.input(gate_validate_btn,
335
+ inputs=[model_id_input, dataset_id_input, dataset_config_input, dataset_split_input, id2label_mapping_dataframe, feature_mapping_dataframe],
336
+ outputs=[run_btn, loading_row, preview_row, example_input, example_labels, id2label_mapping_dataframe, feature_mapping_dataframe])
337
+ feature_mapping_dataframe.input(gate_validate_btn,
338
+ inputs=[model_id_input, dataset_id_input, dataset_config_input, dataset_split_input, id2label_mapping_dataframe, feature_mapping_dataframe],
339
+ outputs=[run_btn, loading_row, preview_row, example_input, example_labels, id2label_mapping_dataframe, feature_mapping_dataframe])
340
+ scanners.change(write_scanners, inputs=scanners)
341
+
342
  run_btn.click(
343
  try_submit,
344
  inputs=[
scan_config.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ detectors:
2
+ - ethical_bias
3
+ - text_perturbation
4
+ - robustness
5
+ - performance
6
+ - underconfidence
7
+ - overconfidence
8
+ - spurious_correlation
text_classification.py CHANGED
@@ -19,9 +19,8 @@ def text_classification_map_model_and_dataset_labels(id2label, dataset_features)
19
  continue
20
  if len(feature.names) != len(id2label_mapping.keys()):
21
  continue
22
-
23
  dataset_labels = feature.names
24
-
25
  # Try to match labels
26
  for label in feature.names:
27
  if label in id2label_mapping.keys():
@@ -31,6 +30,8 @@ def text_classification_map_model_and_dataset_labels(id2label, dataset_features)
31
  model_label, label = text_classificaiton_match_label_case_unsensative(id2label_mapping, label)
32
  if model_label is not None:
33
  id2label_mapping[model_label] = label
 
 
34
 
35
  return id2label_mapping, dataset_labels
36
 
@@ -52,15 +53,15 @@ def check_column_mapping_keys_validity(column_mapping, ppl):
52
  def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, split):
53
  # We assume dataset is ok here
54
  ds = datasets.load_dataset(d_id, config)[split]
55
-
56
  try:
57
  dataset_features = ds.features
58
  except AttributeError:
59
  # Dataset does not have features, need to provide everything
60
- return None, None, None
61
-
62
  # Check whether we need to infer the text input column
63
  infer_text_input_column = True
 
64
  if "text" in column_mapping.keys():
65
  dataset_text_column = column_mapping["text"]
66
  if dataset_text_column in dataset_features.keys():
@@ -71,12 +72,16 @@ def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, sp
71
  if infer_text_input_column:
72
  # Try to retrieve one
73
  candidates = [f for f in dataset_features if dataset_features[f].dtype == "string"]
 
 
 
 
74
  if len(candidates) > 0:
75
  logging.debug(f"Candidates are {candidates}")
76
  column_mapping["text"] = candidates[0]
77
  else:
78
  # Not found a text feature
79
- return column_mapping, None, None
80
 
81
  # Load dataset as DataFrame
82
  df = ds.to_pandas()
@@ -85,24 +90,14 @@ def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, sp
85
  id2label_mapping = {}
86
  id2label = ppl.model.config.id2label
87
  label2id = {v: k for k, v in id2label.items()}
88
- prediction_input = None
89
- prediction_result = None
90
- try:
91
- # Use the first item to test prediction
92
- prediction_input = df.head(1).at[0, column_mapping["text"]]
93
- results = ppl({"text": prediction_input}, top_k=None)
94
- prediction_result = {
95
- f'{result["label"]}({label2id[result["label"]]})': result["score"] for result in results
96
- }
97
- except Exception:
98
- # Pipeline prediction failed, need to provide labels
99
- return column_mapping, None, None
100
 
101
  # Infer labels
102
  id2label_mapping, dataset_labels = text_classification_map_model_and_dataset_labels(id2label, dataset_features)
103
  id2label_mapping_dataset_model = {
104
  v: k for k, v in id2label_mapping.items()
105
  }
 
 
106
  if "data" in column_mapping.keys():
107
  if isinstance(column_mapping["data"], list):
108
  # Use the column mapping passed by user
@@ -112,15 +107,30 @@ def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, sp
112
  column_mapping["label"] = {
113
  i: None for i in id2label.keys()
114
  }
115
- return column_mapping, prediction_result, None
116
 
117
- prediction_result = {
118
- f'[{label2id[result["label"]]}]{result["label"]}(original) - {id2label_mapping[result["label"]]}(mapped)': result["score"] for result in results
119
- }
120
  id2label_df = pd.DataFrame({
121
  "Dataset Labels": dataset_labels,
122
  "Model Prediction Labels": [id2label_mapping_dataset_model[label] for label in dataset_labels],
123
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
  if "data" not in column_mapping.keys():
126
  # Column mapping should contain original model labels
@@ -128,4 +138,4 @@ def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, sp
128
  str(i): id2label_mapping_dataset_model[label] for i, label in zip(id2label.keys(), dataset_labels)
129
  }
130
 
131
- return column_mapping, prediction_input, prediction_result, id2label_df
 
19
  continue
20
  if len(feature.names) != len(id2label_mapping.keys()):
21
  continue
22
+
23
  dataset_labels = feature.names
 
24
  # Try to match labels
25
  for label in feature.names:
26
  if label in id2label_mapping.keys():
 
30
  model_label, label = text_classificaiton_match_label_case_unsensative(id2label_mapping, label)
31
  if model_label is not None:
32
  id2label_mapping[model_label] = label
33
+ else:
34
+ print(f"Label {label} is not found in model labels")
35
 
36
  return id2label_mapping, dataset_labels
37
 
 
53
  def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, split):
54
  # We assume dataset is ok here
55
  ds = datasets.load_dataset(d_id, config)[split]
 
56
  try:
57
  dataset_features = ds.features
58
  except AttributeError:
59
  # Dataset does not have features, need to provide everything
60
+ return None, None, None, None, None
61
+
62
  # Check whether we need to infer the text input column
63
  infer_text_input_column = True
64
+ feature_map_df = None
65
  if "text" in column_mapping.keys():
66
  dataset_text_column = column_mapping["text"]
67
  if dataset_text_column in dataset_features.keys():
 
72
  if infer_text_input_column:
73
  # Try to retrieve one
74
  candidates = [f for f in dataset_features if dataset_features[f].dtype == "string"]
75
+ feature_map_df = pd.DataFrame({
76
+ "Dataset Features": [candidates[0]],
77
+ "Model Input Features": ["text"]
78
+ })
79
  if len(candidates) > 0:
80
  logging.debug(f"Candidates are {candidates}")
81
  column_mapping["text"] = candidates[0]
82
  else:
83
  # Not found a text feature
84
+ return column_mapping, None, None, feature_map_df
85
 
86
  # Load dataset as DataFrame
87
  df = ds.to_pandas()
 
90
  id2label_mapping = {}
91
  id2label = ppl.model.config.id2label
92
  label2id = {v: k for k, v in id2label.items()}
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  # Infer labels
95
  id2label_mapping, dataset_labels = text_classification_map_model_and_dataset_labels(id2label, dataset_features)
96
  id2label_mapping_dataset_model = {
97
  v: k for k, v in id2label_mapping.items()
98
  }
99
+
100
+ # TODO: convert dataframe column mapping to json properly
101
  if "data" in column_mapping.keys():
102
  if isinstance(column_mapping["data"], list):
103
  # Use the column mapping passed by user
 
107
  column_mapping["label"] = {
108
  i: None for i in id2label.keys()
109
  }
110
+ return column_mapping, None, None, None, feature_map_df
111
 
 
 
 
112
  id2label_df = pd.DataFrame({
113
  "Dataset Labels": dataset_labels,
114
  "Model Prediction Labels": [id2label_mapping_dataset_model[label] for label in dataset_labels],
115
  })
116
+
117
+ prediction_input = None
118
+ prediction_result = None
119
+ try:
120
+ # Use the first item to test prediction
121
+ prediction_input = df.head(1).at[0, column_mapping["text"]]
122
+ results = ppl({"text": prediction_input}, top_k=None)
123
+ prediction_result = {
124
+ f'{result["label"]}({label2id[result["label"]]})': result["score"] for result in results
125
+ }
126
+ except Exception as e:
127
+ # Pipeline prediction failed, need to provide labels
128
+ print(e, '>>>> error')
129
+ return column_mapping, prediction_input, None, id2label_df, feature_map_df
130
+
131
+ prediction_result = {
132
+ f'[{label2id[result["label"]]}]{result["label"]}(original) - {id2label_mapping[result["label"]]}(mapped)': result["score"] for result in results
133
+ }
134
 
135
  if "data" not in column_mapping.keys():
136
  # Column mapping should contain original model labels
 
138
  str(i): id2label_mapping_dataset_model[label] for i, label in zip(id2label.keys(), dataset_labels)
139
  }
140
 
141
+ return column_mapping, prediction_input, prediction_result, id2label_df, feature_map_df
utils.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import sys
# read scanners from yaml file
# return a list of scanners
def read_scanners(path):
    """Read the list of enabled scanners from a YAML scan-config file.

    Args:
        path: Path to the YAML config file (expects a top-level
            ``detectors`` key holding a list of scanner names).

    Returns:
        list: The scanner names. Returns ``[]`` when the file is empty or
        has no ``detectors`` key — the previous version returned ``None``
        in that case, which breaks callers that concatenate the result
        with another list (e.g. ``read_scanners(...) + ['data_leakage']``).
    """
    with open(path, "r") as f:
        # safe_load avoids arbitrary Python object construction from YAML
        config = yaml.safe_load(f) or {}
    return config.get("detectors", [])
11
+
# convert a list of scanners to yaml file
def write_scanners(scanners, path="./scan_config.yaml"):
    """Persist the selected scanners to a YAML scan-config file.

    Args:
        scanners: List of scanner names to store under the ``detectors`` key.
        path: Destination YAML file. Defaults to the path the previous
            hard-coded implementation used, so existing callers
            (e.g. the Gradio ``scanners.change`` handler) are unaffected.
    """
    with open(path, "w") as f:
        # save scanners to detectors in yaml
        yaml.dump({"detectors": scanners}, f)
17
+
# convert column mapping dataframe to json
def convert_column_mapping_to_json(df, label=""):
    """Convert a column-mapping DataFrame into a ``{label: rows}`` dict.

    Every row of *df* becomes a plain Python list; all rows are collected
    under the single *label* key.

    Args:
        df: A pandas DataFrame of mapping rows.
        label: Key under which the row lists are stored (defaults to "").

    Returns:
        dict: ``{label: [[...row values...], ...]}``.
    """
    rows = [row.tolist() for _, row in df.iterrows()]
    return {label: rows}