ZeroCommand committed
Commit 09f3a52
Parent: a48ba21

update inference api arg in yaml; update error handling

Files changed (4):
  1. app.py +27 -14
  2. scan_config.yaml → config.yaml +3 -1
  3. text_classification.py +27 -15
  4. utils.py +33 -3
app.py CHANGED
@@ -11,7 +11,7 @@ import json
 from transformers.pipelines import TextClassificationPipeline
 
 from text_classification import check_column_mapping_keys_validity, text_classification_fix_column_mapping
-from utils import read_scanners, write_scanners, convert_column_mapping_to_json
+from utils import read_scanners, write_scanners, read_model_type, write_model_type, convert_column_mapping_to_json
 
 HF_REPO_ID = 'HF_REPO_ID'
 HF_SPACE_ID = 'SPACE_ID'
@@ -160,10 +160,14 @@ def try_validate(m_id, ppl, dataset_id, dataset_config, dataset_split, column_ma
     )
 
 
-def try_submit(m_id, d_id, config, split, column_mappings, local):
+def try_submit(m_id, d_id, config, split, id2label_mapping_dataframe, feature_mapping_dataframe, local):
     label_mapping = {}
-    for i, label in column_mappings["Model Prediction Labels"].items():
+    for i, label in id2label_mapping_dataframe["Model Prediction Labels"].items():
         label_mapping.update({str(i): label})
+
+    feature_mapping = {}
+    for i, feature in feature_mapping_dataframe["Dataset Features"].items():
+        feature_mapping.update({feature_mapping_dataframe["Model Input Features"][i]: feature})
 
     # TODO: Set column mapping for some dataset such as `amazon_polarity`
 
@@ -180,9 +184,9 @@ def try_submit(m_id, d_id, config, split, column_mappings, local):
         "--discussion_repo", os.environ.get(HF_REPO_ID) or os.environ.get(HF_SPACE_ID),
         "--output_format", "markdown",
         "--output_portal", "huggingface",
-        # TODO: "--feature_mapping", json.dumps(column_mapping),
+        "--feature_mapping", json.dumps(feature_mapping),
         "--label_mapping", json.dumps(label_mapping),
-        "--scan_config", "./scan_config.yaml",
+        "--scan_config", "./config.yaml",
     ]
 
     eval_str = f"[{m_id}]<{d_id}({config}, {split} set)>"
@@ -227,11 +231,12 @@ with gr.Blocks(theme=theme) as iface:
     def gate_validate_btn(model_id, dataset_id, dataset_config, dataset_split, id2label_mapping_dataframe=None, feature_mapping_dataframe=None):
         column_mapping = '{}'
         _, ppl = check_model(model_id=model_id)
+
         if id2label_mapping_dataframe is not None:
             labels = convert_column_mapping_to_json(id2label_mapping_dataframe.value, label="data")
             features = convert_column_mapping_to_json(feature_mapping_dataframe.value, label="text")
             column_mapping = json.dumps({**labels, **features}, indent=2)
-            print('229 >>>>> ', column_mapping)
+
         if check_column_mapping_keys_validity(column_mapping, ppl) is False:
             gr.Warning('Label mapping table has invalid contents. Please check again.')
             return (gr.update(interactive=False),
@@ -261,10 +266,11 @@ with gr.Blocks(theme=theme) as iface:
     ''')
     with gr.Row():
         run_local = gr.Checkbox(value=True, label="Run in this Space")
-        run_inference = gr.Checkbox(value=False, label="Run with Inference API")
+        use_inference = read_model_type('./config.yaml')[0] == 'hf_inference_api'
+        run_inference = gr.Checkbox(value=use_inference, label="Run with Inference API")
 
     with gr.Row() as advanced_row:
-        selected = read_scanners('./scan_config.yaml')
+        selected = read_scanners('./config.yaml')
         scan_config = selected + ['data_leakage']
         scanners = gr.CheckboxGroup(choices=scan_config, value=selected, label='Scan Settings', visible=True)
 
@@ -282,8 +288,10 @@ with gr.Blocks(theme=theme) as iface:
     dataset_config_input = gr.Dropdown(['default'], value='default', label='Dataset Config', visible=False)
     dataset_split_input = gr.Dropdown(['default'], value='default', label='Dataset Split', visible=False)
 
-    dataset_id_input.change(check_dataset_and_get_config, dataset_id_input, dataset_config_input)
-    dataset_config_input.change(
+    dataset_id_input.blur(check_dataset_and_get_config, dataset_id_input, dataset_config_input)
+    dataset_id_input.submit(check_dataset_and_get_config, dataset_id_input, dataset_config_input)
+
+    dataset_config_input.blur(
         check_dataset_and_get_split,
         inputs=[dataset_config_input, dataset_id_input],
         outputs=[dataset_split_input])
@@ -319,16 +327,16 @@ with gr.Blocks(theme=theme) as iface:
         size="lg",
     )
 
-    model_id_input.change(gate_validate_btn,
+    model_id_input.blur(gate_validate_btn,
                          inputs=[model_id_input, dataset_id_input, dataset_config_input, dataset_split_input],
                          outputs=[run_btn, loading_row, preview_row, example_input, example_labels, id2label_mapping_dataframe, feature_mapping_dataframe])
-    dataset_id_input.change(gate_validate_btn,
+    dataset_id_input.blur(gate_validate_btn,
                          inputs=[model_id_input, dataset_id_input, dataset_config_input, dataset_split_input],
                          outputs=[run_btn, loading_row, preview_row, example_input, example_labels, id2label_mapping_dataframe, feature_mapping_dataframe])
-    dataset_config_input.change(gate_validate_btn,
+    dataset_config_input.input(gate_validate_btn,
                          inputs=[model_id_input, dataset_id_input, dataset_config_input, dataset_split_input],
                          outputs=[run_btn, loading_row, preview_row, example_input, example_labels, id2label_mapping_dataframe, feature_mapping_dataframe])
-    dataset_split_input.change(gate_validate_btn,
+    dataset_split_input.input(gate_validate_btn,
                          inputs=[model_id_input, dataset_id_input, dataset_config_input, dataset_split_input],
                          outputs=[run_btn, loading_row, preview_row, example_input, example_labels, id2label_mapping_dataframe, feature_mapping_dataframe])
     id2label_mapping_dataframe.input(gate_validate_btn,
@@ -338,6 +346,10 @@ with gr.Blocks(theme=theme) as iface:
                          inputs=[model_id_input, dataset_id_input, dataset_config_input, dataset_split_input, id2label_mapping_dataframe, feature_mapping_dataframe],
                          outputs=[run_btn, loading_row, preview_row, example_input, example_labels, id2label_mapping_dataframe, feature_mapping_dataframe])
     scanners.change(write_scanners, inputs=scanners)
+    run_inference.change(
+        write_model_type,
+        inputs=[run_inference]
+    )
 
     run_btn.click(
         try_submit,
@@ -347,6 +359,7 @@ with gr.Blocks(theme=theme) as iface:
             dataset_config_input,
             dataset_split_input,
             id2label_mapping_dataframe,
+            feature_mapping_dataframe,
             run_local,
         ],
         outputs=[
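
Note: the two loops at the top of the new try_submit flatten the Gradio mapping tables into the JSON strings handed to the CLI. A minimal sketch of that behaviour, with plain pandas DataFrames standing in for the table values Gradio passes (the column contents here are illustrative, not from the commit):

    import json
    import pandas as pd

    # Toy stand-ins for id2label_mapping_dataframe and feature_mapping_dataframe
    id2label_df = pd.DataFrame({"Model Prediction Labels": ["NEGATIVE", "POSITIVE"]})
    feature_df = pd.DataFrame({"Model Input Features": ["text"],
                               "Dataset Features": ["sentence"]})

    label_mapping = {}
    for i, label in id2label_df["Model Prediction Labels"].items():
        label_mapping.update({str(i): label})

    feature_mapping = {}
    for i, feature in feature_df["Dataset Features"].items():
        feature_mapping.update({feature_df["Model Input Features"][i]: feature})

    print(json.dumps(label_mapping))    # {"0": "NEGATIVE", "1": "POSITIVE"}
    print(json.dumps(feature_mapping))  # {"text": "sentence"}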
scan_config.yaml → config.yaml RENAMED
@@ -5,4 +5,6 @@ detectors:
   - performance
   - underconfidence
   - overconfidence
-  - spurious_correlation
+  - spurious_correlation
+model_type:
+  - hf_inference_api
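
The renamed file now carries the model type alongside the scanner list. A quick sketch of reading the new key back, mirroring the use_inference line added in app.py (it assumes the file contains the keys shown in the hunk):

    import yaml

    with open("./config.yaml") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    # Mirrors app.py: the checkbox default comes from the first model_type entry
    use_inference = config["model_type"][0] == "hf_inference_api"
    print(use_inference)  # True for the file as committed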
text_classification.py CHANGED
@@ -35,7 +35,18 @@ def text_classification_map_model_and_dataset_labels(id2label, dataset_features)
 
     return id2label_mapping, dataset_labels
 
-
+'''
+params:
+    column_mapping: dict
+        example: {
+            "text": "sentences",
+            "label": {
+                "label0": "LABEL_0",
+                "label1": "LABEL_1"
+            }
+        }
+    ppl: pipeline
+'''
 def check_column_mapping_keys_validity(column_mapping, ppl):
     # get the element in all the list elements
     column_mapping = json.loads(column_mapping)
@@ -49,16 +60,7 @@ def check_column_mapping_keys_validity(column_mapping, ppl):
 
     return user_labels == model_labels == original_labels
 
-
-def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, split):
-    # We assume dataset is ok here
-    ds = datasets.load_dataset(d_id, config)[split]
-    try:
-        dataset_features = ds.features
-    except AttributeError:
-        # Dataset does not have features, need to provide everything
-        return None, None, None, None, None
-
+def infer_text_input_column(column_mapping, dataset_features):
     # Check whether we need to infer the text input column
     infer_text_input_column = True
     feature_map_df = None
@@ -79,9 +81,19 @@ def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, sp
     if len(candidates) > 0:
         logging.debug(f"Candidates are {candidates}")
         column_mapping["text"] = candidates[0]
-    else:
-        # Not found a text feature
-        return column_mapping, None, None, feature_map_df
+
+    return column_mapping, feature_map_df
+
+def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, split):
+    # We assume dataset is ok here
+    ds = datasets.load_dataset(d_id, config)[split]
+    try:
+        dataset_features = ds.features
+    except AttributeError:
+        # Dataset does not have features, need to provide everything
+        return None, None, None, None, None
+
+    column_mapping, feature_map_df = infer_text_input_column(column_mapping, dataset_features)
 
     # Load dataset as DataFrame
     df = ds.to_pandas()
@@ -97,7 +109,6 @@ def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, sp
         v: k for k, v in id2label_mapping.items()
     }
 
-    # TODO: convert dataframe column mapping to json properly
    if "data" in column_mapping.keys():
        if isinstance(column_mapping["data"], list):
            # Use the column mapping passed by user
@@ -114,6 +125,7 @@ def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, sp
         "Model Prediction Labels": [id2label_mapping_dataset_model[label] for label in dataset_labels],
     })
 
+    # get a sample prediction from the model on the dataset
     prediction_input = None
     prediction_result = None
     try:
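
The new triple-quoted comment documents the column_mapping shape that check_column_mapping_keys_validity expects. A short sketch of building that input, using the illustrative names from the comment ("sentences" and the label0/label1 keys are examples, not fixed identifiers):

    import json

    column_mapping = json.dumps({
        "text": "sentences",   # dataset column that feeds the model's text input
        "label": {             # dataset label -> model prediction label
            "label0": "LABEL_0",
            "label1": "LABEL_1",
        },
    })

    # check_column_mapping_keys_validity json-decodes this string and returns
    # True only when the user, model, and original dataset label sets all agree.
    print(sorted(json.loads(column_mapping)["label"]))  # ['label0', 'label1']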
utils.py CHANGED
@@ -1,5 +1,11 @@
 import yaml
-import sys
+
+YAML_PATH = "./config.yaml"
+
+class Dumper(yaml.Dumper):
+    def increase_indent(self, flow=False, *args, **kwargs):
+        return super().increase_indent(flow=flow, indentless=False)
+
 # read scanners from yaml file
 # return a list of scanners
 def read_scanners(path):
@@ -11,9 +17,33 @@ def read_scanners(path):
 
 # convert a list of scanners to yaml file
 def write_scanners(scanners):
-    with open("./scan_config.yaml", "w") as f:
+    with open(YAML_PATH, "r") as f:
+        config = yaml.load(f, Loader=yaml.FullLoader)
+
+    config["detectors"] = scanners
+    with open(YAML_PATH, "w") as f:
         # save scanners to detectors in yaml
-        yaml.dump({"detectors": scanners}, f)
+        yaml.dump(config, f, Dumper=Dumper)
+
+# read model_type from yaml file
+def read_model_type(path):
+    model_type = ""
+    with open(path, "r") as f:
+        config = yaml.load(f, Loader=yaml.FullLoader)
+        model_type = config.get("model_type", None)
+    return model_type
+
+# write model_type to yaml file
+def write_model_type(use_inference):
+    with open(YAML_PATH, "r") as f:
+        config = yaml.load(f, Loader=yaml.FullLoader)
+    if use_inference:
+        config["model_type"] = ['hf_inference_api']
+    else:
+        config["model_type"] = ['hf_pipeline']
+    with open(YAML_PATH, "w") as f:
+        # save model_type to model_type in yaml
+        yaml.dump(config, f, Dumper=Dumper)
 
 # convert column mapping dataframe to json
 def convert_column_mapping_to_json(df, label=""):
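
write_scanners now round-trips the whole config instead of rewriting it from scratch, so it no longer drops unrelated keys such as the new model_type. A minimal round-trip sketch for the new helpers (assumes ./config.yaml exists alongside utils.py):

    from utils import read_model_type, write_model_type

    write_model_type(True)                   # checkbox ticked
    print(read_model_type("./config.yaml"))  # ['hf_inference_api']

    write_model_type(False)                  # checkbox cleared
    print(read_model_type("./config.yaml"))  # ['hf_pipeline']

The Dumper subclass is the usual PyYAML recipe for indented block sequences: with indentless=False, list items are emitted indented under their key, matching the style config.yaml already uses so repeated rewrites stay diff-stable.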