polinaeterna HF staff committed on
Commit
ac73d94
•
1 Parent(s): 9962eae

fix toxicity for bad requests

Browse files
Files changed (1) hide show
  1. app.py +15 -8
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import requests
2
  from collections import Counter
3
 
4
- from fontTools.subset import subset
5
  from requests.adapters import HTTPAdapter, Retry
6
  import os
7
  import time
@@ -101,6 +100,7 @@ def run_quality_check(dataset, config, split, column, nested_column, batch_size,
101
 
102
  try:
103
  logging.info(f"Loading hf://datasets/{dataset}@~parquet/{filename}")
 
104
  data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column])
105
  except Exception as error:
106
  yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
@@ -151,7 +151,8 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
151
  headers = {
152
  "content-type": "application/json",
153
  }
154
- req_att_scores = {attr: [] for attr in REQUESTED_ATTRIBUTES}
 
155
 
156
  # fetch data if it doesn't exist yet
157
  if texts_df.values.tolist() == [['', '', '']]:
@@ -164,6 +165,7 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
164
 
165
  try:
166
  logging.info(f"Loading hf://datasets/{dataset}@~parquet/{filename}")
 
167
  texts_df = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column_name])
168
  except Exception as error:
169
  yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
@@ -188,11 +190,13 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
188
  except Exception as e:
189
  logging.info(e)
190
  logging.info(data)
191
- return req_att_scores
 
192
 
193
  if req_response.ok:
194
  response = req_response.json()
195
  if ATT_SCORE in response:
 
196
  for req_att in REQUESTED_ATTRIBUTES:
197
  if req_att in response[ATT_SCORE]:
198
  att_score = response[ATT_SCORE][req_att][SUM_SCORE]["value"]
@@ -206,13 +210,16 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
206
  req_response.raise_for_status()
207
  except Exception as e:
208
  logging.info(e)
209
- return req_att_scores
 
 
 
210
  if i % 10 == 0:
211
  plot_toxicity(req_att_scores)
212
- yield {"toxicity check in progress...": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({column_name: texts[:i+1], **req_att_scores})
213
 
214
  plot_toxicity(req_att_scores)
215
- yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({column_name: texts, **req_att_scores})
216
 
217
 
218
  with gr.Blocks() as demo:
@@ -326,7 +333,7 @@ with gr.Blocks() as demo:
326
  gr.Markdown("## Run nvidia quality classifier")
327
  batch_size = gr.Slider(0, 64, 32, step=4, label="Inference batch size", info="(set this to smaller value if this space crashes.)")
328
  num_examples = gr.Slider(0, 5000, 500, step=10, label="Number of examples", info="Number of random examples to run quality classifier on")
329
- gr_check_btn = gr.Button("Check Dataset")
330
  progress_bar = gr.Label(show_label=False)
331
  plot = gr.BarPlot()
332
 
@@ -365,7 +372,7 @@ with gr.Blocks() as demo:
365
  gr.Markdown("""## Explore toxicity
366
  Run [Perspective](https://perspectiveapi.com/how-it-works/) on 100 random samples to check toxicity
367
  """)
368
- gr_toxicity_btn = gr.Button("Run Perpspective")
369
  toxicity_progress_bar = gr.Label(show_label=False)
370
  toxicity_hist = gr.Plot()
371
  with gr.Accordion("Explore examples with toxicity scores:", open=False):
 
1
  import requests
2
  from collections import Counter
3
 
 
4
  from requests.adapters import HTTPAdapter, Retry
5
  import os
6
  import time
 
100
 
101
  try:
102
  logging.info(f"Loading hf://datasets/{dataset}@~parquet/{filename}")
103
+ yield f"loading data...", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
104
  data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column])
105
  except Exception as error:
106
  yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
 
151
  headers = {
152
  "content-type": "application/json",
153
  }
154
+ req_att_scores = {**{attr: [] for attr in REQUESTED_ATTRIBUTES}}
155
+ texts_processed = {column_name: []}
156
 
157
  # fetch data if it doesn't exist yet
158
  if texts_df.values.tolist() == [['', '', '']]:
 
165
 
166
  try:
167
  logging.info(f"Loading hf://datasets/{dataset}@~parquet/{filename}")
168
+ yield f"loading data...", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
169
  texts_df = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column_name])
170
  except Exception as error:
171
  yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
 
190
  except Exception as e:
191
  logging.info(e)
192
  logging.info(data)
193
+ yield {"bad request, example skipped...": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({**texts_processed, **req_att_scores})
194
+ continue
195
 
196
  if req_response.ok:
197
  response = req_response.json()
198
  if ATT_SCORE in response:
199
+ texts_processed[column_name].append(text)
200
  for req_att in REQUESTED_ATTRIBUTES:
201
  if req_att in response[ATT_SCORE]:
202
  att_score = response[ATT_SCORE][req_att][SUM_SCORE]["value"]
 
210
  req_response.raise_for_status()
211
  except Exception as e:
212
  logging.info(e)
213
+ logging.info(data)
214
+ yield {"bad request, example skipped": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({**texts_processed, **req_att_scores})
215
+ continue
216
+
217
  if i % 10 == 0:
218
  plot_toxicity(req_att_scores)
219
+ yield {"toxicity check in progress...": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({**texts_processed, **req_att_scores})
220
 
221
  plot_toxicity(req_att_scores)
222
+ yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({**texts_processed, **req_att_scores})
223
 
224
 
225
  with gr.Blocks() as demo:
 
333
  gr.Markdown("## Run nvidia quality classifier")
334
  batch_size = gr.Slider(0, 64, 32, step=4, label="Inference batch size", info="(set this to smaller value if this space crashes.)")
335
  num_examples = gr.Slider(0, 5000, 500, step=10, label="Number of examples", info="Number of random examples to run quality classifier on")
336
+ gr_check_btn = gr.Button("Check Quality")
337
  progress_bar = gr.Label(show_label=False)
338
  plot = gr.BarPlot()
339
 
 
372
  gr.Markdown("""## Explore toxicity
373
  Run [Perspective](https://perspectiveapi.com/how-it-works/) on 100 random samples to check toxicity
374
  """)
375
+ gr_toxicity_btn = gr.Button("Check Toxicity")
376
  toxicity_progress_bar = gr.Label(show_label=False)
377
  toxicity_hist = gr.Plot()
378
  with gr.Accordion("Explore examples with toxicity scores:", open=False):