Commit ac73d94
Parent(s): 9962eae
fix toxicity for bad requests
app.py
CHANGED
@@ -1,7 +1,6 @@
 import requests
 from collections import Counter
 
-from fontTools.subset import subset
 from requests.adapters import HTTPAdapter, Retry
 import os
 import time
@@ -101,6 +100,7 @@ def run_quality_check(dataset, config, split, column, nested_column, batch_size,
 
     try:
         logging.info(f"Loading hf://datasets/{dataset}@~parquet/{filename}")
+        yield f"loading data...", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
         data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column])
     except Exception as error:
         yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
@@ -151,7 +151,8 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
     headers = {
         "content-type": "application/json",
     }
-    req_att_scores = {attr: [] for attr in REQUESTED_ATTRIBUTES}
+    req_att_scores = {**{attr: [] for attr in REQUESTED_ATTRIBUTES}}
+    texts_processed = {column_name: []}
 
     # fetch data if it doesn't exist yet
     if texts_df.values.tolist() == [['', '', '']]:
@@ -164,6 +165,7 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
 
     try:
         logging.info(f"Loading hf://datasets/{dataset}@~parquet/{filename}")
+        yield f"loading data...", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
         texts_df = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column_name])
     except Exception as error:
         yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
@@ -188,11 +190,13 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
         except Exception as e:
             logging.info(e)
             logging.info(data)
-
+            yield {"bad request, example skipped...": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({**texts_processed, **req_att_scores})
+            continue
 
         if req_response.ok:
             response = req_response.json()
             if ATT_SCORE in response:
+                texts_processed[column_name].append(text)
                 for req_att in REQUESTED_ATTRIBUTES:
                     if req_att in response[ATT_SCORE]:
                         att_score = response[ATT_SCORE][req_att][SUM_SCORE]["value"]
@@ -206,13 +210,16 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
             req_response.raise_for_status()
         except Exception as e:
             logging.info(e)
-
+            logging.info(data)
+            yield {"bad request, example skipped": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({**texts_processed, **req_att_scores})
+            continue
+
         if i % 10 == 0:
             plot_toxicity(req_att_scores)
-            yield {"toxicity check in progress...": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({
+            yield {"toxicity check in progress...": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({**texts_processed, **req_att_scores})
 
     plot_toxicity(req_att_scores)
-    yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({
+    yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({**texts_processed, **req_att_scores})
 
 
 with gr.Blocks() as demo:
@@ -326,7 +333,7 @@ with gr.Blocks() as demo:
     gr.Markdown("## Run nvidia quality classifier")
     batch_size = gr.Slider(0, 64, 32, step=4, label="Inference batch size", info="(set this to smaller value if this space crashes.)")
     num_examples = gr.Slider(0, 5000, 500, step=10, label="Number of examples", info="Number of random examples to run quality classifier on")
-    gr_check_btn = gr.Button("Check
+    gr_check_btn = gr.Button("Check Quality")
     progress_bar = gr.Label(show_label=False)
     plot = gr.BarPlot()
 
@@ -365,7 +372,7 @@ with gr.Blocks() as demo:
     gr.Markdown("""## Explore toxicity
     Run [Perspective](https://perspectiveapi.com/how-it-works/) on 100 random samples to check toxicity
     """)
-    gr_toxicity_btn = gr.Button("
+    gr_toxicity_btn = gr.Button("Check Toxicity")
     toxicity_progress_bar = gr.Label(show_label=False)
     toxicity_hist = gr.Plot()
     with gr.Accordion("Explore examples with toxicity scores:", open=False):
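
For context on the parquet-loading hunks above: the app reads the dataset's auto-converted Parquet export from the Hugging Face Hub with Polars, and the added yield simply pushes a "loading data..." status to the Gradio UI before the (potentially slow) read starts. The following is a minimal, hypothetical sketch of that pattern, not the Space's actual code; the dataset name, filename, and function name in the usage comment are placeholders.

import polars as pl

def load_column(dataset, filename, column):
    # Surface a status message before the remote read starts.
    yield "loading data..."
    try:
        # "@~parquet" points at the Hub's auto-converted Parquet export of the dataset.
        data = pl.read_parquet(
            f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column]
        )
    except Exception as error:
        yield f"❌ {error}"
        return
    yield data

# Example usage (dataset and filename are placeholders):
# for update in load_column("user/dataset", "default/train/0000.parquet", "text"):
#     print(update)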
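The toxicity fix itself comes down to not letting one failed Perspective request abort the whole loop: log the error, yield a "bad request, example skipped" progress update, and continue, while appending only texts that actually got scored (texts_processed) so the results table stays aligned with the scores. Below is a minimal sketch of that control flow under stated assumptions: score_text is a hypothetical stand-in for the actual API call, and the yielded tuples stand in for the Gradio progress/plot/table outputs.

import logging

def score_text(text):
    # Hypothetical stand-in for a Perspective API request; raises on a bad request.
    if not text.strip():
        raise ValueError("empty text is rejected as a bad request")
    return {"TOXICITY": (0.01 * len(text)) % 1.0}

def check_toxicity(samples):
    scores = {"TOXICITY": []}
    texts_processed = []
    n_samples = len(samples)
    for i, text in enumerate(samples):
        try:
            response = score_text(text)
        except Exception as e:
            logging.info(e)
            # Bad request: skip this example but keep the progress stream alive.
            yield {"bad request, example skipped": i / n_samples}, texts_processed, scores
            continue
        texts_processed.append(text)
        scores["TOXICITY"].append(response["TOXICITY"])
        if i % 10 == 0:
            yield {"toxicity check in progress...": i / n_samples}, texts_processed, scores
    yield {"toxicity check finished.": 1.0}, texts_processed, scores

# Example usage:
# for update in check_toxicity(["fine text", "", "more text"]):
#     print(update)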