Commit ac73d94
Parent(s): 9962eae
fix toxicity for bad requests
app.py
CHANGED
@@ -1,7 +1,6 @@
 import requests
 from collections import Counter
 
-from fontTools.subset import subset
 from requests.adapters import HTTPAdapter, Retry
 import os
 import time
@@ -101,6 +100,7 @@ def run_quality_check(dataset, config, split, column, nested_column, batch_size,
 
     try:
         logging.info(f"Loading hf://datasets/{dataset}@~parquet/{filename}")
+        yield f"loading data...", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
         data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column])
     except Exception as error:
         yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
@@ -151,7 +151,8 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
     headers = {
         "content-type": "application/json",
     }
-    req_att_scores = {attr: [] for attr in REQUESTED_ATTRIBUTES}
+    req_att_scores = {**{attr: [] for attr in REQUESTED_ATTRIBUTES}}
+    texts_processed = {column_name: []}
 
     # fetch data if it doesn't exist yet
     if texts_df.values.tolist() == [['', '', '']]:
@@ -164,6 +165,7 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
 
     try:
         logging.info(f"Loading hf://datasets/{dataset}@~parquet/{filename}")
+        yield f"loading data...", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
         texts_df = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column_name])
     except Exception as error:
         yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
@@ -188,11 +190,13 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
         except Exception as e:
             logging.info(e)
             logging.info(data)
-
+            yield {"bad request, example skipped...": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({**texts_processed, **req_att_scores})
+            continue
 
         if req_response.ok:
             response = req_response.json()
             if ATT_SCORE in response:
+                texts_processed[column_name].append(text)
                 for req_att in REQUESTED_ATTRIBUTES:
                     if req_att in response[ATT_SCORE]:
                         att_score = response[ATT_SCORE][req_att][SUM_SCORE]["value"]
@@ -206,13 +210,16 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
             req_response.raise_for_status()
         except Exception as e:
             logging.info(e)
-
+            logging.info(data)
+            yield {"bad request, example skipped": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({**texts_processed, **req_att_scores})
+            continue
+
         if i % 10 == 0:
             plot_toxicity(req_att_scores)
-            yield {"toxicity check in progress...": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({
+            yield {"toxicity check in progress...": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({**texts_processed, **req_att_scores})
 
     plot_toxicity(req_att_scores)
-    yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({
+    yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({**texts_processed, **req_att_scores})
 
 
 with gr.Blocks() as demo:
@@ -326,7 +333,7 @@ with gr.Blocks() as demo:
     gr.Markdown("## Run nvidia quality classifier")
     batch_size = gr.Slider(0, 64, 32, step=4, label="Inference batch size", info="(set this to smaller value if this space crashes.)")
     num_examples = gr.Slider(0, 5000, 500, step=10, label="Number of examples", info="Number of random examples to run quality classifier on")
-    gr_check_btn = gr.Button("Check
+    gr_check_btn = gr.Button("Check Quality")
     progress_bar = gr.Label(show_label=False)
     plot = gr.BarPlot()
 
@@ -365,7 +372,7 @@ with gr.Blocks() as demo:
     gr.Markdown("""## Explore toxicity
     Run [Perspective](https://perspectiveapi.com/how-it-works/) on 100 random samples to check toxicity
     """)
-    gr_toxicity_btn = gr.Button("
+    gr_toxicity_btn = gr.Button("Check Toxicity")
     toxicity_progress_bar = gr.Label(show_label=False)
     toxicity_hist = gr.Plot()
     with gr.Accordion("Explore examples with toxicity scores:", open=False):
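
For context on the parquet-loading hunks above: the app reads the dataset's auto-converted Parquet export from the Hugging Face Hub with Polars, and the added yield simply pushes a "loading data..." status to the Gradio UI before the (potentially slow) read starts. The following is a minimal, hypothetical sketch of that pattern, not the Space's actual code; the dataset name, filename, and function name in the usage comment are placeholders.

import polars as pl

def load_column(dataset, filename, column):
    # Surface a status message before the remote read starts.
    yield "loading data..."
    try:
        # "@~parquet" points at the Hub's auto-converted Parquet export of the dataset.
        data = pl.read_parquet(
            f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column]
        )
    except Exception as error:
        yield f"❌ {error}"
        return
    yield data

# Example usage (dataset and filename are placeholders):
# for update in load_column("user/dataset", "default/train/0000.parquet", "text"):
#     print(update)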
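The toxicity fix itself comes down to not letting one failed Perspective request abort the whole loop: log the error, yield a "bad request, example skipped" progress update, and continue, while appending only texts that actually got scored (texts_processed) so the results table stays aligned with the scores. Below is a minimal sketch of that control flow under stated assumptions: score_text is a hypothetical stand-in for the actual API call, and the yielded tuples stand in for the Gradio progress/plot/table outputs.

import logging

def score_text(text):
    # Hypothetical stand-in for a Perspective API request; raises on a bad request.
    if not text.strip():
        raise ValueError("empty text is rejected as a bad request")
    return {"TOXICITY": (0.01 * len(text)) % 1.0}

def check_toxicity(samples):
    scores = {"TOXICITY": []}
    texts_processed = []
    n_samples = len(samples)
    for i, text in enumerate(samples):
        try:
            response = score_text(text)
        except Exception as e:
            logging.info(e)
            # Bad request: skip this example but keep the progress stream alive.
            yield {"bad request, example skipped": i / n_samples}, texts_processed, scores
            continue
        texts_processed.append(text)
        scores["TOXICITY"].append(response["TOXICITY"])
        if i % 10 == 0:
            yield {"toxicity check in progress...": i / n_samples}, texts_processed, scores
    yield {"toxicity check finished.": 1.0}, texts_processed, scores

# Example usage:
# for update in check_toxicity(["fine text", "", "more text"]):
#     print(update)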