polinaeterna committed
Commit 3bb5a93 • 1 Parent(s): 3dcef48

add toxicity check

Files changed (1)
  1. app.py +87 -8
app.py CHANGED
@@ -6,7 +6,6 @@ import multiprocessing
 import gradio as gr
 import pandas as pd
 import polars as pl
-import numpy as np
 import matplotlib.pyplot as plt
 import spaces
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
@@ -90,12 +89,83 @@ def plot_and_df(texts, preds):
     )


-@spaces.GPU
+PERSPECTIVE_API_KEY = os.environ.get("PERSPECTIVE_API_KEY")
+PERSPECTIVE_URL = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={PERSPECTIVE_API_KEY}"
+REQUESTED_ATTRIBUTES = {"TOXICITY": {}, "SEVERE_TOXICITY": {},
+                        "IDENTITY_ATTACK": {}, "INSULT": {}, "PROFANITY": {},
+                        "THREAT": {}}
+ATT_SCORE = "attributeScores"
+SUM_SCORE = "summaryScore"
+
+
+def plot_toxicity(scores):
+    fig, axs = plt.subplots(2, 3)  # , figsize=(10, 6))
+    for x, y, score_name in zip([0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2], scores):
+        axs[x, y].hist(scores[score_name], bins=20, range=(0., 1.))
+        # axs[x, y].set_title(f'Histogram of {score_name}')
+        axs[x, y].set_xlabel(f'{score_name}')
+        # axs[x, y].set_ylabel('Number of texts')
+    fig.supylabel("Number of texts")
+    fig.suptitle("Histogram of toxicity scores")
+    fig.tight_layout()
+
+    return fig
+
+def call_perspective_api(texts_df, column_name):  # , s):
+    headers = {
+        "content-type": "application/json",
+    }
+    req_att_scores = {attr: [] for attr in REQUESTED_ATTRIBUTES}
+
+    texts = texts_df[column_name].values
+    for i, text in tqdm(enumerate(texts), desc="scanning with perspective"):
+        data = {
+            "comment": {"text": text},
+            "languages": ["en"],
+            "requestedAttributes": REQUESTED_ATTRIBUTES
+        }
+        time.sleep(1)
+        try:
+            req_response = requests.post(PERSPECTIVE_URL, json=data, headers=headers)
+        except Exception as e:
+            print(e)
+            return req_att_scores
+
+        if req_response.ok:
+            response = req_response.json()
+            # logger.info("Perspective API response is:")
+            # logger.info(response)
+            if ATT_SCORE in response:
+                for req_att in REQUESTED_ATTRIBUTES:
+                    if req_att in response[ATT_SCORE]:
+                        att_score = response[ATT_SCORE][req_att][SUM_SCORE]["value"]
+                        req_att_scores[req_att].append(att_score)
+                    else:
+                        req_att_scores[req_att].append(0)
+            else:
+                # logger.error(
+                #     "Unexpected response format from Perspective API."
+                # )
+                raise ValueError(req_response)
+        else:
+            try:
+                req_response.raise_for_status()
+            except Exception as e:
+                print(e)
+                return req_att_scores
+        if i % 10 == 0:
+            plot_toxicity(req_att_scores)
+            yield plt.gcf(), pd.DataFrame()
+
+    plot_toxicity(req_att_scores)
+    yield plt.gcf(), pd.DataFrame.from_dict({column_name: texts, **req_att_scores})
+
+
+# @spaces.GPU
 def run_quality_check(dataset, column, batch_size, num_examples):
-    # config = "default"
     info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
     if "error" in info_resp:
-        yield "❌ " + info_resp["error"], gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure()
+        yield "❌ " + info_resp["error"], gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure(), pd.DataFrame(),
         return
     config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
     split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(
@@ -106,9 +176,10 @@ def run_quality_check(dataset, column, batch_size, num_examples):
     try:
         data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/partial-{split}/0000.parquet", columns=[column])
     except Exception as error:
-        yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure()
+        yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure(), pd.DataFrame(),
         return
     texts = data[column].to_list()
+    texts_sample = data.sample(20, shuffle=True, seed=16).to_pandas()
     # batch_size = 100
     predictions, texts_processed = [], []
     num_examples = min(len(texts), num_examples)
@@ -117,7 +188,7 @@ def run_quality_check(dataset, column, batch_size, num_examples):
         batch_predictions = predict(batch_texts)
         predictions.extend(batch_predictions)
         texts_processed.extend(batch_texts)
-        yield {"check in progress...": (i+batch_size) / num_examples}, *plot_and_df(texts_processed, predictions), plt.Figure()
+        yield {"check in progress...": (i+batch_size) / num_examples}, *plot_and_df(texts_processed, predictions), plt.Figure(), pd.DataFrame()

     with multiprocessing.Pool(processes=8) as pool:
         props = pool.map(proportion_non_ascii, texts)
@@ -128,7 +199,8 @@ def run_quality_check(dataset, column, batch_size, num_examples):
     plt.xlabel('Proportion of non-ASCII characters')
     plt.ylabel('Number of texts')

-    yield {"finished": 1.}, *plot_and_df(texts_processed, predictions), plt.gcf()
+    yield {"finished": 1.}, *plot_and_df(texts_processed, predictions), plt.gcf(), texts_sample
+

 with gr.Blocks() as demo:
     gr.Markdown(
@@ -175,6 +247,13 @@ with gr.Blocks() as demo:

     # non_ascii_hist = gr.DataFrame(visible=False)
     non_ascii_hist = gr.Plot()
-    gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, batch_size, num_examples], outputs=[progress_bar, plot, df_low, df_medium, df_high, non_ascii_hist])
+    texts_sample_df = gr.DataFrame(visible=False)
+    gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, batch_size, num_examples], outputs=[progress_bar, plot, df_low, df_medium, df_high, non_ascii_hist, texts_sample_df])
+
+    gr_toxicity_btn = gr.Button("Run Perspective API to check toxicity of random samples.")
+    toxicity_hist = gr.Plot()
+    with gr.Accordion("Explore examples with toxicity scores:", open=False):
+        toxicity_df = gr.DataFrame()
+    gr_toxicity_btn.click(call_perspective_api, inputs=[texts_sample_df, text_column], outputs=[toxicity_hist, toxicity_df])

 demo.launch()
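
Note: the new call_perspective_api function builds on the Perspective API request/response shape (comment.text plus requestedAttributes in the request, attributeScores -> ATTRIBUTE -> summaryScore -> value in the response). Below is a minimal, self-contained sketch of that exchange, separate from the app's code path; the sample comment text and the timeout value are illustrative, and PERSPECTIVE_API_KEY is assumed to be set in the environment as in the diff above.

import os
import requests

API_KEY = os.environ.get("PERSPECTIVE_API_KEY")
URL = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={API_KEY}"

# Request body: the text to score plus the attributes to evaluate.
payload = {
    "comment": {"text": "You are a wonderful person."},
    "languages": ["en"],
    "requestedAttributes": {"TOXICITY": {}, "INSULT": {}},
}

response = requests.post(URL, json=payload, timeout=10)
response.raise_for_status()

# Response parsing mirrors the diff: attributeScores -> <ATTR> -> summaryScore -> value.
scores = {
    attr: body["summaryScore"]["value"]
    for attr, body in response.json()["attributeScores"].items()
}
print(scores)  # e.g. {"TOXICITY": 0.02, "INSULT": 0.01}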