Spaces:

polinaeterna
/

text_quality_checker

Running on Zero

App Files Files Community

polinaeterna HF staff committed on Sep 9

Commit

46c2a69

•

1 Parent(s): 73bc7cb

fix

Browse files

Files changed (1) hide show

app.py +44 -43

app.py CHANGED Viewed

@@ -91,6 +91,47 @@ def plot_and_df(texts, preds):
         )
 PERSPECTIVE_API_KEY = os.environ.get("PERSPECTIVE_API_KEY")
 PERSPECTIVE_URL = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={PERSPECTIVE_API_KEY}"
 REQUESTED_ATTRIBUTES = {"TOXICITY": {}, "SEVERE_TOXICITY": {},
@@ -120,6 +161,7 @@ def call_perspective_api(texts_df, column_name):#, s):
     req_att_scores = {attr: [] for attr in REQUESTED_ATTRIBUTES}
     texts = texts_df[column_name].values
     for i, text in tqdm(enumerate(texts), desc="scanning with perspective"):
         data = {
             "comment": {"text": text},
@@ -157,51 +199,10 @@ def call_perspective_api(texts_df, column_name):#, s):
                 return req_att_scores
         if i % 10 == 0:
             plot_toxicity(req_att_scores)
-            yield plt.gcf(), pd.DataFrame()
     plot_toxicity(req_att_scores)
-    yield plt.gcf(), pd.DataFrame.from_dict({column_name: texts, **req_att_scores})
-@spaces.GPU
-def run_quality_check(dataset, column, batch_size, num_examples):
-    info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
-    if "error" in info_resp:
-        yield "❌ " + info_resp["error"], gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure(), pd.DataFrame(),
-        return
-    config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
-    split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(
-        iter(info_resp["dataset_info"][config]["splits"]))
-    try:
-        data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/{split}/0000.parquet", columns=[column])
-    except pl.exceptions.ComputeError:
-        try:
-            data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/partial-{split}/0000.parquet", columns=[column])
-        except Exception as error:
-            yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure(), pd.DataFrame(),
-            return
-    texts = data[column].to_list()
-    texts_sample = data.sample(20, shuffle=True, seed=16).to_pandas()
-    # batch_size = 100
-    predictions, texts_processed = [], []
-    num_examples = min(len(texts), num_examples)
-    for i in range(0, num_examples, batch_size):
-        batch_texts = texts[i:i+batch_size]
-        batch_predictions = predict(batch_texts)
-        predictions.extend(batch_predictions)
-        texts_processed.extend(batch_texts)
-        yield {"check in progress...": (i+batch_size) / num_examples}, *plot_and_df(texts_processed, predictions), plt.Figure(), pd.DataFrame()
-    with multiprocessing.Pool(processes=8) as pool:
-        props = pool.map(proportion_non_ascii, texts)
-    # non_ascii_df = pd.DataFrame.from_dict({"prop_non_ascii": props, "text": texts})
-    plt.hist(props, bins=20, range=(0., 1.))
-    plt.title('Histogram of proportion of non-ASCII characters')
-    plt.xlabel('Proportion of non-ASCII characters')
-    plt.ylabel('Number of texts')
-    yield {"finished": 1.}, *plot_and_df(texts_processed, predictions), plt.gcf(), texts_sample
 with gr.Blocks() as demo:

         )
+@spaces.GPU
+def run_quality_check(dataset, column, batch_size, num_examples):
+    info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
+    if "error" in info_resp:
+        yield "❌ " + info_resp["error"], gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure(), pd.DataFrame(),
+        return
+    config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
+    split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(
+        iter(info_resp["dataset_info"][config]["splits"]))
+    try:
+        data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/{split}/0000.parquet", columns=[column])
+    except pl.exceptions.ComputeError:
+        try:
+            data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/partial-{split}/0000.parquet", columns=[column])
+        except Exception as error:
+            yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure(), pd.DataFrame(),
+            return
+    texts = data[column].to_list()
+    texts_sample = data.sample(100, shuffle=True, seed=16).to_pandas()
+    # batch_size = 100
+    predictions, texts_processed = [], []
+    num_examples = min(len(texts), num_examples)
+    for i in range(0, num_examples, batch_size):
+        batch_texts = texts[i:i+batch_size]
+        batch_predictions = predict(batch_texts)
+        predictions.extend(batch_predictions)
+        texts_processed.extend(batch_texts)
+        yield {"check in progress...": min(i+batch_size, num_examples) / num_examples}, *plot_and_df(texts_processed, predictions), plt.Figure(), pd.DataFrame()
+    with multiprocessing.Pool(processes=8) as pool:
+        props = pool.map(proportion_non_ascii, texts)
+    # non_ascii_df = pd.DataFrame.from_dict({"prop_non_ascii": props, "text": texts})
+    plt.hist(props, bins=20, range=(0., 1.))
+    plt.title('Histogram of proportion of non-ASCII characters')
+    plt.xlabel('Proportion of non-ASCII characters')
+    plt.ylabel('Number of texts')
+    yield {"finished": 1.}, *plot_and_df(texts_processed, predictions), plt.gcf(), texts_sample
 PERSPECTIVE_API_KEY = os.environ.get("PERSPECTIVE_API_KEY")
 PERSPECTIVE_URL = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={PERSPECTIVE_API_KEY}"
 REQUESTED_ATTRIBUTES = {"TOXICITY": {}, "SEVERE_TOXICITY": {},
     req_att_scores = {attr: [] for attr in REQUESTED_ATTRIBUTES}
     texts = texts_df[column_name].values
+    n_samples = len(texts)
     for i, text in tqdm(enumerate(texts), desc="scanning with perspective"):
         data = {
             "comment": {"text": text},
                 return req_att_scores
         if i % 10 == 0:
             plot_toxicity(req_att_scores)
+            yield {"toxicity check in progress...": i / n_samples}, plt.gcf(), pd.DataFrame()
     plot_toxicity(req_att_scores)
+    yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({column_name: texts, **req_att_scores})
 with gr.Blocks() as demo: