Spaces:
Running
on
Zero
Running
on
Zero
Commit
•
46c2a69
1
Parent(s):
73bc7cb
fix
Browse files
app.py
CHANGED
@@ -91,6 +91,47 @@ def plot_and_df(texts, preds):
|
|
91 |
)
|
92 |
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
PERSPECTIVE_API_KEY = os.environ.get("PERSPECTIVE_API_KEY")
|
95 |
PERSPECTIVE_URL = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={PERSPECTIVE_API_KEY}"
|
96 |
REQUESTED_ATTRIBUTES = {"TOXICITY": {}, "SEVERE_TOXICITY": {},
|
@@ -120,6 +161,7 @@ def call_perspective_api(texts_df, column_name):#, s):
|
|
120 |
req_att_scores = {attr: [] for attr in REQUESTED_ATTRIBUTES}
|
121 |
|
122 |
texts = texts_df[column_name].values
|
|
|
123 |
for i, text in tqdm(enumerate(texts), desc="scanning with perspective"):
|
124 |
data = {
|
125 |
"comment": {"text": text},
|
@@ -157,51 +199,10 @@ def call_perspective_api(texts_df, column_name):#, s):
|
|
157 |
return req_att_scores
|
158 |
if i % 10 == 0:
|
159 |
plot_toxicity(req_att_scores)
|
160 |
-
yield plt.gcf(), pd.DataFrame()
|
161 |
|
162 |
plot_toxicity(req_att_scores)
|
163 |
-
yield plt.gcf(), pd.DataFrame.from_dict({column_name: texts, **req_att_scores})
|
164 |
-
|
165 |
-
|
166 |
-
@spaces.GPU
|
167 |
-
def run_quality_check(dataset, column, batch_size, num_examples):
|
168 |
-
info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
|
169 |
-
if "error" in info_resp:
|
170 |
-
yield "β " + info_resp["error"], gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure(), pd.DataFrame(),
|
171 |
-
return
|
172 |
-
config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
|
173 |
-
split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(
|
174 |
-
iter(info_resp["dataset_info"][config]["splits"]))
|
175 |
-
try:
|
176 |
-
data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/{split}/0000.parquet", columns=[column])
|
177 |
-
except pl.exceptions.ComputeError:
|
178 |
-
try:
|
179 |
-
data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/partial-{split}/0000.parquet", columns=[column])
|
180 |
-
except Exception as error:
|
181 |
-
yield f"β {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure(), pd.DataFrame(),
|
182 |
-
return
|
183 |
-
texts = data[column].to_list()
|
184 |
-
texts_sample = data.sample(20, shuffle=True, seed=16).to_pandas()
|
185 |
-
# batch_size = 100
|
186 |
-
predictions, texts_processed = [], []
|
187 |
-
num_examples = min(len(texts), num_examples)
|
188 |
-
for i in range(0, num_examples, batch_size):
|
189 |
-
batch_texts = texts[i:i+batch_size]
|
190 |
-
batch_predictions = predict(batch_texts)
|
191 |
-
predictions.extend(batch_predictions)
|
192 |
-
texts_processed.extend(batch_texts)
|
193 |
-
yield {"check in progress...": (i+batch_size) / num_examples}, *plot_and_df(texts_processed, predictions), plt.Figure(), pd.DataFrame()
|
194 |
-
|
195 |
-
with multiprocessing.Pool(processes=8) as pool:
|
196 |
-
props = pool.map(proportion_non_ascii, texts)
|
197 |
-
|
198 |
-
# non_ascii_df = pd.DataFrame.from_dict({"prop_non_ascii": props, "text": texts})
|
199 |
-
plt.hist(props, bins=20, range=(0., 1.))
|
200 |
-
plt.title('Histogram of proportion of non-ASCII characters')
|
201 |
-
plt.xlabel('Proportion of non-ASCII characters')
|
202 |
-
plt.ylabel('Number of texts')
|
203 |
-
|
204 |
-
yield {"finished": 1.}, *plot_and_df(texts_processed, predictions), plt.gcf(), texts_sample
|
205 |
|
206 |
|
207 |
with gr.Blocks() as demo:
|
|
|
91 |
)
|
92 |
|
93 |
|
94 |
+
@spaces.GPU
|
95 |
+
def run_quality_check(dataset, column, batch_size, num_examples):
|
96 |
+
info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
|
97 |
+
if "error" in info_resp:
|
98 |
+
yield "β " + info_resp["error"], gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure(), pd.DataFrame(),
|
99 |
+
return
|
100 |
+
config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
|
101 |
+
split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(
|
102 |
+
iter(info_resp["dataset_info"][config]["splits"]))
|
103 |
+
try:
|
104 |
+
data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/{split}/0000.parquet", columns=[column])
|
105 |
+
except pl.exceptions.ComputeError:
|
106 |
+
try:
|
107 |
+
data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/partial-{split}/0000.parquet", columns=[column])
|
108 |
+
except Exception as error:
|
109 |
+
yield f"β {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure(), pd.DataFrame(),
|
110 |
+
return
|
111 |
+
texts = data[column].to_list()
|
112 |
+
texts_sample = data.sample(100, shuffle=True, seed=16).to_pandas()
|
113 |
+
# batch_size = 100
|
114 |
+
predictions, texts_processed = [], []
|
115 |
+
num_examples = min(len(texts), num_examples)
|
116 |
+
for i in range(0, num_examples, batch_size):
|
117 |
+
batch_texts = texts[i:i+batch_size]
|
118 |
+
batch_predictions = predict(batch_texts)
|
119 |
+
predictions.extend(batch_predictions)
|
120 |
+
texts_processed.extend(batch_texts)
|
121 |
+
yield {"check in progress...": min(i+batch_size, num_examples) / num_examples}, *plot_and_df(texts_processed, predictions), plt.Figure(), pd.DataFrame()
|
122 |
+
|
123 |
+
with multiprocessing.Pool(processes=8) as pool:
|
124 |
+
props = pool.map(proportion_non_ascii, texts)
|
125 |
+
|
126 |
+
# non_ascii_df = pd.DataFrame.from_dict({"prop_non_ascii": props, "text": texts})
|
127 |
+
plt.hist(props, bins=20, range=(0., 1.))
|
128 |
+
plt.title('Histogram of proportion of non-ASCII characters')
|
129 |
+
plt.xlabel('Proportion of non-ASCII characters')
|
130 |
+
plt.ylabel('Number of texts')
|
131 |
+
|
132 |
+
yield {"finished": 1.}, *plot_and_df(texts_processed, predictions), plt.gcf(), texts_sample
|
133 |
+
|
134 |
+
|
135 |
PERSPECTIVE_API_KEY = os.environ.get("PERSPECTIVE_API_KEY")
|
136 |
PERSPECTIVE_URL = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={PERSPECTIVE_API_KEY}"
|
137 |
REQUESTED_ATTRIBUTES = {"TOXICITY": {}, "SEVERE_TOXICITY": {},
|
|
|
161 |
req_att_scores = {attr: [] for attr in REQUESTED_ATTRIBUTES}
|
162 |
|
163 |
texts = texts_df[column_name].values
|
164 |
+
n_samples = len(texts)
|
165 |
for i, text in tqdm(enumerate(texts), desc="scanning with perspective"):
|
166 |
data = {
|
167 |
"comment": {"text": text},
|
|
|
199 |
return req_att_scores
|
200 |
if i % 10 == 0:
|
201 |
plot_toxicity(req_att_scores)
|
202 |
+
yield {"toxicity check in progress...": i / n_samples}, plt.gcf(), pd.DataFrame()
|
203 |
|
204 |
plot_toxicity(req_att_scores)
|
205 |
+
yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({column_name: texts, **req_att_scores})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
|
207 |
|
208 |
with gr.Blocks() as demo:
|