polinaeterna HF staff committed on
Commit
46c2a69
β€’
1 Parent(s): 73bc7cb
Files changed (1) hide show
  1. app.py +44 -43
app.py CHANGED
@@ -91,6 +91,47 @@ def plot_and_df(texts, preds):
91
  )
92
 
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  PERSPECTIVE_API_KEY = os.environ.get("PERSPECTIVE_API_KEY")
95
  PERSPECTIVE_URL = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={PERSPECTIVE_API_KEY}"
96
  REQUESTED_ATTRIBUTES = {"TOXICITY": {}, "SEVERE_TOXICITY": {},
@@ -120,6 +161,7 @@ def call_perspective_api(texts_df, column_name):#, s):
120
  req_att_scores = {attr: [] for attr in REQUESTED_ATTRIBUTES}
121
 
122
  texts = texts_df[column_name].values
 
123
  for i, text in tqdm(enumerate(texts), desc="scanning with perspective"):
124
  data = {
125
  "comment": {"text": text},
@@ -157,51 +199,10 @@ def call_perspective_api(texts_df, column_name):#, s):
157
  return req_att_scores
158
  if i % 10 == 0:
159
  plot_toxicity(req_att_scores)
160
- yield plt.gcf(), pd.DataFrame()
161
 
162
  plot_toxicity(req_att_scores)
163
- yield plt.gcf(), pd.DataFrame.from_dict({column_name: texts, **req_att_scores})
164
-
165
-
166
- @spaces.GPU
167
- def run_quality_check(dataset, column, batch_size, num_examples):
168
- info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
169
- if "error" in info_resp:
170
- yield "❌ " + info_resp["error"], gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure(), pd.DataFrame(),
171
- return
172
- config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
173
- split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(
174
- iter(info_resp["dataset_info"][config]["splits"]))
175
- try:
176
- data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/{split}/0000.parquet", columns=[column])
177
- except pl.exceptions.ComputeError:
178
- try:
179
- data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/partial-{split}/0000.parquet", columns=[column])
180
- except Exception as error:
181
- yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure(), pd.DataFrame(),
182
- return
183
- texts = data[column].to_list()
184
- texts_sample = data.sample(20, shuffle=True, seed=16).to_pandas()
185
- # batch_size = 100
186
- predictions, texts_processed = [], []
187
- num_examples = min(len(texts), num_examples)
188
- for i in range(0, num_examples, batch_size):
189
- batch_texts = texts[i:i+batch_size]
190
- batch_predictions = predict(batch_texts)
191
- predictions.extend(batch_predictions)
192
- texts_processed.extend(batch_texts)
193
- yield {"check in progress...": (i+batch_size) / num_examples}, *plot_and_df(texts_processed, predictions), plt.Figure(), pd.DataFrame()
194
-
195
- with multiprocessing.Pool(processes=8) as pool:
196
- props = pool.map(proportion_non_ascii, texts)
197
-
198
- # non_ascii_df = pd.DataFrame.from_dict({"prop_non_ascii": props, "text": texts})
199
- plt.hist(props, bins=20, range=(0., 1.))
200
- plt.title('Histogram of proportion of non-ASCII characters')
201
- plt.xlabel('Proportion of non-ASCII characters')
202
- plt.ylabel('Number of texts')
203
-
204
- yield {"finished": 1.}, *plot_and_df(texts_processed, predictions), plt.gcf(), texts_sample
205
 
206
 
207
  with gr.Blocks() as demo:
 
91
  )
92
 
93
 
94
+ @spaces.GPU
95
+ def run_quality_check(dataset, column, batch_size, num_examples):
96
+ info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
97
+ if "error" in info_resp:
98
+ yield "❌ " + info_resp["error"], gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure(), pd.DataFrame(),
99
+ return
100
+ config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
101
+ split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(
102
+ iter(info_resp["dataset_info"][config]["splits"]))
103
+ try:
104
+ data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/{split}/0000.parquet", columns=[column])
105
+ except pl.exceptions.ComputeError:
106
+ try:
107
+ data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/partial-{split}/0000.parquet", columns=[column])
108
+ except Exception as error:
109
+ yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), plt.Figure(), pd.DataFrame(),
110
+ return
111
+ texts = data[column].to_list()
112
+ texts_sample = data.sample(100, shuffle=True, seed=16).to_pandas()
113
+ # batch_size = 100
114
+ predictions, texts_processed = [], []
115
+ num_examples = min(len(texts), num_examples)
116
+ for i in range(0, num_examples, batch_size):
117
+ batch_texts = texts[i:i+batch_size]
118
+ batch_predictions = predict(batch_texts)
119
+ predictions.extend(batch_predictions)
120
+ texts_processed.extend(batch_texts)
121
+ yield {"check in progress...": min(i+batch_size, num_examples) / num_examples}, *plot_and_df(texts_processed, predictions), plt.Figure(), pd.DataFrame()
122
+
123
+ with multiprocessing.Pool(processes=8) as pool:
124
+ props = pool.map(proportion_non_ascii, texts)
125
+
126
+ # non_ascii_df = pd.DataFrame.from_dict({"prop_non_ascii": props, "text": texts})
127
+ plt.hist(props, bins=20, range=(0., 1.))
128
+ plt.title('Histogram of proportion of non-ASCII characters')
129
+ plt.xlabel('Proportion of non-ASCII characters')
130
+ plt.ylabel('Number of texts')
131
+
132
+ yield {"finished": 1.}, *plot_and_df(texts_processed, predictions), plt.gcf(), texts_sample
133
+
134
+
135
  PERSPECTIVE_API_KEY = os.environ.get("PERSPECTIVE_API_KEY")
136
  PERSPECTIVE_URL = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={PERSPECTIVE_API_KEY}"
137
  REQUESTED_ATTRIBUTES = {"TOXICITY": {}, "SEVERE_TOXICITY": {},
 
161
  req_att_scores = {attr: [] for attr in REQUESTED_ATTRIBUTES}
162
 
163
  texts = texts_df[column_name].values
164
+ n_samples = len(texts)
165
  for i, text in tqdm(enumerate(texts), desc="scanning with perspective"):
166
  data = {
167
  "comment": {"text": text},
 
199
  return req_att_scores
200
  if i % 10 == 0:
201
  plot_toxicity(req_att_scores)
202
+ yield {"toxicity check in progress...": i / n_samples}, plt.gcf(), pd.DataFrame()
203
 
204
  plot_toxicity(req_att_scores)
205
+ yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({column_name: texts, **req_att_scores})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
 
208
  with gr.Blocks() as demo: