polinaeterna HF staff committed on
Commit
fd7a758
•
1 Parent(s): 284cae9

add nested texts

Browse files
Files changed (1) hide show
  1. app.py +72 -33
app.py CHANGED
@@ -107,7 +107,9 @@ def run_quality_check(dataset, config, split, column, batch_size, num_examples):
107
  logging.info("Data fetched.")
108
 
109
  data_sample = data.sample(num_examples, seed=16) if data.shape[0] > num_examples else data
110
- texts = [text[:10000] for text in data_sample[column].to_list()]
 
 
111
  predictions, texts_processed = [], []
112
  num_examples = min(len(texts), num_examples)
113
  for i in range(0, num_examples, batch_size):
@@ -144,7 +146,7 @@ def plot_toxicity(scores):
144
 
145
  return fig
146
 
147
- def call_perspective_api(texts_df, column_name, dataset, config, split):#, full_check=False):
148
  headers = {
149
  "content-type": "application/json",
150
  }
@@ -154,21 +156,23 @@ def call_perspective_api(texts_df, column_name, dataset, config, split):#, full_
154
  if texts_df.values.tolist() == [['', '', '']]:
155
  logging.info(f"Fetching data for {dataset=} {config=} {split=} {column_name=}")
156
  try:
157
- texts_df = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/{split}/0000.parquet", columns=[column_name])
158
- except pl.exceptions.ComputeError:
159
- try:
160
- texts_df = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/partial-{split}/0000.parquet", columns=[column_name])
161
- except pl.exceptions.ComputeError:
162
- try:
163
- texts_df = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/{split}-part0/0000.parquet", columns=[column_name])
164
- except Exception as error:
165
- yield f"❌ {error}", plt.gcf(), pd.DataFrame(),
166
- return
 
167
  logging.info("Data fetched.")
168
  texts_df = texts_df.to_pandas()
169
 
170
- # texts = texts_df.sample(100, seed=16)[column_name].values if not full_check else texts_df[column_name].values
171
  texts = texts_df.sample(100, random_state=16)[column_name].values if texts_df.shape[0] > 100 else texts_df[column_name].values
 
 
172
 
173
  n_samples = len(texts)
174
  for i, text in tqdm(enumerate(texts), desc="scanning with perspective"):
@@ -245,45 +249,80 @@ with gr.Blocks() as demo:
245
  """
246
  return gr.HTML(value=html_code)
247
 
248
- text_column_dropdown = gr.Dropdown(label="Text column name", info="Text colum name to check. ")
 
 
249
 
250
- def _resolve_dataset_selection(dataset: str, default_subset: str, default_split: str):
251
  if "/" not in dataset.strip().strip("/"):
252
  return {
253
  subset_dropdown: gr.Dropdown(visible=False),
254
  split_dropdown: gr.Dropdown(visible=False),
255
- text_column_dropdown: gr.Dropdown(info="Text colum name to check (only non-nested texts are supported)"),
 
256
  }
257
  info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
258
  if "error" in info_resp:
259
  return {
260
  subset_dropdown: gr.Dropdown(visible=False),
261
  split_dropdown: gr.Dropdown(visible=False),
262
- text_column_dropdown: gr.Dropdown(label="Text column name", info="Text colum name to check (only non-nested texts are supported)")
 
263
  }
264
  subsets: list[str] = list(info_resp["dataset_info"])
265
  subset = default_subset if default_subset in subsets else subsets[0]
266
  splits: list[str] = info_resp["dataset_info"][subset]["splits"]
267
  split = default_split if default_split in splits else splits[0]
268
  features = info_resp["dataset_info"][subset]["features"]
269
- text_features = [feature_name for feature_name, feature in features.items() if isinstance(feature, dict) and feature.get("dtype") == "string"] # and feature.get("_type") == "Value"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  return {
271
  subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
272
  split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
273
- text_column_dropdown: gr.Dropdown(choices=text_features, label="Text column name", info="Text colum name to check (only non-nested texts are supported)"),
 
274
  }
275
 
276
- @dataset_name.change(inputs=[dataset_name], outputs=[subset_dropdown, split_dropdown, text_column_dropdown])
277
  def show_input_from_subset_dropdown(dataset: str) -> dict:
278
- return _resolve_dataset_selection(dataset, default_subset="default", default_split="train")
279
 
280
- @subset_dropdown.change(inputs=[dataset_name, subset_dropdown], outputs=[subset_dropdown, split_dropdown, text_column_dropdown])
281
  def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
282
- return _resolve_dataset_selection(dataset, default_subset=subset, default_split="train")
283
 
284
- @split_dropdown.change(inputs=[dataset_name, subset_dropdown, split_dropdown], outputs=[subset_dropdown, split_dropdown, text_column_dropdown])
285
  def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
286
- return _resolve_dataset_selection(dataset, default_subset=subset, default_split=split)
 
 
 
 
287
 
288
  gr.Markdown("## Run nvidia quality classifier")
289
  batch_size = gr.Slider(0, 64, 32, step=4, label="Inference batch size", info="(set this to smaller value if this space crashes.)")
@@ -305,13 +344,13 @@ with gr.Blocks() as demo:
305
 
306
  gr.Examples(
307
  [
308
- ["HuggingFaceFW/fineweb-edu", "default", "train", "text", 16, 500],
309
- ["fka/awesome-chatgpt-prompts", "default", "train", "prompt", 64, 200],
310
- ["proj-persona/PersonaHub", "instruction", "train", "synthesized text", 32, 1000],
311
- ["argilla/FinePersonas-v0.1", "default", "train", "persona", 64, 1000],
312
- ["Open-Orca/OpenOrca", "default", "train", "response", 16, 500],
313
  ],
314
- [dataset_name, subset_dropdown, split_dropdown, text_column_dropdown, batch_size, num_examples],
315
  [progress_bar, plot, df_low, df_medium, df_high, texts_df],
316
  fn=run_quality_check,
317
  run_on_click=False,
@@ -320,7 +359,7 @@ with gr.Blocks() as demo:
320
 
321
  gr_check_btn.click(
322
  run_quality_check,
323
- inputs=[dataset_name, subset_dropdown, split_dropdown, text_column_dropdown, batch_size, num_examples],
324
  outputs=[progress_bar, plot, df_low, df_medium, df_high, texts_df]
325
  )
326
 
@@ -335,7 +374,7 @@ with gr.Blocks() as demo:
335
  toxicity_df = gr.DataFrame()
336
  gr_toxicity_btn.click(
337
  call_perspective_api,
338
- inputs=[texts_df, text_column_dropdown, dataset_name, subset_dropdown, split_dropdown],#, checkbox],
339
  outputs=[toxicity_progress_bar, toxicity_hist, toxicity_df]
340
  )
341
 
 
107
  logging.info("Data fetched.")
108
 
109
  data_sample = data.sample(num_examples, seed=16) if data.shape[0] > num_examples else data
110
+ texts = data_sample[column].to_list()
111
+ if nested_column:
112
+ texts = [text[nested_column] for text in texts]
113
  predictions, texts_processed = [], []
114
  num_examples = min(len(texts), num_examples)
115
  for i in range(0, num_examples, batch_size):
 
146
 
147
  return fig
148
 
149
+ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, config, split):#, full_check=False):
150
  headers = {
151
  "content-type": "application/json",
152
  }
 
156
  if texts_df.values.tolist() == [['', '', '']]:
157
  logging.info(f"Fetching data for {dataset=} {config=} {split=} {column_name=}")
158
  try:
159
+ filename = get_first_parquet_filename(dataset, config, split)
160
+ except Exception as error:
161
+ yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
162
+ return
163
+
164
+ try:
165
+ logging.info(f"Loading hf://datasets/{dataset}@~parquet/{filename}")
166
+ texts_df = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{filename}", columns=[column_name])
167
+ except Exception as error:
168
+ yield f"❌ {error}", gr.BarPlot(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
169
+ return
170
  logging.info("Data fetched.")
171
  texts_df = texts_df.to_pandas()
172
 
 
173
  texts = texts_df.sample(100, random_state=16)[column_name].values if texts_df.shape[0] > 100 else texts_df[column_name].values
174
+ if nested_column_name:
175
+ texts = [text[nested_column_name] for text in texts]
176
 
177
  n_samples = len(texts)
178
  for i, text in tqdm(enumerate(texts), desc="scanning with perspective"):
 
249
  """
250
  return gr.HTML(value=html_code)
251
 
252
+ with gr.Row():
253
+ text_column_dropdown = gr.Dropdown(label="Text column name", info="Text colum name to check. ")
254
+ nested_text_column_dropdown = gr.Dropdown(label="Nested text key")#, visible=False)
255
 
256
+ def _resolve_dataset_selection(dataset: str, default_subset: str, default_split: str, text_feature):
257
  if "/" not in dataset.strip().strip("/"):
258
  return {
259
  subset_dropdown: gr.Dropdown(visible=False),
260
  split_dropdown: gr.Dropdown(visible=False),
261
+ text_column_dropdown: gr.Dropdown(info="Text colum name to check"),
262
+ nested_text_column_dropdown: gr.Dropdown(visible=False)
263
  }
264
  info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
265
  if "error" in info_resp:
266
  return {
267
  subset_dropdown: gr.Dropdown(visible=False),
268
  split_dropdown: gr.Dropdown(visible=False),
269
+ text_column_dropdown: gr.Dropdown(label="Text column name", info="Text colum name to check"),
270
+ nested_text_column_dropdown: gr.Dropdown(visible=False)
271
  }
272
  subsets: list[str] = list(info_resp["dataset_info"])
273
  subset = default_subset if default_subset in subsets else subsets[0]
274
  splits: list[str] = info_resp["dataset_info"][subset]["splits"]
275
  split = default_split if default_split in splits else splits[0]
276
  features = info_resp["dataset_info"][subset]["features"]
277
+
278
+ def _is_string_feature(feature):
279
+ return isinstance(feature, dict) and feature.get("dtype") == "string"
280
+
281
+ text_features = [feature_name for feature_name, feature in features.items() if _is_string_feature(feature)]
282
+ nested_features = [feature_name for feature_name, feature in features.items() if isinstance(feature, dict) and isinstance(next(iter(feature.values())), dict)]
283
+ nested_text_features = [feature_name for feature_name in nested_features if any(_is_string_feature(nested_feature) for nested_feature in features[feature_name].values())]
284
+ if not text_feature:
285
+ return {
286
+ subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
287
+ split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
288
+ text_column_dropdown: gr.Dropdown(choices=text_features + nested_text_features, label="Text column name",
289
+ info="Text colum name to check"),
290
+ nested_text_column_dropdown: gr.Dropdown(visible=False),
291
+ }
292
+ logging.info(nested_text_features)
293
+ if text_feature in nested_text_features:
294
+ nested_keys = [feature_name for feature_name, feature in features[text_feature].items() if _is_string_feature(feature)]
295
+ return {
296
+ subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
297
+ split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
298
+ text_column_dropdown: gr.Dropdown(choices=text_features + nested_text_features,
299
+ label="Text column name",
300
+ info="Text colum name to check (only non-nested texts are supported)"),
301
+ nested_text_column_dropdown: gr.Dropdown(value=nested_keys[0], choices=nested_keys,
302
+ label="Nested text column name", visible=True)
303
+ }
304
  return {
305
  subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
306
  split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
307
+ text_column_dropdown: gr.Dropdown(choices=text_features + nested_text_features, label="Text column name", info="Text colum name to check (only non-nested texts are supported)"),
308
+ nested_text_column_dropdown: gr.Dropdown(visible=False),
309
  }
310
 
311
+ @dataset_name.change(inputs=[dataset_name], outputs=[subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown])
312
  def show_input_from_subset_dropdown(dataset: str) -> dict:
313
+ return _resolve_dataset_selection(dataset, default_subset="default", default_split="train", text_feature=None)
314
 
315
+ @subset_dropdown.change(inputs=[dataset_name, subset_dropdown], outputs=[subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown])
316
  def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
317
+ return _resolve_dataset_selection(dataset, default_subset=subset, default_split="train", text_feature=None)
318
 
319
+ @split_dropdown.change(inputs=[dataset_name, subset_dropdown, split_dropdown], outputs=[subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown])
320
  def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
321
+ return _resolve_dataset_selection(dataset, default_subset=subset, default_split=split, text_feature=None)
322
+
323
+ @text_column_dropdown.change(inputs=[dataset_name, subset_dropdown, split_dropdown, text_column_dropdown], outputs=[subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown])
324
+ def show_input_from_text_column_dropdown(dataset: str, subset: str, split: str, text_column) -> dict:
325
+ return _resolve_dataset_selection(dataset, default_subset=subset, default_split=split, text_feature=text_column)
326
 
327
  gr.Markdown("## Run nvidia quality classifier")
328
  batch_size = gr.Slider(0, 64, 32, step=4, label="Inference batch size", info="(set this to smaller value if this space crashes.)")
 
344
 
345
  gr.Examples(
346
  [
347
+ ["HuggingFaceFW/fineweb-edu", "default", "train", "text", None, 16, 500],
348
+ # ["fka/awesome-chatgpt-prompts", "default", "train", "prompt", 64, 200],
349
+ # ["proj-persona/PersonaHub", "instruction", "train", "synthesized text", 32, 1000],
350
+ ["argilla/FinePersonas-v0.1", "default", "train", "persona", None, 64, 1000],
351
+ ["allenai/real-toxicity-prompts", "default", "train", "continuation", "text", 64, 1000],
352
  ],
353
+ [dataset_name, subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown, batch_size, num_examples],
354
  [progress_bar, plot, df_low, df_medium, df_high, texts_df],
355
  fn=run_quality_check,
356
  run_on_click=False,
 
359
 
360
  gr_check_btn.click(
361
  run_quality_check,
362
+ inputs=[dataset_name, subset_dropdown, split_dropdown, text_column_dropdown, nested_text_column_dropdown, batch_size, num_examples],
363
  outputs=[progress_bar, plot, df_low, df_medium, df_high, texts_df]
364
  )
365
 
 
374
  toxicity_df = gr.DataFrame()
375
  gr_toxicity_btn.click(
376
  call_perspective_api,
377
+ inputs=[texts_df, text_column_dropdown, nested_text_column_dropdown, dataset_name, subset_dropdown, split_dropdown],#, checkbox],
378
  outputs=[toxicity_progress_bar, toxicity_hist, toxicity_df]
379
  )
380