polinaeterna HF staff committed on
Commit
7d66c17
•
1 Parent(s): b3d9c4b

update text

Browse files
Files changed (1) hide show
  1. app.py +11 -11
app.py CHANGED
@@ -82,7 +82,7 @@ def plot_and_df(texts, preds):
82
 
83
 
84
  def get_first_parquet_filename(dataset, config, split):
85
- parquet_resp = session.get(f"https://datasets-server.huggingface.co/parquet?dataset={dataset}&config={config}", timeout=10).json()
86
  if "error" in parquet_resp:
87
  raise ValueError(parquet_resp["error"])
88
  first_parquet_file_url = [file for file in parquet_resp["parquet_files"] if file["split"] == split][0]["url"]
@@ -217,11 +217,13 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
217
  with gr.Blocks() as demo:
218
  gr.Markdown(
219
  """
220
- # 💫 Dataset Quality Checker 💫
221
- This space:
222
- * uses [NVIDIA's quality classifier model](https://huggingface.co/nvidia/quality-classifier-deberta)
223
- on a subset of any text dataset on the Hub to give a quick glance on the quality of texts.
224
- * uses [Perspective](https://perspectiveapi.com/how-it-works/) to check toxicity of some random samples
 
 
225
  ## Select dataset and text column
226
  """
227
  )
@@ -235,7 +237,6 @@ with gr.Blocks() as demo:
235
  subset_dropdown = gr.Dropdown(label="Subset", visible=False)
236
  split_dropdown = gr.Dropdown(label="Split", visible=False)
237
 
238
- # config_name = "default" # TODO: user input
239
  with gr.Accordion("Dataset preview", open=False):
240
  @gr.render(inputs=[dataset_name, subset_dropdown, split_dropdown])
241
  def embed(name, subset, split):
@@ -261,7 +262,7 @@ with gr.Blocks() as demo:
261
  text_column_dropdown: gr.Dropdown(label="Text column name"),
262
  nested_text_column_dropdown: gr.Dropdown(visible=False)
263
  }
264
- info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=10).json()
265
  if "error" in info_resp:
266
  return {
267
  subset_dropdown: gr.Dropdown(visible=False),
@@ -285,7 +286,7 @@ with gr.Blocks() as demo:
285
  return {
286
  subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
287
  split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
288
- text_column_dropdown: gr.Dropdown(choices=text_features + nested_text_features, label="Text column name",),
289
  nested_text_column_dropdown: gr.Dropdown(visible=False),
290
  }
291
  logging.info(nested_text_features)
@@ -364,8 +365,7 @@ with gr.Blocks() as demo:
364
  gr.Markdown("""## Explore toxicity
365
  Run [Perspective](https://perspectiveapi.com/how-it-works/) on 100 random samples to check toxicity
366
  """)
367
- # checkbox = gr.Checkbox(value=False, label="Run on full first parquet data (better not)")
368
- gr_toxicity_btn = gr.Button("Run Perpspective API")
369
  toxicity_progress_bar = gr.Label(show_label=False)
370
  toxicity_hist = gr.Plot()
371
  with gr.Accordion("Explore examples with toxicity scores:", open=False):
 
82
 
83
 
84
  def get_first_parquet_filename(dataset, config, split):
85
+ parquet_resp = session.get(f"https://datasets-server.huggingface.co/parquet?dataset={dataset}&config={config}", timeout=20).json()
86
  if "error" in parquet_resp:
87
  raise ValueError(parquet_resp["error"])
88
  first_parquet_file_url = [file for file in parquet_resp["parquet_files"] if file["split"] == split][0]["url"]
 
217
  with gr.Blocks() as demo:
218
  gr.Markdown(
219
  """
220
+ # 📈 Data Quality Checker 📉
221
+
222
+ This space gives some instruments to have a quick glance at the quality of a text dataset.
223
+ * It uses [NVIDIA's quality classifier model](https://huggingface.co/nvidia/quality-classifier-deberta)
224
+ on a small subset of texts.
225
+ * It uses [Perspective](https://perspectiveapi.com/how-it-works/) API to check toxicity of 100 random dataset texts
226
+
227
  ## Select dataset and text column
228
  """
229
  )
 
237
  subset_dropdown = gr.Dropdown(label="Subset", visible=False)
238
  split_dropdown = gr.Dropdown(label="Split", visible=False)
239
 
 
240
  with gr.Accordion("Dataset preview", open=False):
241
  @gr.render(inputs=[dataset_name, subset_dropdown, split_dropdown])
242
  def embed(name, subset, split):
 
262
  text_column_dropdown: gr.Dropdown(label="Text column name"),
263
  nested_text_column_dropdown: gr.Dropdown(visible=False)
264
  }
265
+ info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=20).json()
266
  if "error" in info_resp:
267
  return {
268
  subset_dropdown: gr.Dropdown(visible=False),
 
286
  return {
287
  subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
288
  split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
289
+ text_column_dropdown: gr.Dropdown(choices=text_features + nested_text_features, label="Text column name"),
290
  nested_text_column_dropdown: gr.Dropdown(visible=False),
291
  }
292
  logging.info(nested_text_features)
 
365
  gr.Markdown("""## Explore toxicity
366
  Run [Perspective](https://perspectiveapi.com/how-it-works/) on 100 random samples to check toxicity
367
  """)
368
+ gr_toxicity_btn = gr.Button("Run Perpspective")
 
369
  toxicity_progress_bar = gr.Label(show_label=False)
370
  toxicity_hist = gr.Plot()
371
  with gr.Accordion("Explore examples with toxicity scores:", open=False):