polinaeterna HF staff committed on
Commit
e5960a0
•
1 Parent(s): 4105710

set max examples manually

Browse files
Files changed (1) hide show
  1. app.py +5 -3
app.py CHANGED
@@ -56,19 +56,20 @@ def plot_and_df(texts, preds):
56
  )
57
 
58
 
59
- def run_quality_check(dataset, column, batch_size):
60
  config = "default"
61
  data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/train/0000.parquet", columns=[column])
62
  texts = data[column].to_list()
63
  # batch_size = 100
64
  predictions, texts_processed = [], []
65
- for i in range(0, min(len(texts), batch_size*5), batch_size):
66
  batch_texts = texts[i:i+batch_size]
67
  batch_predictions = predict(batch_texts)
68
  predictions.extend(batch_predictions)
69
  texts_processed.extend(batch_texts)
70
  yield plot_and_df(texts_processed, predictions)
71
 
 
72
  with gr.Blocks() as demo:
73
  gr.Markdown("# 💫 Dataset Quality Checker 💫")
74
  dataset_name = HuggingfaceHubSearch(
@@ -91,11 +92,12 @@ with gr.Blocks() as demo:
91
  return gr.HTML(value=html_code)
92
  text_column = gr.Textbox(placeholder="text", label="Text colum name to check (data must be non-nested, raw texts!)")
93
  batch_size = gr.Number(100, label="Batch size")
 
94
  gr_check_btn = gr.Button("Check Dataset")
95
  plot = gr.BarPlot()
96
 
97
  with gr.Accordion("Explore some individual examples for each class", open=False):
98
  df_low, df_medium, df_high = gr.DataFrame(), gr.DataFrame(), gr.DataFrame()
99
- gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, batch_size], outputs=[plot, df_low, df_medium, df_high])
100
 
101
  demo.launch()
 
56
  )
57
 
58
 
59
+ def run_quality_check(dataset, column, batch_size, num_examples):
60
  config = "default"
61
  data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/train/0000.parquet", columns=[column])
62
  texts = data[column].to_list()
63
  # batch_size = 100
64
  predictions, texts_processed = [], []
65
+ for i in range(0, min(len(texts), num_examples), batch_size):
66
  batch_texts = texts[i:i+batch_size]
67
  batch_predictions = predict(batch_texts)
68
  predictions.extend(batch_predictions)
69
  texts_processed.extend(batch_texts)
70
  yield plot_and_df(texts_processed, predictions)
71
 
72
+
73
  with gr.Blocks() as demo:
74
  gr.Markdown("# 💫 Dataset Quality Checker 💫")
75
  dataset_name = HuggingfaceHubSearch(
 
92
  return gr.HTML(value=html_code)
93
  text_column = gr.Textbox(placeholder="text", label="Text colum name to check (data must be non-nested, raw texts!)")
94
  batch_size = gr.Number(100, label="Batch size")
95
+ num_examples = gr.Number(1000, label="Num examples to check")
96
  gr_check_btn = gr.Button("Check Dataset")
97
  plot = gr.BarPlot()
98
 
99
  with gr.Accordion("Explore some individual examples for each class", open=False):
100
  df_low, df_medium, df_high = gr.DataFrame(), gr.DataFrame(), gr.DataFrame()
101
+ gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, batch_size, num_examples], outputs=[plot, df_low, df_medium, df_high])
102
 
103
  demo.launch()