polinaeterna HF staff committed on
Commit
3aad6e9
•
1 Parent(s): c0e4fc0

add samples

Browse files
Files changed (1) hide show
  1. app.py +12 -11
app.py CHANGED
@@ -47,12 +47,17 @@ def predict(texts: list[str]):
47
  def run_quality_check(dataset, column, n_samples):
48
  config = "default"
49
  data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/train/0000.parquet", columns=[column])
50
- texts = data[column].to_list()
51
  predictions = predict(texts[:n_samples])
 
52
  counts = pd.DataFrame({"quality": predictions}).value_counts().to_frame()
53
  counts.reset_index(inplace=True)
54
- return gr.BarPlot(counts, x="quality", y="count")
55
-
 
 
 
 
56
 
57
  with gr.Blocks() as demo:
58
  gr.Markdown("# 💫 Dataset Quality Checker 💫")
@@ -62,12 +67,6 @@ with gr.Blocks() as demo:
62
  search_type="dataset",
63
  value="fka/awesome-chatgpt-prompts",
64
  )
65
- # dataset_name = HuggingfaceHubSearch(
66
- # label="Hub Dataset ID",
67
- # placeholder="Search for dataset id on Huggingface",
68
- # search_type="dataset",
69
- # value="HuggingFaceFW/fineweb",
70
- # )
71
  # config_name = "default" # TODO: user input
72
  @gr.render(inputs=dataset_name)
73
  def embed(name):
@@ -84,7 +83,9 @@ with gr.Blocks() as demo:
84
  n_samples = gr.Number(label="Num first samples to run check")
85
  gr_check_btn = gr.Button("Check Dataset")
86
  plot = gr.BarPlot()
87
- # df = gr.DataFrame(visible=False)
88
- gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, n_samples], outputs=[plot])
 
 
89
 
90
  demo.launch()
 
47
  def run_quality_check(dataset, column, n_samples):
48
  config = "default"
49
  data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/train/0000.parquet", columns=[column])
50
+ texts = data[column].to_list()[:n_samples]
51
  predictions = predict(texts[:n_samples])
52
+ texts_df = pd.DataFrame({"quality": predictions, "text": texts})
53
  counts = pd.DataFrame({"quality": predictions}).value_counts().to_frame()
54
  counts.reset_index(inplace=True)
55
+ return (
56
+ gr.BarPlot(counts, x="quality", y="count"),
57
+ texts_df[texts_df["quality"] == "Low"][:20],
58
+ texts_df[texts_df["quality"] == "Medium"][:20],
59
+ texts_df[texts_df["quality"] == "High"][:20],
60
+ )
61
 
62
  with gr.Blocks() as demo:
63
  gr.Markdown("# 💫 Dataset Quality Checker 💫")
 
67
  search_type="dataset",
68
  value="fka/awesome-chatgpt-prompts",
69
  )
 
 
 
 
 
 
70
  # config_name = "default" # TODO: user input
71
  @gr.render(inputs=dataset_name)
72
  def embed(name):
 
83
  n_samples = gr.Number(label="Num first samples to run check")
84
  gr_check_btn = gr.Button("Check Dataset")
85
  plot = gr.BarPlot()
86
+
87
+ with gr.Accordion("Explore some individual examples for each class", open=False):
88
+ df_low, df_medium, df_high = gr.DataFrame(), gr.DataFrame(), gr.DataFrame()
89
+ gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, n_samples], outputs=[plot, df_low, df_medium, df_high])
90
 
91
  demo.launch()