Spaces:

polinaeterna
/

text_quality_checker

Running on Zero

polinaeterna committed on Aug 28, 2024

Commit

e5960a0

1 Parent(s): 4105710

set max examples manually

Files changed (1) hide show

app.py CHANGED Viewed

@@ -56,19 +56,20 @@ def plot_and_df(texts, preds):
         )
-def run_quality_check(dataset, column, batch_size):
     config = "default"
     data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/train/0000.parquet", columns=[column])
     texts = data[column].to_list()
     # batch_size = 100
     predictions, texts_processed = [], []
-    for i in range(0, min(len(texts), batch_size*5), batch_size):
         batch_texts = texts[i:i+batch_size]
         batch_predictions = predict(batch_texts)
         predictions.extend(batch_predictions)
         texts_processed.extend(batch_texts)
         yield plot_and_df(texts_processed, predictions)
 with gr.Blocks() as demo:
     gr.Markdown("# 💫 Dataset Quality Checker 💫")
     dataset_name = HuggingfaceHubSearch(
@@ -91,11 +92,12 @@ with gr.Blocks() as demo:
         return gr.HTML(value=html_code)
     text_column = gr.Textbox(placeholder="text", label="Text colum name to check (data must be non-nested, raw texts!)")
     batch_size = gr.Number(100, label="Batch size")
     gr_check_btn = gr.Button("Check Dataset")
     plot = gr.BarPlot()
     with gr.Accordion("Explore some individual examples for each class", open=False):
         df_low, df_medium, df_high = gr.DataFrame(), gr.DataFrame(), gr.DataFrame()
-    gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, batch_size], outputs=[plot, df_low, df_medium, df_high])
 demo.launch()

         )
+def run_quality_check(dataset, column, batch_size, num_examples):
     config = "default"
     data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/train/0000.parquet", columns=[column])
     texts = data[column].to_list()
     # batch_size = 100
     predictions, texts_processed = [], []
+    for i in range(0, min(len(texts), num_examples), batch_size):
         batch_texts = texts[i:i+batch_size]
         batch_predictions = predict(batch_texts)
         predictions.extend(batch_predictions)
         texts_processed.extend(batch_texts)
         yield plot_and_df(texts_processed, predictions)
 with gr.Blocks() as demo:
     gr.Markdown("# 💫 Dataset Quality Checker 💫")
     dataset_name = HuggingfaceHubSearch(
         return gr.HTML(value=html_code)
     text_column = gr.Textbox(placeholder="text", label="Text colum name to check (data must be non-nested, raw texts!)")
     batch_size = gr.Number(100, label="Batch size")
+    num_examples = gr.Number(1000, label="Num examples to check")
     gr_check_btn = gr.Button("Check Dataset")
     plot = gr.BarPlot()
     with gr.Accordion("Explore some individual examples for each class", open=False):
         df_low, df_medium, df_high = gr.DataFrame(), gr.DataFrame(), gr.DataFrame()
+    gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, batch_size, num_examples], outputs=[plot, df_low, df_medium, df_high])
 demo.launch()