Spaces:
Running
on
Zero
Running
on
Zero
Commit
•
e5960a0
1
Parent(s):
4105710
set max examples manually
Browse files
app.py
CHANGED
@@ -56,19 +56,20 @@ def plot_and_df(texts, preds):
|
|
56 |
)
|
57 |
|
58 |
|
59 |
-
def run_quality_check(dataset, column, batch_size):
|
60 |
config = "default"
|
61 |
data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/train/0000.parquet", columns=[column])
|
62 |
texts = data[column].to_list()
|
63 |
# batch_size = 100
|
64 |
predictions, texts_processed = [], []
|
65 |
-
for i in range(0, min(len(texts),
|
66 |
batch_texts = texts[i:i+batch_size]
|
67 |
batch_predictions = predict(batch_texts)
|
68 |
predictions.extend(batch_predictions)
|
69 |
texts_processed.extend(batch_texts)
|
70 |
yield plot_and_df(texts_processed, predictions)
|
71 |
|
|
|
72 |
with gr.Blocks() as demo:
|
73 |
gr.Markdown("# π« Dataset Quality Checker π«")
|
74 |
dataset_name = HuggingfaceHubSearch(
|
@@ -91,11 +92,12 @@ with gr.Blocks() as demo:
|
|
91 |
return gr.HTML(value=html_code)
|
92 |
text_column = gr.Textbox(placeholder="text", label="Text colum name to check (data must be non-nested, raw texts!)")
|
93 |
batch_size = gr.Number(100, label="Batch size")
|
|
|
94 |
gr_check_btn = gr.Button("Check Dataset")
|
95 |
plot = gr.BarPlot()
|
96 |
|
97 |
with gr.Accordion("Explore some individual examples for each class", open=False):
|
98 |
df_low, df_medium, df_high = gr.DataFrame(), gr.DataFrame(), gr.DataFrame()
|
99 |
-
gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, batch_size], outputs=[plot, df_low, df_medium, df_high])
|
100 |
|
101 |
demo.launch()
|
|
|
56 |
)
|
57 |
|
58 |
|
59 |
+
def run_quality_check(dataset, column, batch_size, num_examples):
|
60 |
config = "default"
|
61 |
data = pl.read_parquet(f"hf://datasets/{dataset}@~parquet/{config}/train/0000.parquet", columns=[column])
|
62 |
texts = data[column].to_list()
|
63 |
# batch_size = 100
|
64 |
predictions, texts_processed = [], []
|
65 |
+
for i in range(0, min(len(texts), num_examples), batch_size):
|
66 |
batch_texts = texts[i:i+batch_size]
|
67 |
batch_predictions = predict(batch_texts)
|
68 |
predictions.extend(batch_predictions)
|
69 |
texts_processed.extend(batch_texts)
|
70 |
yield plot_and_df(texts_processed, predictions)
|
71 |
|
72 |
+
|
73 |
with gr.Blocks() as demo:
|
74 |
gr.Markdown("# π« Dataset Quality Checker π«")
|
75 |
dataset_name = HuggingfaceHubSearch(
|
|
|
92 |
return gr.HTML(value=html_code)
|
93 |
text_column = gr.Textbox(placeholder="text", label="Text colum name to check (data must be non-nested, raw texts!)")
|
94 |
batch_size = gr.Number(100, label="Batch size")
|
95 |
+
num_examples = gr.Number(1000, label="Num examples to check")
|
96 |
gr_check_btn = gr.Button("Check Dataset")
|
97 |
plot = gr.BarPlot()
|
98 |
|
99 |
with gr.Accordion("Explore some individual examples for each class", open=False):
|
100 |
df_low, df_medium, df_high = gr.DataFrame(), gr.DataFrame(), gr.DataFrame()
|
101 |
+
gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, batch_size, num_examples], outputs=[plot, df_low, df_medium, df_high])
|
102 |
|
103 |
demo.launch()
|