Spaces:

polinaeterna
/

text_quality_checker

Running on Zero

App Files Files Community

polinaeterna committed on Sep 25, 2024

Commit

7d66c17

1 Parent(s): b3d9c4b

update text

Browse files

Files changed (1) hide show

app.py +11 -11

app.py CHANGED Viewed

@@ -82,7 +82,7 @@ def plot_and_df(texts, preds):
 def get_first_parquet_filename(dataset, config, split):
-    parquet_resp = session.get(f"https://datasets-server.huggingface.co/parquet?dataset={dataset}&config={config}", timeout=10).json()
     if "error" in parquet_resp:
         raise ValueError(parquet_resp["error"])
     first_parquet_file_url = [file for file in parquet_resp["parquet_files"] if file["split"] == split][0]["url"]
@@ -217,11 +217,13 @@ def call_perspective_api(texts_df, column_name, nested_column_name, dataset, con
 with gr.Blocks() as demo:
     gr.Markdown(
         """
-        # 💫 Dataset Quality Checker 💫
-        This space:
-         * uses [NVIDIA's quality classifier model](https://huggingface.co/nvidia/quality-classifier-deberta)
-        on a subset of any text dataset on the Hub to give a quick glance on the quality of texts.
-         * uses [Perspective](https://perspectiveapi.com/how-it-works/) to check toxicity of some random samples
         ## Select dataset and text column
         """
     )
@@ -235,7 +237,6 @@ with gr.Blocks() as demo:
         subset_dropdown = gr.Dropdown(label="Subset", visible=False)
         split_dropdown = gr.Dropdown(label="Split", visible=False)
-    # config_name = "default"  # TODO: user input
     with gr.Accordion("Dataset preview", open=False):
         @gr.render(inputs=[dataset_name, subset_dropdown, split_dropdown])
         def embed(name, subset, split):
@@ -261,7 +262,7 @@ with gr.Blocks() as demo:
                 text_column_dropdown: gr.Dropdown(label="Text column name"),
                 nested_text_column_dropdown: gr.Dropdown(visible=False)
             }
-        info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=10).json()
         if "error" in info_resp:
             return {
                 subset_dropdown: gr.Dropdown(visible=False),
@@ -285,7 +286,7 @@ with gr.Blocks() as demo:
             return {
                 subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
                 split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
-                text_column_dropdown: gr.Dropdown(choices=text_features + nested_text_features, label="Text column name",),
                 nested_text_column_dropdown: gr.Dropdown(visible=False),
             }
         logging.info(nested_text_features)
@@ -364,8 +365,7 @@ with gr.Blocks() as demo:
     gr.Markdown("""## Explore toxicity
     Run [Perspective](https://perspectiveapi.com/how-it-works/) on 100 random samples to check toxicity
     """)
-    # checkbox = gr.Checkbox(value=False, label="Run on full first parquet data (better not)")
-    gr_toxicity_btn = gr.Button("Run Perpspective API")
     toxicity_progress_bar = gr.Label(show_label=False)
     toxicity_hist = gr.Plot()
     with gr.Accordion("Explore examples with toxicity scores:", open=False):

 def get_first_parquet_filename(dataset, config, split):
+    parquet_resp = session.get(f"https://datasets-server.huggingface.co/parquet?dataset={dataset}&config={config}", timeout=20).json()
     if "error" in parquet_resp:
         raise ValueError(parquet_resp["error"])
     first_parquet_file_url = [file for file in parquet_resp["parquet_files"] if file["split"] == split][0]["url"]
 with gr.Blocks() as demo:
     gr.Markdown(
         """
+        # 📈 Data Quality Checker 📉
+        This space gives some instruments to have a quick glance at the quality of a text dataset.
+         * It uses [NVIDIA's quality classifier model](https://huggingface.co/nvidia/quality-classifier-deberta)
+        on a small subset of texts.
+         * It uses [Perspective](https://perspectiveapi.com/how-it-works/) API to check toxicity of 100 random dataset texts
         ## Select dataset and text column
         """
     )
         subset_dropdown = gr.Dropdown(label="Subset", visible=False)
         split_dropdown = gr.Dropdown(label="Split", visible=False)
     with gr.Accordion("Dataset preview", open=False):
         @gr.render(inputs=[dataset_name, subset_dropdown, split_dropdown])
         def embed(name, subset, split):
                 text_column_dropdown: gr.Dropdown(label="Text column name"),
                 nested_text_column_dropdown: gr.Dropdown(visible=False)
             }
+        info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=20).json()
         if "error" in info_resp:
             return {
                 subset_dropdown: gr.Dropdown(visible=False),
             return {
                 subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
                 split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
+                text_column_dropdown: gr.Dropdown(choices=text_features + nested_text_features, label="Text column name"),
                 nested_text_column_dropdown: gr.Dropdown(visible=False),
             }
         logging.info(nested_text_features)
     gr.Markdown("""## Explore toxicity
     Run [Perspective](https://perspectiveapi.com/how-it-works/) on 100 random samples to check toxicity
     """)
+    gr_toxicity_btn = gr.Button("Run Perpspective")
     toxicity_progress_bar = gr.Label(show_label=False)
     toxicity_hist = gr.Plot()
     with gr.Accordion("Explore examples with toxicity scores:", open=False):