asoria (HF staff) committed
Commit 7f6f34c • 1 parent: ba1088f

Minor details

Files changed (2):
  1. app.py +47 -24
  2. utils/notebook_utils.py +1 -1
app.py CHANGED

@@ -14,8 +14,8 @@ from utils.notebook_utils import (
 from dotenv import load_dotenv
 import os
 
-# TODOS:
-# Validate dataset type for type before generating the notebook
+# TODOs:
+# Improve UI code preview
 # Add template for training
 
 load_dotenv()
@@ -76,22 +76,12 @@ def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
         rows = content["rows"]
         rows = [row["row"] for row in rows]
         first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
-        features = content["features"]
-        features_dict = {feature["name"]: feature["type"] for feature in features}
-        return features_dict, first_rows_df
+        return first_rows_df
     except Exception as e:
         logging.error(f"Error fetching first rows: {e}")
         raise
 
 
-def generate_eda_cells(dataset_id):
-    yield from generate_cells(dataset_id, eda_cells, "eda")
-
-
-def generate_rag_cells(dataset_id):
-    yield from generate_cells(dataset_id, rag_cells, "rag")
-
-
 def longest_string_column(df):
     longest_col = None
     max_length = 0
@@ -105,6 +95,14 @@ def longest_string_column(df):
     return longest_col
 
 
+def generate_eda_cells(dataset_id):
+    yield from generate_cells(dataset_id, eda_cells, "eda")
+
+
+def generate_rag_cells(dataset_id):
+    yield from generate_cells(dataset_id, rag_cells, "rag")
+
+
 def generate_embedding_cells(dataset_id):
     yield from generate_cells(dataset_id, embeggins_cells, "embeddings")
 
@@ -131,6 +129,7 @@ def _push_to_hub(
 
 
 def generate_cells(dataset_id, cells, notebook_type="eda"):
+    logging.info(f"Generating notebook for dataset {dataset_id}")
     try:
         libraries = get_compatible_libraries(dataset_id)
     except Exception as err:
@@ -139,23 +138,22 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
         return []
 
     if not libraries:
-        gr.Error("Dataset not compatible with pandas library.")
-        logging.error(f"Dataset not compatible with pandas library")
-        return gr.File(visible=False), gr.Row.update(visible=False)
-
+        logging.error(f"Dataset not compatible with pandas library - not libraries")
+        yield "", "## ❌ This dataset is not compatible with pandas library ❌"
+        return
     pandas_library = next(
         (lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"),
         None,
     )
     if not pandas_library:
-        gr.Error("Dataset not compatible with pandas library.")
-        return []
-
+        logging.error("Dataset not compatible with pandas library - not pandas library")
+        yield "", "## ❌ This dataset is not compatible with pandas library ❌"
+        return
     first_config_loading_code = pandas_library["loading_codes"][0]
     first_code = first_config_loading_code["code"]
     first_config = first_config_loading_code["config_name"]
     first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
-    features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
+    df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
 
     longest_col = longest_string_column(df)
     html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
@@ -163,17 +161,39 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
     replacements = [dataset_id, first_code, html_code, longest_col]
     has_numeric_columns = len(df.select_dtypes(include=["number"]).columns) > 0
     has_categoric_columns = len(df.select_dtypes(include=["object"]).columns) > 0
+
+    if notebook_type in ("rag", "embeddings") and not has_categoric_columns:
+        logging.error(
+            "Dataset does not have categorical columns, which are required for RAG generation."
+        )
+        yield (
+            "",
+            "## ❌ This dataset does not have categorical columns, which are required for Embeddings/RAG generation ❌",
+        )
+        return
+    if notebook_type == "eda" and not (has_categoric_columns or has_numeric_columns):
+        logging.error(
+            "Dataset does not have categorical or numeric columns, which are required for EDA generation."
+        )
+        yield (
+            "",
+            "## ❌ This dataset does not have categorical or numeric columns, which are required for EDA generation ❌",
+        )
+        return
+
     cells = replace_wildcards(
         cells, wildcards, replacements, has_numeric_columns, has_categoric_columns
     )
     generated_text = ""
     # Show only the first 40 lines, would like to have a scroll in gr.Code https://github.com/gradio-app/gradio/issues/9192
     for cell in cells:
-        generated_text += cell["source"] + "\n"
+        if cell["cell_type"] == "markdown":
+            continue
+        generated_text += cell["source"] + "\n\n"
         yield generated_text, ""
         if generated_text.count("\n") > 38:
             generated_text += (
-                f"## See more lines available in the generated notebook :) ......"
+                f"## See more lines available in the generated notebook 🤗 ......"
             )
             yield generated_text, ""
             break
@@ -181,7 +201,10 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
     create_notebook_file(cells, notebook_name=notebook_name)
     _push_to_hub(dataset_id, notebook_name)
     notebook_link = f"https://colab.research.google.com/#fileId=https%3A//huggingface.co/datasets/asoria/dataset-notebook-creator-content/blob/main/{notebook_name}"
-    yield generated_text, f"## Here you have the [generated notebook]({notebook_link})"
+    yield (
+        generated_text,
+        f"## ✅ Here you have the [generated notebook]({notebook_link}) ✅",
+    )
 
 
 with gr.Blocks(fill_height=True, fill_width=True) as demo:
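With this change, generate_cells reports incompatible datasets by yielding a markdown status message instead of calling gr.Error, streaming (generated_text, status_markdown) pairs to the UI. The snippet below is only a minimal wiring sketch of that pattern, not code from this commit: the component names (dataset_input, code_preview, status_md), the button label, and the stand-in generator are all assumptions for illustration.

import gradio as gr

def fake_generate_cells(dataset_id):
    # Stand-in for generate_eda_cells from app.py: a generator that yields
    # (code_preview_text, status_markdown) pairs as the notebook is built.
    yield "import pandas as pd", ""
    yield "import pandas as pd\n\ndf = ...", "## ✅ Done ✅"

with gr.Blocks() as sketch_demo:
    dataset_input = gr.Textbox(label="Dataset ID")               # assumed name
    code_preview = gr.Code(language="python", label="Preview")   # assumed name
    status_md = gr.Markdown()                                    # assumed name
    gr.Button("Generate EDA notebook").click(
        fake_generate_cells,
        inputs=dataset_input,
        outputs=[code_preview, status_md],
    )

# sketch_demo.launch()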
utils/notebook_utils.py CHANGED

@@ -12,7 +12,7 @@ def replace_wildcards(
             continue
         if "type" in tmp and tmp["type"] == "categoric" and not has_categoric_columns:
             continue
-        tmp_text = tmp["source"]
+        tmp_text = tmp["source"].strip()
         for wildcard, replacement in zip(wildcards, replacements):
            tmp_text = tmp_text.replace(wildcard, replacement)
         new_templates.append({"cell_type": tmp["cell_type"], "source": tmp_text})
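The only change here is that each template's source is stripped of leading and trailing whitespace before wildcard substitution, so generated cells no longer begin or end with blank lines. Below is a small self-contained example of that substitution step; the wildcard token and replacement value are hypothetical, since the real wildcards list is defined elsewhere in the repo.

# Hypothetical template, wildcard, and replacement for illustration only.
template = {
    "cell_type": "code",
    "source": "\nimport pandas as pd\n\ndf = pd.read_parquet('{first_code}')\n",
}
wildcards = ["{first_code}"]                                  # assumed placeholder token
replacements = ["hf://datasets/some/dataset/train.parquet"]   # assumed value

tmp_text = template["source"].strip()  # new: drops the surrounding blank lines
for wildcard, replacement in zip(wildcards, replacements):
    tmp_text = tmp_text.replace(wildcard, replacement)

print(tmp_text)
# import pandas as pd
#
# df = pd.read_parquet('hf://datasets/some/dataset/train.parquet')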