Spaces:
Running
Running
Minor details
Browse files- app.py +47 -24
- utils/notebook_utils.py +1 -1
app.py
CHANGED
@@ -14,8 +14,8 @@ from utils.notebook_utils import (
|
|
14 |
from dotenv import load_dotenv
|
15 |
import os
|
16 |
|
17 |
-
#
|
18 |
-
#
|
19 |
# Add template for training
|
20 |
|
21 |
load_dotenv()
|
@@ -76,22 +76,12 @@ def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
|
|
76 |
rows = content["rows"]
|
77 |
rows = [row["row"] for row in rows]
|
78 |
first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
|
79 |
-
|
80 |
-
features_dict = {feature["name"]: feature["type"] for feature in features}
|
81 |
-
return features_dict, first_rows_df
|
82 |
except Exception as e:
|
83 |
logging.error(f"Error fetching first rows: {e}")
|
84 |
raise
|
85 |
|
86 |
|
87 |
-
def generate_eda_cells(dataset_id):
|
88 |
-
yield from generate_cells(dataset_id, eda_cells, "eda")
|
89 |
-
|
90 |
-
|
91 |
-
def generate_rag_cells(dataset_id):
|
92 |
-
yield from generate_cells(dataset_id, rag_cells, "rag")
|
93 |
-
|
94 |
-
|
95 |
def longest_string_column(df):
|
96 |
longest_col = None
|
97 |
max_length = 0
|
@@ -105,6 +95,14 @@ def longest_string_column(df):
|
|
105 |
return longest_col
|
106 |
|
107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
def generate_embedding_cells(dataset_id):
|
109 |
yield from generate_cells(dataset_id, embeggins_cells, "embeddings")
|
110 |
|
@@ -131,6 +129,7 @@ def _push_to_hub(
|
|
131 |
|
132 |
|
133 |
def generate_cells(dataset_id, cells, notebook_type="eda"):
|
|
|
134 |
try:
|
135 |
libraries = get_compatible_libraries(dataset_id)
|
136 |
except Exception as err:
|
@@ -139,23 +138,22 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
|
|
139 |
return []
|
140 |
|
141 |
if not libraries:
|
142 |
-
|
143 |
-
|
144 |
-
return
|
145 |
-
|
146 |
pandas_library = next(
|
147 |
(lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"),
|
148 |
None,
|
149 |
)
|
150 |
if not pandas_library:
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
first_config_loading_code = pandas_library["loading_codes"][0]
|
155 |
first_code = first_config_loading_code["code"]
|
156 |
first_config = first_config_loading_code["config_name"]
|
157 |
first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
|
158 |
-
|
159 |
|
160 |
longest_col = longest_string_column(df)
|
161 |
html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
|
@@ -163,17 +161,39 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
|
|
163 |
replacements = [dataset_id, first_code, html_code, longest_col]
|
164 |
has_numeric_columns = len(df.select_dtypes(include=["number"]).columns) > 0
|
165 |
has_categoric_columns = len(df.select_dtypes(include=["object"]).columns) > 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
cells = replace_wildcards(
|
167 |
cells, wildcards, replacements, has_numeric_columns, has_categoric_columns
|
168 |
)
|
169 |
generated_text = ""
|
170 |
# Show only the first 40 lines, would like to have a scroll in gr.Code https://github.com/gradio-app/gradio/issues/9192
|
171 |
for cell in cells:
|
172 |
-
|
|
|
|
|
173 |
yield generated_text, ""
|
174 |
if generated_text.count("\n") > 38:
|
175 |
generated_text += (
|
176 |
-
f"## See more lines available in the generated notebook
|
177 |
)
|
178 |
yield generated_text, ""
|
179 |
break
|
@@ -181,7 +201,10 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
|
|
181 |
create_notebook_file(cells, notebook_name=notebook_name)
|
182 |
_push_to_hub(dataset_id, notebook_name)
|
183 |
notebook_link = f"https://colab.research.google.com/#fileId=https%3A//huggingface.co/datasets/asoria/dataset-notebook-creator-content/blob/main/{notebook_name}"
|
184 |
-
yield
|
|
|
|
|
|
|
185 |
|
186 |
|
187 |
with gr.Blocks(fill_height=True, fill_width=True) as demo:
|
|
|
14 |
from dotenv import load_dotenv
|
15 |
import os
|
16 |
|
17 |
+
# TODOs:
|
18 |
+
# Improve UI code preview
|
19 |
# Add template for training
|
20 |
|
21 |
load_dotenv()
|
|
|
76 |
rows = content["rows"]
|
77 |
rows = [row["row"] for row in rows]
|
78 |
first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
|
79 |
+
return first_rows_df
|
|
|
|
|
80 |
except Exception as e:
|
81 |
logging.error(f"Error fetching first rows: {e}")
|
82 |
raise
|
83 |
|
84 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
def longest_string_column(df):
|
86 |
longest_col = None
|
87 |
max_length = 0
|
|
|
95 |
return longest_col
|
96 |
|
97 |
|
98 |
+
def generate_eda_cells(dataset_id):
|
99 |
+
yield from generate_cells(dataset_id, eda_cells, "eda")
|
100 |
+
|
101 |
+
|
102 |
+
def generate_rag_cells(dataset_id):
|
103 |
+
yield from generate_cells(dataset_id, rag_cells, "rag")
|
104 |
+
|
105 |
+
|
106 |
def generate_embedding_cells(dataset_id):
|
107 |
yield from generate_cells(dataset_id, embeggins_cells, "embeddings")
|
108 |
|
|
|
129 |
|
130 |
|
131 |
def generate_cells(dataset_id, cells, notebook_type="eda"):
|
132 |
+
logging.info(f"Generating notebook for dataset {dataset_id}")
|
133 |
try:
|
134 |
libraries = get_compatible_libraries(dataset_id)
|
135 |
except Exception as err:
|
|
|
138 |
return []
|
139 |
|
140 |
if not libraries:
|
141 |
+
logging.error(f"Dataset not compatible with pandas library - not libraries")
|
142 |
+
yield "", "## ❌ This dataset is not compatible with pandas library ❌"
|
143 |
+
return
|
|
|
144 |
pandas_library = next(
|
145 |
(lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"),
|
146 |
None,
|
147 |
)
|
148 |
if not pandas_library:
|
149 |
+
logging.error("Dataset not compatible with pandas library - not pandas library")
|
150 |
+
yield "", "## ❌ This dataset is not compatible with pandas library ❌"
|
151 |
+
return
|
152 |
first_config_loading_code = pandas_library["loading_codes"][0]
|
153 |
first_code = first_config_loading_code["code"]
|
154 |
first_config = first_config_loading_code["config_name"]
|
155 |
first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
|
156 |
+
df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
|
157 |
|
158 |
longest_col = longest_string_column(df)
|
159 |
html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
|
|
|
161 |
replacements = [dataset_id, first_code, html_code, longest_col]
|
162 |
has_numeric_columns = len(df.select_dtypes(include=["number"]).columns) > 0
|
163 |
has_categoric_columns = len(df.select_dtypes(include=["object"]).columns) > 0
|
164 |
+
|
165 |
+
if notebook_type in ("rag", "embeddings") and not has_categoric_columns:
|
166 |
+
logging.error(
|
167 |
+
"Dataset does not have categorical columns, which are required for RAG generation."
|
168 |
+
)
|
169 |
+
yield (
|
170 |
+
"",
|
171 |
+
"## ❌ This dataset does not have categorical columns, which are required for Embeddings/RAG generation ❌",
|
172 |
+
)
|
173 |
+
return
|
174 |
+
if notebook_type == "eda" and not (has_categoric_columns or has_numeric_columns):
|
175 |
+
logging.error(
|
176 |
+
"Dataset does not have categorical or numeric columns, which are required for EDA generation."
|
177 |
+
)
|
178 |
+
yield (
|
179 |
+
"",
|
180 |
+
"## ❌ This dataset does not have categorical or numeric columns, which are required for EDA generation ❌",
|
181 |
+
)
|
182 |
+
return
|
183 |
+
|
184 |
cells = replace_wildcards(
|
185 |
cells, wildcards, replacements, has_numeric_columns, has_categoric_columns
|
186 |
)
|
187 |
generated_text = ""
|
188 |
# Show only the first 40 lines, would like to have a scroll in gr.Code https://github.com/gradio-app/gradio/issues/9192
|
189 |
for cell in cells:
|
190 |
+
if cell["cell_type"] == "markdown":
|
191 |
+
continue
|
192 |
+
generated_text += cell["source"] + "\n\n"
|
193 |
yield generated_text, ""
|
194 |
if generated_text.count("\n") > 38:
|
195 |
generated_text += (
|
196 |
+
f"## See more lines available in the generated notebook 🤗 ......"
|
197 |
)
|
198 |
yield generated_text, ""
|
199 |
break
|
|
|
201 |
create_notebook_file(cells, notebook_name=notebook_name)
|
202 |
_push_to_hub(dataset_id, notebook_name)
|
203 |
notebook_link = f"https://colab.research.google.com/#fileId=https%3A//huggingface.co/datasets/asoria/dataset-notebook-creator-content/blob/main/{notebook_name}"
|
204 |
+
yield (
|
205 |
+
generated_text,
|
206 |
+
f"## ✅ Here you have the [generated notebook]({notebook_link}) ✅",
|
207 |
+
)
|
208 |
|
209 |
|
210 |
with gr.Blocks(fill_height=True, fill_width=True) as demo:
|
utils/notebook_utils.py
CHANGED
@@ -12,7 +12,7 @@ def replace_wildcards(
|
|
12 |
continue
|
13 |
if "type" in tmp and tmp["type"] == "categoric" and not has_categoric_columns:
|
14 |
continue
|
15 |
-
tmp_text = tmp["source"]
|
16 |
for wildcard, replacement in zip(wildcards, replacements):
|
17 |
tmp_text = tmp_text.replace(wildcard, replacement)
|
18 |
new_templates.append({"cell_type": tmp["cell_type"], "source": tmp_text})
|
|
|
12 |
continue
|
13 |
if "type" in tmp and tmp["type"] == "categoric" and not has_categoric_columns:
|
14 |
continue
|
15 |
+
tmp_text = tmp["source"].strip()
|
16 |
for wildcard, replacement in zip(wildcards, replacements):
|
17 |
tmp_text = tmp_text.replace(wildcard, replacement)
|
18 |
new_templates.append({"cell_type": tmp["cell_type"], "source": tmp_text})
|