asoria (HF staff) committed
Commit 7f6f34c • 1 parent: ba1088f

Minor details

Files changed (2):
  1. app.py +47 -24
  2. utils/notebook_utils.py +1 -1
app.py CHANGED

@@ -14,8 +14,8 @@ from utils.notebook_utils import (
 from dotenv import load_dotenv
 import os
 
-# TODOS:
-# Validate dataset type for type before generating the notebook
+# TODOs:
+# Improve UI code preview
 # Add template for training
 
 load_dotenv()
@@ -76,22 +76,12 @@ def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
         rows = content["rows"]
         rows = [row["row"] for row in rows]
         first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
-        features = content["features"]
-        features_dict = {feature["name"]: feature["type"] for feature in features}
-        return features_dict, first_rows_df
+        return first_rows_df
     except Exception as e:
         logging.error(f"Error fetching first rows: {e}")
         raise
 
 
-def generate_eda_cells(dataset_id):
-    yield from generate_cells(dataset_id, eda_cells, "eda")
-
-
-def generate_rag_cells(dataset_id):
-    yield from generate_cells(dataset_id, rag_cells, "rag")
-
-
 def longest_string_column(df):
     longest_col = None
     max_length = 0
@@ -105,6 +95,14 @@ def longest_string_column(df):
     return longest_col
 
 
+def generate_eda_cells(dataset_id):
+    yield from generate_cells(dataset_id, eda_cells, "eda")
+
+
+def generate_rag_cells(dataset_id):
+    yield from generate_cells(dataset_id, rag_cells, "rag")
+
+
 def generate_embedding_cells(dataset_id):
     yield from generate_cells(dataset_id, embeggins_cells, "embeddings")
 
@@ -131,6 +129,7 @@ def _push_to_hub(
 
 
 def generate_cells(dataset_id, cells, notebook_type="eda"):
+    logging.info(f"Generating notebook for dataset {dataset_id}")
     try:
         libraries = get_compatible_libraries(dataset_id)
     except Exception as err:
@@ -139,23 +138,22 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
         return []
 
     if not libraries:
-        gr.Error("Dataset not compatible with pandas library.")
-        logging.error(f"Dataset not compatible with pandas library")
-        return gr.File(visible=False), gr.Row.update(visible=False)
-
+        logging.error(f"Dataset not compatible with pandas library - not libraries")
+        yield "", "## ❌ This dataset is not compatible with pandas library ❌"
+        return
     pandas_library = next(
         (lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"),
         None,
     )
     if not pandas_library:
-        gr.Error("Dataset not compatible with pandas library.")
-        return []
-
+        logging.error("Dataset not compatible with pandas library - not pandas library")
+        yield "", "## ❌ This dataset is not compatible with pandas library ❌"
+        return
     first_config_loading_code = pandas_library["loading_codes"][0]
     first_code = first_config_loading_code["code"]
     first_config = first_config_loading_code["config_name"]
     first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
-    features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
+    df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
 
     longest_col = longest_string_column(df)
     html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
@@ -163,17 +161,39 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
     replacements = [dataset_id, first_code, html_code, longest_col]
     has_numeric_columns = len(df.select_dtypes(include=["number"]).columns) > 0
     has_categoric_columns = len(df.select_dtypes(include=["object"]).columns) > 0
+
+    if notebook_type in ("rag", "embeddings") and not has_categoric_columns:
+        logging.error(
+            "Dataset does not have categorical columns, which are required for RAG generation."
+        )
+        yield (
+            "",
+            "## ❌ This dataset does not have categorical columns, which are required for Embeddings/RAG generation ❌",
+        )
+        return
+    if notebook_type == "eda" and not (has_categoric_columns or has_numeric_columns):
+        logging.error(
+            "Dataset does not have categorical or numeric columns, which are required for EDA generation."
+        )
+        yield (
+            "",
+            "## ❌ This dataset does not have categorical or numeric columns, which are required for EDA generation ❌",
+        )
+        return
+
     cells = replace_wildcards(
         cells, wildcards, replacements, has_numeric_columns, has_categoric_columns
     )
     generated_text = ""
     # Show only the first 40 lines, would like to have a scroll in gr.Code https://github.com/gradio-app/gradio/issues/9192
     for cell in cells:
-        generated_text += cell["source"] + "\n"
+        if cell["cell_type"] == "markdown":
+            continue
+        generated_text += cell["source"] + "\n\n"
         yield generated_text, ""
         if generated_text.count("\n") > 38:
             generated_text += (
-                f"## See more lines available in the generated notebook :) ......"
+                f"## See more lines available in the generated notebook 🤗 ......"
             )
             yield generated_text, ""
             break
@@ -181,7 +201,10 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
     create_notebook_file(cells, notebook_name=notebook_name)
     _push_to_hub(dataset_id, notebook_name)
     notebook_link = f"https://colab.research.google.com/#fileId=https%3A//huggingface.co/datasets/asoria/dataset-notebook-creator-content/blob/main/{notebook_name}"
-    yield generated_text, f"## Here you have the [generated notebook]({notebook_link})"
+    yield (
+        generated_text,
+        f"## ✅ Here you have the [generated notebook]({notebook_link}) ✅",
+    )
 
 
 with gr.Blocks(fill_height=True, fill_width=True) as demo:
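With this change, generate_cells reports incompatible datasets by yielding a markdown status message instead of calling gr.Error, streaming (generated_text, status_markdown) pairs to the UI. The snippet below is only a minimal wiring sketch of that pattern, not code from this commit: the component names (dataset_input, code_preview, status_md), the button label, and the stand-in generator are all assumptions for illustration.

import gradio as gr

def fake_generate_cells(dataset_id):
    # Stand-in for generate_eda_cells from app.py: a generator that yields
    # (code_preview_text, status_markdown) pairs as the notebook is built.
    yield "import pandas as pd", ""
    yield "import pandas as pd\n\ndf = ...", "## ✅ Done ✅"

with gr.Blocks() as sketch_demo:
    dataset_input = gr.Textbox(label="Dataset ID")               # assumed name
    code_preview = gr.Code(language="python", label="Preview")   # assumed name
    status_md = gr.Markdown()                                    # assumed name
    gr.Button("Generate EDA notebook").click(
        fake_generate_cells,
        inputs=dataset_input,
        outputs=[code_preview, status_md],
    )

# sketch_demo.launch()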
utils/notebook_utils.py CHANGED

@@ -12,7 +12,7 @@ def replace_wildcards(
             continue
         if "type" in tmp and tmp["type"] == "categoric" and not has_categoric_columns:
             continue
-        tmp_text = tmp["source"]
+        tmp_text = tmp["source"].strip()
         for wildcard, replacement in zip(wildcards, replacements):
            tmp_text = tmp_text.replace(wildcard, replacement)
         new_templates.append({"cell_type": tmp["cell_type"], "source": tmp_text})
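The only change here is that each template's source is stripped of leading and trailing whitespace before wildcard substitution, so generated cells no longer begin or end with blank lines. Below is a small self-contained example of that substitution step; the wildcard token and replacement value are hypothetical, since the real wildcards list is defined elsewhere in the repo.

# Hypothetical template, wildcard, and replacement for illustration only.
template = {
    "cell_type": "code",
    "source": "\nimport pandas as pd\n\ndf = pd.read_parquet('{first_code}')\n",
}
wildcards = ["{first_code}"]                                  # assumed placeholder token
replacements = ["hf://datasets/some/dataset/train.parquet"]   # assumed value

tmp_text = template["source"].strip()  # new: drops the surrounding blank lines
for wildcard, replacement in zip(wildcards, replacements):
    tmp_text = tmp_text.replace(wildcard, replacement)

print(tmp_text)
# import pandas as pd
#
# df = pd.read_parquet('hf://datasets/some/dataset/train.parquet')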