asoria HF staff committed on
Commit
4dc6cd8
·
1 Parent(s): e62a0e5

EDA template partially finished (numerical operations still need to be filtered)

Browse files
Files changed (2) hide show
  1. app.py +4 -6
  2. utils/notebook_utils.py +72 -18
app.py CHANGED
@@ -15,8 +15,8 @@ from dotenv import load_dotenv
15
  import os
16
 
17
  # TODOS:
 
18
  # 2. Add template for RAG and embeddings
19
- # 3. Improve templates
20
 
21
  load_dotenv()
22
 
@@ -112,9 +112,6 @@ def _push_to_hub(
112
  repo_id=NOTEBOOKS_REPOSITORY,
113
  repo_type="dataset",
114
  )
115
- link = f"https://huggingface.co/datasets/{NOTEBOOKS_REPOSITORY}/blob/main/{notebook_name}"
116
- logging.info(f"Notebook pushed to hub: {link}")
117
- return link
118
  except Exception as e:
119
  logging.info("Failed to push notebook", e)
120
  raise
@@ -165,7 +162,8 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
165
  break
166
  notebook_name = f"{dataset_id.replace('/', '-')}-{notebook_type}.ipynb"
167
  create_notebook_file(cells, notebook_name=notebook_name)
168
- notebook_link = _push_to_hub(dataset_id, notebook_name)
 
169
  yield generated_text, f"## Here you have the [generated notebook]({notebook_link})"
170
 
171
 
@@ -185,7 +183,7 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
185
  dataset_samples = gr.Examples(
186
  examples=[
187
  [
188
- "infinite-dataset-hub/WorldPopCounts",
189
  "Try this dataset for Exploratory Data Analysis",
190
  ],
191
  [
 
15
  import os
16
 
17
  # TODOS:
18
+ # 1. Add cells by data types in EDA notebook
19
  # 2. Add template for RAG and embeddings
 
20
 
21
  load_dotenv()
22
 
 
112
  repo_id=NOTEBOOKS_REPOSITORY,
113
  repo_type="dataset",
114
  )
 
 
 
115
  except Exception as e:
116
  logging.info("Failed to push notebook", e)
117
  raise
 
162
  break
163
  notebook_name = f"{dataset_id.replace('/', '-')}-{notebook_type}.ipynb"
164
  create_notebook_file(cells, notebook_name=notebook_name)
165
+ _push_to_hub(dataset_id, notebook_name)
166
+ notebook_link = f"https://colab.research.google.com/#fileId=https%3A//huggingface.co/datasets/asoria/dataset-notebook-creator-content/blob/main/{notebook_name}"
167
  yield generated_text, f"## Here you have the [generated notebook]({notebook_link})"
168
 
169
 
 
183
  dataset_samples = gr.Examples(
184
  examples=[
185
  [
186
+ "scikit-learn/iris",
187
  "Try this dataset for Exploratory Data Analysis",
188
  ],
189
  [
utils/notebook_utils.py CHANGED
@@ -33,15 +33,16 @@ embeggins_cells = [
33
  eda_cells = [
34
  {
35
  "cell_type": "markdown",
36
- "source": "# Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset",
37
- },
38
- {
39
- "cell_type": "code",
40
  "source": """
41
- from IPython.display import HTML
42
- display(HTML("{html_code}"))
 
43
  """,
44
  },
 
 
 
 
45
  {
46
  "cell_type": "code",
47
  "source": """
@@ -60,14 +61,18 @@ import seaborn as sns
60
  {
61
  "cell_type": "code",
62
  "source": """
63
- # 2. Load the dataset as a DataFrame using the provided code
64
  {first_code}
65
  """,
66
  },
 
 
 
 
67
  {
68
  "cell_type": "code",
69
  "source": """
70
- # 3. Understand the dataset structure
71
  print(df.head())
72
  print(df.info())
73
  print(df.describe())
@@ -76,40 +81,89 @@ print(df.describe())
76
  {
77
  "cell_type": "code",
78
  "source": """
79
- # 4. Check for missing values
80
  print(df.isnull().sum())
81
  """,
82
  },
83
  {
84
  "cell_type": "code",
85
  "source": """
86
- # 5. Identify data types of each column
87
  print(df.dtypes)
88
  """,
89
  },
90
  {
91
  "cell_type": "code",
92
  "source": """
93
- # 6. Detect duplicated rows
94
  print(df.duplicated().sum())
95
  """,
96
  },
97
  {
98
  "cell_type": "code",
99
  "source": """
100
- # 7. Generate descriptive statistics
101
  print(df.describe())
102
  """,
103
  },
104
  {
105
  "cell_type": "code",
106
  "source": """
107
- # 8. Visualize the distribution of each column.
108
- # TODO: Add code to visualize the distribution of each column.
109
- # 9. Explore relationships between columns.
110
- # TODO: Add code to explore relationships between columns.
111
- # 10. Perform correlation analysis.
112
- # TODO: Add code to perform correlation analysis.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  """,
114
  },
115
  ]
 
33
  eda_cells = [
34
  {
35
  "cell_type": "markdown",
 
 
 
 
36
  "source": """
37
+ ---
38
+ # **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**
39
+ ---
40
  """,
41
  },
42
+ {
43
+ "cell_type": "markdown",
44
+ "source": "## 1. Setup necessary libraries and load the dataset",
45
+ },
46
  {
47
  "cell_type": "code",
48
  "source": """
 
61
  {
62
  "cell_type": "code",
63
  "source": """
64
+ # 2. Load the dataset as a DataFrame
65
  {first_code}
66
  """,
67
  },
68
+ {
69
+ "cell_type": "markdown",
70
+ "source": "## 2. Understanding the Dataset",
71
+ },
72
  {
73
  "cell_type": "code",
74
  "source": """
75
+ # First rows of the dataset and info
76
  print(df.head())
77
  print(df.info())
78
  print(df.describe())
 
81
  {
82
  "cell_type": "code",
83
  "source": """
84
+ # Check for missing values
85
  print(df.isnull().sum())
86
  """,
87
  },
88
  {
89
  "cell_type": "code",
90
  "source": """
91
+ # Identify data types of each column
92
  print(df.dtypes)
93
  """,
94
  },
95
  {
96
  "cell_type": "code",
97
  "source": """
98
+ # Detect duplicated rows
99
  print(df.duplicated().sum())
100
  """,
101
  },
102
  {
103
  "cell_type": "code",
104
  "source": """
105
+ # Generate descriptive statistics
106
  print(df.describe())
107
  """,
108
  },
109
  {
110
  "cell_type": "code",
111
  "source": """
112
+ # Unique values in categorical columns
113
+ df.select_dtypes(include=['object']).nunique()
114
+ """,
115
+ },
116
+ {
117
+ "cell_type": "markdown",
118
+ "source": "## 3. Data Visualization",
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "source": """
123
+ # Correlation matrix for numerical columns
124
+ corr_matrix = df.corr(numeric_only=True)
125
+ plt.figure(figsize=(10, 8))
126
+ sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
127
+ plt.title('Correlation Matrix')
128
+ plt.show()
129
+ """,
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "source": """
134
+ # Distribution plots for numerical columns
135
+ for column in df.select_dtypes(include=['int64', 'float64']).columns:
136
+ plt.figure(figsize=(8, 4))
137
+ sns.histplot(df[column], kde=True)
138
+ plt.title(f'Distribution of {column}')
139
+ plt.xlabel(column)
140
+ plt.ylabel('Frequency')
141
+ plt.show()
142
+ """,
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "source": """
147
+ # Count plots for categorical columns
148
+ for column in df.select_dtypes(include=['object']).columns:
149
+ plt.figure(figsize=(8, 4))
150
+ sns.countplot(x=column, data=df)
151
+ plt.title(f'Count Plot of {column}')
152
+ plt.xlabel(column)
153
+ plt.ylabel('Count')
154
+ plt.show()
155
+ """,
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "source": """
160
+ # Box plots for detecting outliers in numerical columns
161
+ for column in df.select_dtypes(include=['int64', 'float64']).columns:
162
+ plt.figure(figsize=(8, 4))
163
+ sns.boxplot(df[column])
164
+ plt.title(f'Box Plot of {column}')
165
+ plt.xlabel(column)
166
+ plt.show()
167
  """,
168
  },
169
  ]