baconnier committed on
Commit
96823ba
1 Parent(s): f421e23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -108
app.py CHANGED
@@ -12,6 +12,9 @@ import shutil
12
  import warnings
13
  import io
14
  import base64
 
 
 
15
  warnings.filterwarnings('ignore')
16
 
17
  class DataAnalyzer:
@@ -19,7 +22,55 @@ class DataAnalyzer:
19
  self.temp_dir = tempfile.mkdtemp()
20
  self.df = None
21
  self.AV = AutoViz_Class()
 
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  def generate_sweetviz_report(self, df):
24
  if df is None:
25
  return "Please upload a dataset first"
@@ -47,71 +98,25 @@ class DataAnalyzer:
47
  os.remove(report_path)
48
  return html_with_table
49
 
50
- def preprocess_dataframe(self, df):
51
- df = df.copy()
52
-
53
- # Convert 'value' column to numeric if possible
54
- if 'value' in df.columns:
55
- df['value'] = pd.to_numeric(df['value'].replace('[\$,]', '', regex=True), errors='coerce')
56
-
57
- # Handle datetime columns
58
- for col in df.columns:
59
- if df[col].dtype == 'object':
60
- try:
61
- df[col] = pd.to_datetime(df[col], errors='ignore')
62
- except:
63
- pass
64
-
65
- # Convert categorical columns with low cardinality
66
- for col in df.select_dtypes(include=['object']).columns:
67
- if df[col].nunique() < 50:
68
- df[col] = df[col].astype('category')
69
-
70
- return df
71
-
72
  def generate_autoviz_report(self, df):
73
  if df is None:
74
  return "Please upload a dataset first"
75
 
76
- viz_temp_dir = os.path.join(self.temp_dir, "autoviz_output")
77
- if os.path.exists(viz_temp_dir):
78
- shutil.rmtree(viz_temp_dir)
79
- os.makedirs(viz_temp_dir)
80
-
81
  try:
82
  # Preprocess the dataframe
83
- df = self.preprocess_dataframe(df)
 
 
 
 
84
 
85
  # Sample if needed
86
  if len(df) > 5000:
87
  df = df.sample(n=5000, random_state=42)
88
 
89
- print("\nDataset Info:")
90
- print(df.info())
91
- print("\nColumn Types:")
92
- print(df.dtypes)
93
-
94
- plt.close('all')
95
-
96
- # Create a directory for plots
97
- plots_dir = os.path.join(viz_temp_dir, "plots")
98
- os.makedirs(plots_dir, exist_ok=True)
99
 
100
- # Run AutoViz
101
- dfte = self.AV.AutoViz(
102
- filename='',
103
- sep=',',
104
- depVar='value', # Set value as target variable
105
- dfte=df,
106
- header=0,
107
- verbose=1,
108
- lowess=False,
109
- chart_format='html',
110
- max_rows_analyzed=5000,
111
- max_cols_analyzed=30,
112
- save_plot_dir=plots_dir
113
- )
114
-
115
  # Generate summary statistics
116
  numeric_cols = df.select_dtypes(include=['number']).columns
117
  categorical_cols = df.select_dtypes(include=['category', 'object']).columns
@@ -138,20 +143,20 @@ class DataAnalyzer:
138
  .table th {
139
  background-color: #f8f9fa;
140
  }
141
- pre {
142
- background-color: #f8f9fa;
143
- padding: 1rem;
144
- border-radius: 4px;
145
- }
146
- .viz-container {
147
  margin: 20px 0;
148
- padding: 20px;
149
  border: 1px solid #ddd;
150
  border-radius: 5px;
151
  }
 
 
 
 
152
  </style>
153
  """
154
 
 
155
  html_content += f"""
156
  <div class="viz-container">
157
  <h2 style="text-align: center;">Data Analysis Report</h2>
@@ -170,26 +175,18 @@ class DataAnalyzer:
170
  <div style="overflow-x: auto;">
171
  {categorical_stats.to_html(classes='table table-striped')}
172
  </div>
173
-
174
- <h3>Column Types</h3>
175
- <pre>{df.dtypes.to_string()}</pre>
176
  </div>
177
  """
178
-
179
- # Add plots if they exist
180
- if os.path.exists(plots_dir):
181
- for file in sorted(os.listdir(plots_dir)):
182
- if file.endswith('.html'):
183
- with open(os.path.join(plots_dir, file), 'r', encoding='utf-8') as f:
184
- plot_content = f.read()
185
- if plot_content.strip():
186
- html_content += f"""
187
- <div class="viz-container">
188
- <h3>{file.replace('.html', '').replace('_', ' ').title()}</h3>
189
- {plot_content}
190
- </div>
191
- """
192
-
193
  html_content += "</div>"
194
  return html_content
195
 
@@ -197,24 +194,13 @@ class DataAnalyzer:
197
  import traceback
198
  error_message = f"""
199
  <div style="padding: 20px; border: 1px solid red; border-radius: 5px;">
200
- <h3>Error in AutoViz Analysis</h3>
201
  <p>Error details: {str(e)}</p>
202
  <p>Stack trace:</p>
203
  <pre>{traceback.format_exc()}</pre>
204
- <p>Dataset Info:</p>
205
- <pre>
206
- Rows: {len(df)}
207
- Columns: {len(df.columns)}
208
- Types:\n{df.dtypes.to_string()}
209
- </pre>
210
  </div>
211
  """
212
  return error_message
213
- finally:
214
- if os.path.exists(viz_temp_dir):
215
- shutil.rmtree(viz_temp_dir)
216
-
217
-
218
  def create_interface():
219
  analyzer = DataAnalyzer()
220
 
@@ -314,42 +300,41 @@ def create_interface():
314
  outputs=[sweetviz_output]
315
  )
316
 
317
- # Third Tab: AutoViz Analysis
318
- with gr.TabItem("AutoViz Analysis"):
319
  with gr.Row():
320
  with gr.Column(scale=2):
321
- autoviz_button = gr.Button(
322
- "Generate AutoViz Report",
323
  variant="primary"
324
  )
325
  with gr.Column(scale=1):
326
  gr.Markdown("""
327
- ### AutoViz Analysis Features
328
- - Automated visualization generation
329
- - Distribution analysis
330
- - Correlation plots
331
- - Feature relationships
332
- - Time series analysis (if applicable)
333
  """)
334
 
335
  with gr.Row():
336
- autoviz_output = gr.HTML(
337
- label="AutoViz Report",
338
- value="Click the button above to generate the report"
339
  )
340
 
341
- def generate_autoviz(df):
342
  if df is None:
343
  return "Please upload a dataset first"
344
  try:
345
  return analyzer.generate_autoviz_report(df)
346
  except Exception as e:
347
- return f"Error generating AutoViz report: {str(e)}"
348
 
349
- autoviz_button.click(
350
- fn=generate_autoviz,
351
  inputs=[current_df],
352
- outputs=[autoviz_output]
353
  )
354
 
355
  return demo
 
12
  import warnings
13
  import io
14
  import base64
15
+ from pathlib import Path
16
+ import matplotlib
17
+ matplotlib.use('Agg')
18
  warnings.filterwarnings('ignore')
19
 
20
  class DataAnalyzer:
 
22
  self.temp_dir = tempfile.mkdtemp()
23
  self.df = None
24
  self.AV = AutoViz_Class()
25
+ self.plots_memory = {} # Store plots in memory
26
 
27
def save_plot_to_memory(self, fig, plot_name):
    """Render a matplotlib figure to PNG and cache it as a base64 data URI.

    The encoded image is stored in ``self.plots_memory[plot_name]`` in the
    form ``data:image/png;base64,...`` so it can be dropped straight into an
    ``<img src="...">`` attribute without writing anything to disk.

    Args:
        fig: Figure to render. It is closed before this method returns,
            whether or not rendering succeeded.
        plot_name: Key under which the data URI is stored. An existing entry
            with the same key is overwritten.

    Raises:
        Whatever ``fig.savefig`` raises; nothing is stored in that case.
    """
    try:
        # getvalue() returns the full buffer regardless of the stream
        # position, so no seek(0) is needed before encoding.
        with io.BytesIO() as buf:
            fig.savefig(buf, format='png', bbox_inches='tight')
            img_str = base64.b64encode(buf.getvalue()).decode('ascii')
        self.plots_memory[plot_name] = f'data:image/png;base64,{img_str}'
    finally:
        # Close on the failure path too: a figure created through pyplot
        # stays referenced by pyplot until it is explicitly closed.
        plt.close(fig)
35
+
36
def generate_basic_plots(self, df, bins=30, max_categories=20):
    """Build the basic matplotlib plots for ``df`` and cache them in memory.

    Every plot is handed to ``self.save_plot_to_memory``, which stores it in
    ``self.plots_memory`` under one of these keys:

    * ``dist_<col>`` / ``box_<col>`` - histogram and box plot for each
      numeric column.
    * ``cat_<col>`` - bar chart of value counts for each ``category`` /
      ``object`` column that has between 1 and ``max_categories - 1``
      distinct non-missing values.
    * ``correlation_matrix`` - heat map of ``df[numeric_cols].corr()``,
      produced only when there are at least two numeric columns.

    Args:
        df: DataFrame to plot.
        bins: Number of histogram bins for numeric columns.
        max_categories: Exclusive upper bound on distinct values for a
            categorical column to get a bar chart.
    """
    # Start from a clean slate: without this, plots generated for a
    # previously analysed dataset would still be in the cache and would be
    # rendered alongside the new ones.
    self.plots_memory.clear()

    # Numeric columns: distribution and box plot
    numeric_cols = df.select_dtypes(include=['number']).columns
    for col in numeric_cols:
        fig, ax = plt.subplots(figsize=(10, 6))
        df[col].hist(bins=bins, ax=ax)
        ax.set_title(f'Distribution of {col}')
        self.save_plot_to_memory(fig, f'dist_{col}')

        fig, ax = plt.subplots(figsize=(10, 6))
        df.boxplot(column=col, ax=ax)
        ax.set_title(f'Box Plot of {col}')
        self.save_plot_to_memory(fig, f'box_{col}')

    # Categorical columns: bar chart of value counts
    categorical_cols = df.select_dtypes(include=['category', 'object']).columns
    for col in categorical_cols:
        n_unique = df[col].nunique()
        # Skip columns with too many categories to be readable, and columns
        # with no non-missing values at all: there is nothing to draw, and a
        # bar plot of an empty Series is expected to fail inside pandas.
        if not 0 < n_unique < max_categories:
            continue
        fig, ax = plt.subplots(figsize=(12, 6))
        df[col].value_counts().plot(kind='bar', ax=ax)
        ax.set_title(f'Distribution of {col}')
        # Rotate via the Axes rather than plt.xticks so the call does not
        # depend on pyplot's global "current axes".
        ax.tick_params(axis='x', labelrotation=45)
        self.save_plot_to_memory(fig, f'cat_{col}')

    # Correlation matrix for numeric columns
    if len(numeric_cols) > 1:
        fig, ax = plt.subplots(figsize=(10, 8))
        correlation_matrix = df[numeric_cols].corr()
        im = ax.imshow(correlation_matrix)
        tick_positions = range(len(numeric_cols))
        ax.set_xticks(tick_positions)
        ax.set_yticks(tick_positions)
        ax.set_xticklabels(numeric_cols, rotation=45)
        ax.set_yticklabels(numeric_cols)
        # Attach the colorbar to this figure explicitly instead of relying
        # on pyplot's current figure.
        fig.colorbar(im, ax=ax)
        ax.set_title('Correlation Matrix')
        self.save_plot_to_memory(fig, 'correlation_matrix')
74
  def generate_sweetviz_report(self, df):
75
  if df is None:
76
  return "Please upload a dataset first"
 
98
  os.remove(report_path)
99
  return html_with_table
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  def generate_autoviz_report(self, df):
102
  if df is None:
103
  return "Please upload a dataset first"
104
 
 
 
 
 
 
105
  try:
106
  # Preprocess the dataframe
107
+ df = df.copy()
108
+
109
+ # Convert 'value' column to numeric if possible
110
+ if 'value' in df.columns:
111
+ df['value'] = pd.to_numeric(df['value'].replace('[\$,]', '', regex=True), errors='coerce')
112
 
113
  # Sample if needed
114
  if len(df) > 5000:
115
  df = df.sample(n=5000, random_state=42)
116
 
117
+ # Generate basic plots
118
+ self.generate_basic_plots(df)
 
 
 
 
 
 
 
 
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  # Generate summary statistics
121
  numeric_cols = df.select_dtypes(include=['number']).columns
122
  categorical_cols = df.select_dtypes(include=['category', 'object']).columns
 
143
  .table th {
144
  background-color: #f8f9fa;
145
  }
146
+ .plot-container {
 
 
 
 
 
147
  margin: 20px 0;
148
+ padding: 10px;
149
  border: 1px solid #ddd;
150
  border-radius: 5px;
151
  }
152
+ .plot-container img {
153
+ max-width: 100%;
154
+ height: auto;
155
+ }
156
  </style>
157
  """
158
 
159
+ # Add summary statistics
160
  html_content += f"""
161
  <div class="viz-container">
162
  <h2 style="text-align: center;">Data Analysis Report</h2>
 
175
  <div style="overflow-x: auto;">
176
  {categorical_stats.to_html(classes='table table-striped')}
177
  </div>
 
 
 
178
  </div>
179
  """
180
+
181
+ # Add plots from memory
182
+ for plot_name, plot_data in self.plots_memory.items():
183
+ html_content += f"""
184
+ <div class="plot-container">
185
+ <h3>{plot_name.replace('_', ' ').title()}</h3>
186
+ <img src="{plot_data}" alt="{plot_name}">
187
+ </div>
188
+ """
189
+
 
 
 
 
 
190
  html_content += "</div>"
191
  return html_content
192
 
 
194
  import traceback
195
  error_message = f"""
196
  <div style="padding: 20px; border: 1px solid red; border-radius: 5px;">
197
+ <h3>Error in Analysis</h3>
198
  <p>Error details: {str(e)}</p>
199
  <p>Stack trace:</p>
200
  <pre>{traceback.format_exc()}</pre>
 
 
 
 
 
 
201
  </div>
202
  """
203
  return error_message
 
 
 
 
 
204
  def create_interface():
205
  analyzer = DataAnalyzer()
206
 
 
300
  outputs=[sweetviz_output]
301
  )
302
 
303
+ # Third Tab: Visual Analysis
304
+ with gr.TabItem("Visual Analysis"):
305
  with gr.Row():
306
  with gr.Column(scale=2):
307
+ viz_button = gr.Button(
308
+ "Generate Visualizations",
309
  variant="primary"
310
  )
311
  with gr.Column(scale=1):
312
  gr.Markdown("""
313
+ ### Visualization Features
314
+ - Distribution plots
315
+ - Correlation analysis
316
+ - Categorical variable analysis
317
+ - Statistical summaries
 
318
  """)
319
 
320
  with gr.Row():
321
+ viz_output = gr.HTML(
322
+ label="Visualization Report",
323
+ value="Click the button above to generate visualizations"
324
  )
325
 
326
+ def generate_viz(df):
327
  if df is None:
328
  return "Please upload a dataset first"
329
  try:
330
  return analyzer.generate_autoviz_report(df)
331
  except Exception as e:
332
+ return f"Error generating visualizations: {str(e)}"
333
 
334
+ viz_button.click(
335
+ fn=generate_viz,
336
  inputs=[current_df],
337
+ outputs=[viz_output]
338
  )
339
 
340
  return demo