baconnier committed on
Commit
947739b
·
verified ·
1 Parent(s): 5bda113

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -20
app.py CHANGED
@@ -7,18 +7,17 @@ import category_encoders as ce
7
  import umap
8
  import matplotlib.pyplot as plt
9
  from sklearn.preprocessing import StandardScaler
10
- import seaborn as sns
11
- import numpy as np
12
- import io
13
- import base64
14
 
15
  class DataAnalyzer:
16
  def __init__(self):
17
  self.temp_dir = tempfile.mkdtemp()
18
  self.df = None
 
19
 
20
  def generate_sweetviz_report(self, df):
21
- self.df = df # Store DataFrame for other analyses
22
  report = sv.analyze(df)
23
  report_path = os.path.join(self.temp_dir, "report.html")
24
  report.show_html(report_path, open_browser=False)
@@ -41,14 +40,64 @@ class DataAnalyzer:
41
  os.remove(report_path)
42
  return html_with_table
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  def encode_and_visualize(self, column_name, encoder_type='binary'):
45
  if self.df is None or column_name not in self.df.columns:
46
  return None
47
 
48
- # Create DataFrame with only the selected column
49
  df_subset = self.df[[column_name]].copy()
50
 
51
- # Select encoder
52
  encoders = {
53
  'binary': ce.BinaryEncoder(),
54
  'onehot': ce.OneHotEncoder(),
@@ -57,15 +106,11 @@ class DataAnalyzer:
57
  }
58
 
59
  encoder = encoders.get(encoder_type)
60
-
61
- # Encode data
62
  encoded_df = encoder.fit_transform(df_subset)
63
 
64
- # Scale the encoded features
65
  scaler = StandardScaler()
66
  scaled_data = scaler.fit_transform(encoded_df)
67
 
68
- # Apply UMAP
69
  reducer = umap.UMAP(
70
  n_neighbors=15,
71
  min_dist=0.1,
@@ -75,7 +120,6 @@ class DataAnalyzer:
75
 
76
  embedding = reducer.fit_transform(scaled_data)
77
 
78
- # Create visualization
79
  plt.figure(figsize=(10, 6))
80
  scatter = plt.scatter(
81
  embedding[:, 0],
@@ -90,7 +134,6 @@ class DataAnalyzer:
90
  plt.xlabel('UMAP1')
91
  plt.ylabel('UMAP2')
92
 
93
- # Save plot to bytes
94
  buf = io.BytesIO()
95
  plt.savefig(buf, format='png', bbox_inches='tight')
96
  plt.close()
@@ -109,6 +152,9 @@ def create_interface():
109
  file_input = gr.File(label="Upload CSV")
110
  report_html = gr.HTML()
111
 
 
 
 
112
  with gr.TabItem("Categorical Analysis"):
113
  with gr.Row():
114
  column_dropdown = gr.Dropdown(
@@ -126,29 +172,36 @@ def create_interface():
126
 
127
  def process_file(file):
128
  if file is None:
129
- return None, gr.Dropdown(choices=[])
130
 
131
  try:
132
  df = pd.read_csv(file.name)
133
- # Get categorical columns
134
  cat_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
135
- return analyzer.generate_sweetviz_report(df), gr.Dropdown(choices=cat_columns)
 
 
 
 
 
 
 
 
 
136
  except Exception as e:
137
- return f"Error generating report: {str(e)}", gr.Dropdown(choices=[])
138
 
139
  def update_plot(column, encoder_type):
140
  if column is None:
141
  return None
142
  try:
143
- plot_bytes = analyzer.encode_and_visualize(column, encoder_type)
144
- return plot_bytes
145
  except Exception as e:
146
  return None
147
 
148
  file_input.change(
149
  fn=process_file,
150
  inputs=[file_input],
151
- outputs=[report_html, column_dropdown]
152
  )
153
 
154
  column_dropdown.change(
 
7
  import umap
8
  import matplotlib.pyplot as plt
9
  from sklearn.preprocessing import StandardScaler
10
+ from autoviz.AutoViz_Class import AutoViz_Class
11
+ import shutil
 
 
12
 
13
  class DataAnalyzer:
14
  def __init__(self):
15
  self.temp_dir = tempfile.mkdtemp()
16
  self.df = None
17
+ self.AV = AutoViz_Class()
18
 
19
  def generate_sweetviz_report(self, df):
20
+ self.df = df
21
  report = sv.analyze(df)
22
  report_path = os.path.join(self.temp_dir, "report.html")
23
  report.show_html(report_path, open_browser=False)
 
40
  os.remove(report_path)
41
  return html_with_table
42
 
43
def generate_autoviz_report(self, df):
    """Generate an AutoViz report for ``df`` and return it as one HTML string.

    AutoViz is asked to save its charts as HTML files under a scratch
    directory inside ``self.temp_dir``. Every ``.html`` file found there is
    concatenated (each followed by a horizontal-rule separator) and wrapped
    in a fixed-height, scrollable table cell.

    Args:
        df: The DataFrame handed to AutoViz through its ``dfte`` argument.

    Returns:
        The combined HTML on success, or a string starting with
        ``"Error generating AutoViz report: "`` when AutoViz raises or
        produces no HTML charts.
    """
    viz_temp_dir = os.path.join(self.temp_dir, "autoviz")
    # Start from an empty directory so charts left over from a previous
    # run can never end up in this report.
    if os.path.exists(viz_temp_dir):
        shutil.rmtree(viz_temp_dir)
    os.makedirs(viz_temp_dir)

    try:
        # The returned DataFrame is not needed; only the saved charts are.
        self.AV.AutoViz(
            filename='',
            sep=',',
            depVar='',
            dfte=df,
            header=0,
            verbose=0,
            lowess=False,
            chart_format='html',
            max_rows_analyzed=150000,
            save_plot_dir=viz_temp_dir
        )

        # NOTE(review): AutoViz appears to save charts inside a subfolder
        # of save_plot_dir rather than directly in it - confirm against
        # the installed version. Walking recursively handles both layouts.
        chart_paths = sorted(
            os.path.join(root, name)
            for root, _dirs, names in os.walk(viz_temp_dir)
            for name in names
            if name.endswith('.html')
        )
        if not chart_paths:
            return "Error generating AutoViz report: no HTML charts were produced"

        sections = []
        for chart_path in chart_paths:
            with open(chart_path, 'r', encoding='utf-8') as f:
                sections.append(f.read() + "<br><hr><br>")
        html_content = "".join(sections)

        # Wrap the content in a scrollable div
        html_with_table = f"""
        <table width="100%" style="border-collapse: collapse;">
        <tr>
        <td style="padding: 20px; border: 1px solid #ddd;">
        <div style="height: 800px; overflow: auto;">
        {html_content}
        </div>
        </td>
        </tr>
        </table>
        """

        return html_with_table

    except Exception as e:
        return f"Error generating AutoViz report: {str(e)}"
    finally:
        # Best-effort cleanup: a failure to delete scratch files must not
        # replace the report (or error message) being returned.
        shutil.rmtree(viz_temp_dir, ignore_errors=True)
94
+
95
  def encode_and_visualize(self, column_name, encoder_type='binary'):
96
  if self.df is None or column_name not in self.df.columns:
97
  return None
98
 
 
99
  df_subset = self.df[[column_name]].copy()
100
 
 
101
  encoders = {
102
  'binary': ce.BinaryEncoder(),
103
  'onehot': ce.OneHotEncoder(),
 
106
  }
107
 
108
  encoder = encoders.get(encoder_type)
 
 
109
  encoded_df = encoder.fit_transform(df_subset)
110
 
 
111
  scaler = StandardScaler()
112
  scaled_data = scaler.fit_transform(encoded_df)
113
 
 
114
  reducer = umap.UMAP(
115
  n_neighbors=15,
116
  min_dist=0.1,
 
120
 
121
  embedding = reducer.fit_transform(scaled_data)
122
 
 
123
  plt.figure(figsize=(10, 6))
124
  scatter = plt.scatter(
125
  embedding[:, 0],
 
134
  plt.xlabel('UMAP1')
135
  plt.ylabel('UMAP2')
136
 
 
137
  buf = io.BytesIO()
138
  plt.savefig(buf, format='png', bbox_inches='tight')
139
  plt.close()
 
152
  file_input = gr.File(label="Upload CSV")
153
  report_html = gr.HTML()
154
 
155
+ with gr.TabItem("AutoViz Analysis"):
156
+ autoviz_html = gr.HTML()
157
+
158
  with gr.TabItem("Categorical Analysis"):
159
  with gr.Row():
160
  column_dropdown = gr.Dropdown(
 
172
 
173
  def process_file(file):
174
  if file is None:
175
+ return None, None, gr.Dropdown(choices=[])
176
 
177
  try:
178
  df = pd.read_csv(file.name)
 
179
  cat_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
180
+
181
+ # Generate both reports
182
+ sweetviz_report = analyzer.generate_sweetviz_report(df)
183
+ autoviz_report = analyzer.generate_autoviz_report(df)
184
+
185
+ return (
186
+ sweetviz_report,
187
+ autoviz_report,
188
+ gr.Dropdown(choices=cat_columns)
189
+ )
190
  except Exception as e:
191
+ return f"Error: {str(e)}", None, gr.Dropdown(choices=[])
192
 
193
  def update_plot(column, encoder_type):
194
  if column is None:
195
  return None
196
  try:
197
+ return analyzer.encode_and_visualize(column, encoder_type)
 
198
  except Exception as e:
199
  return None
200
 
201
  file_input.change(
202
  fn=process_file,
203
  inputs=[file_input],
204
+ outputs=[report_html, autoviz_html, column_dropdown]
205
  )
206
 
207
  column_dropdown.change(