baconnier committed on
Commit
8c15039
·
verified ·
1 Parent(s): b2f41cc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -45
app.py CHANGED
@@ -45,6 +45,40 @@ class DataAnalyzer:
45
  os.remove(report_path)
46
  return html_with_table
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def generate_autoviz_report(self, df):
49
  if df is None:
50
  return "Please upload a dataset first"
@@ -55,40 +89,13 @@ class DataAnalyzer:
55
  os.makedirs(viz_temp_dir)
56
 
57
  try:
58
- # Data preprocessing
59
- df = df.copy()
60
-
61
- # Handle datetime columns
62
- for col in df.columns:
63
- try:
64
- df[col] = pd.to_datetime(df[col], errors='ignore')
65
- except:
66
- pass
67
-
68
- datetime_columns = df.select_dtypes(include=['datetime64']).columns
69
- for col in datetime_columns:
70
- df[f'{col}_year'] = df[col].dt.year
71
- df[f'{col}_month'] = df[col].dt.month
72
- df = df.drop(columns=[col])
73
-
74
- # Try to convert string columns to numeric where possible
75
- for col in df.select_dtypes(include=['object']).columns:
76
- try:
77
- df[col] = pd.to_numeric(df[col], errors='ignore')
78
- except:
79
- pass
80
-
81
- # Convert remaining string columns to categorical if cardinality is low
82
- object_columns = df.select_dtypes(include=['object']).columns
83
- for col in object_columns:
84
- if df[col].nunique() < 50:
85
- df[col] = df[col].astype('category')
86
 
87
- # Sample data if needed
88
  if len(df) > 5000:
89
  df = df.sample(n=5000, random_state=42)
90
 
91
- # Print data info for debugging
92
  print("\nDataset Info:")
93
  print(df.info())
94
  print("\nColumn Types:")
@@ -96,16 +103,16 @@ class DataAnalyzer:
96
 
97
  plt.close('all')
98
 
99
- # Run AutoViz
100
  dfte = self.AV.AutoViz(
101
  filename='',
102
  sep=',',
103
- depVar='',
104
  dfte=df,
105
  header=0,
106
  verbose=1,
107
  lowess=False,
108
- chart_format='svg',
109
  max_rows_analyzed=5000,
110
  max_cols_analyzed=30,
111
  save_plot_dir=viz_temp_dir
@@ -115,7 +122,7 @@ class DataAnalyzer:
115
  html_parts = []
116
  if os.path.exists(viz_temp_dir):
117
  for file in sorted(os.listdir(viz_temp_dir)):
118
- if file.endswith('.html') or file.endswith('.svg'):
119
  file_path = os.path.join(viz_temp_dir, file)
120
  try:
121
  with open(file_path, 'r', encoding='utf-8') as f:
@@ -125,22 +132,23 @@ class DataAnalyzer:
125
  except Exception as e:
126
  print(f"Error reading file {file}: {str(e)}")
127
 
 
 
 
 
128
  if not html_parts:
129
  return f"""
130
  <div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
131
  <h3>Data Summary</h3>
132
  <p>Total Rows: {len(df)}</p>
133
  <p>Total Columns: {len(df.columns)}</p>
134
- <p>Column Types:</p>
135
- <pre>{df.dtypes.to_string()}</pre>
 
 
136
  <hr>
137
- <h3>No visualizations were generated</h3>
138
- <p>This might be due to:</p>
139
- <ul>
140
- <li>All columns being categorical with high cardinality</li>
141
- <li>No numeric columns for analysis</li>
142
- <li>Data format not suitable for visualization</li>
143
- </ul>
144
  </div>
145
  """
146
 
@@ -151,8 +159,10 @@ class DataAnalyzer:
151
  <h3>Dataset Summary</h3>
152
  <p>Rows analyzed: {len(df)}</p>
153
  <p>Columns: {len(df.columns)}</p>
154
- <p>Column Types:</p>
155
- <pre>{df.dtypes.to_string()}</pre>
 
 
156
  </div>
157
  <hr>
158
  {'<hr>'.join(html_parts)}
@@ -197,6 +207,13 @@ def create_interface():
197
  with gr.Row():
198
  file_input = gr.File(label="Upload CSV")
199
  data_preview = gr.Dataframe(label="Data Preview", interactive=False)
 
 
 
 
 
 
 
200
 
201
  def load_data(file):
202
  if file is None:
@@ -218,6 +235,14 @@ def create_interface():
218
  with gr.Row():
219
  sweetviz_button = gr.Button("Generate Sweetviz Report")
220
  sweetviz_output = gr.HTML(label="Sweetviz Report")
 
 
 
 
 
 
 
 
221
 
222
  def generate_sweetviz(df):
223
  if df is None:
@@ -235,6 +260,14 @@ def create_interface():
235
  with gr.Row():
236
  autoviz_button = gr.Button("Generate AutoViz Report")
237
  autoviz_output = gr.HTML(label="AutoViz Report")
 
 
 
 
 
 
 
 
238
 
239
  def generate_autoviz(df):
240
  if df is None:
 
45
  os.remove(report_path)
46
  return html_with_table
47
 
48
def preprocess_dataframe(self, df, *, value_column='value',
                         max_category_cardinality=50):
    """Return a cleaned copy of ``df`` prepared for visualization.

    The input frame is never modified. The following steps run in order:

    1. If ``value_column`` exists, currency symbols (``$``) and thousands
       separators (``,``) are stripped from it and the column is converted
       to numeric; entries that still cannot be parsed become ``NaN``.
    2. Every object-dtype column whose values all parse as dates is
       converted to datetime.
    3. Every datetime column is replaced by two integer columns,
       ``<col>_year`` and ``<col>_month``.
    4. Remaining object-dtype columns with fewer than
       ``max_category_cardinality`` distinct values become ``category``.

    Args:
        df: The pandas DataFrame to preprocess.
        value_column: Name of the column holding currency-like amounts.
        max_category_cardinality: Exclusive upper bound on the number of
            distinct values for an object column to be made categorical.

    Returns:
        A new, preprocessed DataFrame.
    """
    df = df.copy()

    # Only touch the amount column when it is actually present, rather than
    # relying on a swallowed KeyError.
    if value_column in df.columns:
        try:
            # Raw string: "\$" is not a valid escape in a normal literal.
            cleaned = df[value_column].replace(r'[\$,]', '', regex=True)
            df[value_column] = pd.to_numeric(cleaned, errors='coerce')
        except (TypeError, ValueError) as exc:
            # Best effort: keep the column unchanged but say why.
            print(f"Could not convert column {value_column!r} to numeric: {exc}")

    # Convert object columns to datetime only when every value parses.
    # (errors='ignore' is deprecated in pandas; catching the parse error
    # gives the same "leave the column alone" outcome.)
    for col in df.columns:
        if not pd.api.types.is_object_dtype(df[col]):
            continue
        try:
            df[col] = pd.to_datetime(df[col])
        except (TypeError, ValueError, OverflowError):
            continue

    # Includes timezone-aware columns, which a plain 'datetime64' selection
    # would leave behind.
    datetime_columns = [
        col for col in df.columns
        if pd.api.types.is_datetime64_any_dtype(df[col])
    ]
    for col in datetime_columns:
        df[f'{col}_year'] = df[col].dt.year
        df[f'{col}_month'] = df[col].dt.month
    if datetime_columns:
        df = df.drop(columns=datetime_columns)

    # Low-cardinality text columns are treated as categorical.
    for col in df.select_dtypes(include=['object']).columns:
        if df[col].nunique() < max_category_cardinality:
            df[col] = df[col].astype('category')

    return df
81
+
82
  def generate_autoviz_report(self, df):
83
  if df is None:
84
  return "Please upload a dataset first"
 
89
  os.makedirs(viz_temp_dir)
90
 
91
  try:
92
+ # Preprocess the dataframe
93
+ df = self.preprocess_dataframe(df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
+ # Sample if needed
96
  if len(df) > 5000:
97
  df = df.sample(n=5000, random_state=42)
98
 
 
99
  print("\nDataset Info:")
100
  print(df.info())
101
  print("\nColumn Types:")
 
103
 
104
  plt.close('all')
105
 
106
+ # Run AutoViz with modified settings
107
  dfte = self.AV.AutoViz(
108
  filename='',
109
  sep=',',
110
+ depVar='value', # Set value as target variable
111
  dfte=df,
112
  header=0,
113
  verbose=1,
114
  lowess=False,
115
+ chart_format='html', # Changed back to html
116
  max_rows_analyzed=5000,
117
  max_cols_analyzed=30,
118
  save_plot_dir=viz_temp_dir
 
122
  html_parts = []
123
  if os.path.exists(viz_temp_dir):
124
  for file in sorted(os.listdir(viz_temp_dir)):
125
+ if file.endswith('.html'):
126
  file_path = os.path.join(viz_temp_dir, file)
127
  try:
128
  with open(file_path, 'r', encoding='utf-8') as f:
 
132
  except Exception as e:
133
  print(f"Error reading file {file}: {str(e)}")
134
 
135
+ # Generate summary statistics
136
+ numeric_summary = df.describe().to_html() if not df.select_dtypes(include=['number']).empty else ""
137
+ categorical_summary = df.describe(include=['category', 'object']).to_html() if not df.select_dtypes(include=['category', 'object']).empty else ""
138
+
139
  if not html_parts:
140
  return f"""
141
  <div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
142
  <h3>Data Summary</h3>
143
  <p>Total Rows: {len(df)}</p>
144
  <p>Total Columns: {len(df.columns)}</p>
145
+ <h4>Numeric Summary:</h4>
146
+ {numeric_summary}
147
+ <h4>Categorical Summary:</h4>
148
+ {categorical_summary}
149
  <hr>
150
+ <h3>Column Types:</h3>
151
+ <pre>{df.dtypes.to_string()}</pre>
 
 
 
 
 
152
  </div>
153
  """
154
 
 
159
  <h3>Dataset Summary</h3>
160
  <p>Rows analyzed: {len(df)}</p>
161
  <p>Columns: {len(df.columns)}</p>
162
+ <h4>Numeric Summary:</h4>
163
+ {numeric_summary}
164
+ <h4>Categorical Summary:</h4>
165
+ {categorical_summary}
166
  </div>
167
  <hr>
168
  {'<hr>'.join(html_parts)}
 
207
  with gr.Row():
208
  file_input = gr.File(label="Upload CSV")
209
  data_preview = gr.Dataframe(label="Data Preview", interactive=False)
210
+ with gr.Row():
211
+ gr.Markdown("""
212
+ ### Data Preview Info
213
+ - Upload a CSV file to begin analysis
214
+ - First few rows will be shown here
215
+ - Data types and basic statistics will be displayed
216
+ """)
217
 
218
  def load_data(file):
219
  if file is None:
 
235
  with gr.Row():
236
  sweetviz_button = gr.Button("Generate Sweetviz Report")
237
  sweetviz_output = gr.HTML(label="Sweetviz Report")
238
+ with gr.Row():
239
+ gr.Markdown("""
240
+ ### Sweetviz Analysis Info
241
+ - Comprehensive data profiling
242
+ - Statistical analysis
243
+ - Feature correlations
244
+ - Missing value analysis
245
+ """)
246
 
247
  def generate_sweetviz(df):
248
  if df is None:
 
260
  with gr.Row():
261
  autoviz_button = gr.Button("Generate AutoViz Report")
262
  autoviz_output = gr.HTML(label="AutoViz Report")
263
+ with gr.Row():
264
+ gr.Markdown("""
265
+ ### AutoViz Analysis Info
266
+ - Automated visualization generation
267
+ - Distribution analysis
268
+ - Correlation plots
269
+ - Feature relationships
270
+ """)
271
 
272
  def generate_autoviz(df):
273
  if df is None: