Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -12,6 +12,9 @@ import shutil
|
|
12 |
import warnings
|
13 |
import io
|
14 |
import base64
|
|
|
|
|
|
|
15 |
warnings.filterwarnings('ignore')
|
16 |
|
17 |
class DataAnalyzer:
|
@@ -19,7 +22,55 @@ class DataAnalyzer:
|
|
19 |
self.temp_dir = tempfile.mkdtemp()
|
20 |
self.df = None
|
21 |
self.AV = AutoViz_Class()
|
|
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
def generate_sweetviz_report(self, df):
|
24 |
if df is None:
|
25 |
return "Please upload a dataset first"
|
@@ -47,71 +98,25 @@ class DataAnalyzer:
|
|
47 |
os.remove(report_path)
|
48 |
return html_with_table
|
49 |
|
50 |
-
def preprocess_dataframe(self, df):
    """Return a cleaned copy of ``df`` with friendlier column dtypes.

    Steps, in order:
      1. A column named ``'value'`` has ``$`` and ``,`` stripped and is
         coerced to numeric (unparseable entries become NaN).
      2. Every object-dtype column that parses as datetimes is converted;
         columns that do not parse are left untouched.
      3. Remaining object columns with fewer than 50 distinct values are
         converted to ``category``.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame with the conversions applied.
    """
    df = df.copy()

    # Convert 'value' column to numeric if possible
    if 'value' in df.columns:
        # Raw string: '\$' in a plain literal is an invalid escape sequence.
        stripped = df['value'].replace(r'[\$,]', '', regex=True)
        df['value'] = pd.to_numeric(stripped, errors='coerce')

    # Handle datetime columns
    for col in df.columns:
        if df[col].dtype != 'object':
            continue
        try:
            # errors='ignore' is deprecated in pandas; parse strictly and
            # keep the original column when parsing fails instead.
            df[col] = pd.to_datetime(df[col])
        except (ValueError, TypeError, OverflowError):
            continue

    # Convert categorical columns with low cardinality
    for col in df.select_dtypes(include=['object']).columns:
        if df[col].nunique() < 50:
            df[col] = df[col].astype('category')

    return df
|
71 |
-
|
72 |
def generate_autoviz_report(self, df):
|
73 |
if df is None:
|
74 |
return "Please upload a dataset first"
|
75 |
|
76 |
-
viz_temp_dir = os.path.join(self.temp_dir, "autoviz_output")
|
77 |
-
if os.path.exists(viz_temp_dir):
|
78 |
-
shutil.rmtree(viz_temp_dir)
|
79 |
-
os.makedirs(viz_temp_dir)
|
80 |
-
|
81 |
try:
|
82 |
# Preprocess the dataframe
|
83 |
-
df =
|
|
|
|
|
|
|
|
|
84 |
|
85 |
# Sample if needed
|
86 |
if len(df) > 5000:
|
87 |
df = df.sample(n=5000, random_state=42)
|
88 |
|
89 |
-
|
90 |
-
|
91 |
-
print("\nColumn Types:")
|
92 |
-
print(df.dtypes)
|
93 |
-
|
94 |
-
plt.close('all')
|
95 |
-
|
96 |
-
# Create a directory for plots
|
97 |
-
plots_dir = os.path.join(viz_temp_dir, "plots")
|
98 |
-
os.makedirs(plots_dir, exist_ok=True)
|
99 |
|
100 |
-
# Run AutoViz
|
101 |
-
dfte = self.AV.AutoViz(
|
102 |
-
filename='',
|
103 |
-
sep=',',
|
104 |
-
depVar='value', # Set value as target variable
|
105 |
-
dfte=df,
|
106 |
-
header=0,
|
107 |
-
verbose=1,
|
108 |
-
lowess=False,
|
109 |
-
chart_format='html',
|
110 |
-
max_rows_analyzed=5000,
|
111 |
-
max_cols_analyzed=30,
|
112 |
-
save_plot_dir=plots_dir
|
113 |
-
)
|
114 |
-
|
115 |
# Generate summary statistics
|
116 |
numeric_cols = df.select_dtypes(include=['number']).columns
|
117 |
categorical_cols = df.select_dtypes(include=['category', 'object']).columns
|
@@ -138,20 +143,20 @@ class DataAnalyzer:
|
|
138 |
.table th {
|
139 |
background-color: #f8f9fa;
|
140 |
}
|
141 |
-
|
142 |
-
background-color: #f8f9fa;
|
143 |
-
padding: 1rem;
|
144 |
-
border-radius: 4px;
|
145 |
-
}
|
146 |
-
.viz-container {
|
147 |
margin: 20px 0;
|
148 |
-
padding:
|
149 |
border: 1px solid #ddd;
|
150 |
border-radius: 5px;
|
151 |
}
|
|
|
|
|
|
|
|
|
152 |
</style>
|
153 |
"""
|
154 |
|
|
|
155 |
html_content += f"""
|
156 |
<div class="viz-container">
|
157 |
<h2 style="text-align: center;">Data Analysis Report</h2>
|
@@ -170,26 +175,18 @@ class DataAnalyzer:
|
|
170 |
<div style="overflow-x: auto;">
|
171 |
{categorical_stats.to_html(classes='table table-striped')}
|
172 |
</div>
|
173 |
-
|
174 |
-
<h3>Column Types</h3>
|
175 |
-
<pre>{df.dtypes.to_string()}</pre>
|
176 |
</div>
|
177 |
"""
|
178 |
-
|
179 |
-
# Add plots
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
<h3>{file.replace('.html', '').replace('_', ' ').title()}</h3>
|
189 |
-
{plot_content}
|
190 |
-
</div>
|
191 |
-
"""
|
192 |
-
|
193 |
html_content += "</div>"
|
194 |
return html_content
|
195 |
|
@@ -197,24 +194,13 @@ class DataAnalyzer:
|
|
197 |
import traceback
|
198 |
error_message = f"""
|
199 |
<div style="padding: 20px; border: 1px solid red; border-radius: 5px;">
|
200 |
-
<h3>Error in
|
201 |
<p>Error details: {str(e)}</p>
|
202 |
<p>Stack trace:</p>
|
203 |
<pre>{traceback.format_exc()}</pre>
|
204 |
-
<p>Dataset Info:</p>
|
205 |
-
<pre>
|
206 |
-
Rows: {len(df)}
|
207 |
-
Columns: {len(df.columns)}
|
208 |
-
Types:\n{df.dtypes.to_string()}
|
209 |
-
</pre>
|
210 |
</div>
|
211 |
"""
|
212 |
return error_message
|
213 |
-
finally:
|
214 |
-
if os.path.exists(viz_temp_dir):
|
215 |
-
shutil.rmtree(viz_temp_dir)
|
216 |
-
|
217 |
-
|
218 |
def create_interface():
|
219 |
analyzer = DataAnalyzer()
|
220 |
|
@@ -314,42 +300,41 @@ def create_interface():
|
|
314 |
outputs=[sweetviz_output]
|
315 |
)
|
316 |
|
317 |
-
# Third Tab:
|
318 |
-
with gr.TabItem("
|
319 |
with gr.Row():
|
320 |
with gr.Column(scale=2):
|
321 |
-
|
322 |
-
"Generate
|
323 |
variant="primary"
|
324 |
)
|
325 |
with gr.Column(scale=1):
|
326 |
gr.Markdown("""
|
327 |
-
###
|
328 |
-
-
|
329 |
-
-
|
330 |
-
-
|
331 |
-
-
|
332 |
-
- Time series analysis (if applicable)
|
333 |
""")
|
334 |
|
335 |
with gr.Row():
|
336 |
-
|
337 |
-
label="
|
338 |
-
value="Click the button above to generate
|
339 |
)
|
340 |
|
341 |
-
def
|
342 |
if df is None:
|
343 |
return "Please upload a dataset first"
|
344 |
try:
|
345 |
return analyzer.generate_autoviz_report(df)
|
346 |
except Exception as e:
|
347 |
-
return f"Error generating
|
348 |
|
349 |
-
|
350 |
-
fn=
|
351 |
inputs=[current_df],
|
352 |
-
outputs=[
|
353 |
)
|
354 |
|
355 |
return demo
|
|
|
12 |
import warnings
|
13 |
import io
|
14 |
import base64
|
15 |
+
from pathlib import Path
|
16 |
+
import matplotlib
|
17 |
+
matplotlib.use('Agg')
|
18 |
warnings.filterwarnings('ignore')
|
19 |
|
20 |
class DataAnalyzer:
|
|
|
22 |
self.temp_dir = tempfile.mkdtemp()
|
23 |
self.df = None
|
24 |
self.AV = AutoViz_Class()
|
25 |
+
self.plots_memory = {} # Store plots in memory
|
26 |
|
27 |
+
def save_plot_to_memory(self, fig, plot_name):
    """Render a matplotlib figure to PNG and cache it as a base64 data URI.

    The encoded image is stored in ``self.plots_memory[plot_name]`` as a
    ``data:image/png;base64,...`` string.

    Args:
        fig: Figure to render. It is closed before this method returns,
            whether or not rendering succeeded.
        plot_name: Key under which the data URI is stored.
    """
    try:
        with io.BytesIO() as buf:
            fig.savefig(buf, format='png', bbox_inches='tight')
            # getvalue() returns the whole buffer regardless of position.
            img_str = base64.b64encode(buf.getvalue()).decode('ascii')
        self.plots_memory[plot_name] = f'data:image/png;base64,{img_str}'
    finally:
        # Close even when savefig raises so a failed render does not leak
        # the figure.
        plt.close(fig)
|
35 |
+
|
36 |
+
def generate_basic_plots(self, df):
    """Generate basic matplotlib plots for ``df`` and cache them in memory.

    Produces, via ``self.save_plot_to_memory``:
      * a histogram (``dist_<col>``) and a box plot (``box_<col>``) for
        each numeric column,
      * a bar chart of value counts (``cat_<col>``) for each
        category/object column with between 1 and 19 distinct values,
      * a ``correlation_matrix`` image when there are at least two
        numeric columns.

    Args:
        df: DataFrame to visualise.
    """
    # Drop plots cached by a previous call; otherwise charts from an
    # earlier dataset would still be present for the next report.
    self.plots_memory.clear()

    # Numeric columns distribution
    numeric_cols = df.select_dtypes(include=['number']).columns
    for col in numeric_cols:
        fig, ax = plt.subplots(figsize=(10, 6))
        df[col].hist(bins=30, ax=ax)
        ax.set_title(f'Distribution of {col}')
        self.save_plot_to_memory(fig, f'dist_{col}')

        # Box plot
        fig, ax = plt.subplots(figsize=(10, 6))
        df.boxplot(column=col, ax=ax)
        ax.set_title(f'Box Plot of {col}')
        self.save_plot_to_memory(fig, f'box_{col}')

    # Categorical columns
    categorical_cols = df.select_dtypes(include=['category', 'object']).columns
    for col in categorical_cols:
        n_unique = df[col].nunique()
        # Only for columns with a reasonable number of categories; an
        # all-null column (0 categories) has nothing to draw.
        if 0 < n_unique < 20:
            fig, ax = plt.subplots(figsize=(12, 6))
            df[col].value_counts().plot(kind='bar', ax=ax)
            ax.set_title(f'Distribution of {col}')
            # Rotate on this axes explicitly instead of relying on
            # pyplot's implicit "current figure".
            ax.tick_params(axis='x', labelrotation=45)
            self.save_plot_to_memory(fig, f'cat_{col}')

    # Correlation matrix for numeric columns
    if len(numeric_cols) > 1:
        fig, ax = plt.subplots(figsize=(10, 8))
        correlation_matrix = df[numeric_cols].corr()
        im = ax.imshow(correlation_matrix)
        tick_positions = range(len(numeric_cols))
        ax.set_xticks(tick_positions)
        ax.set_yticks(tick_positions)
        ax.set_xticklabels(numeric_cols, rotation=45)
        ax.set_yticklabels(numeric_cols)
        fig.colorbar(im, ax=ax)
        ax.set_title('Correlation Matrix')
        self.save_plot_to_memory(fig, 'correlation_matrix')
|
74 |
def generate_sweetviz_report(self, df):
|
75 |
if df is None:
|
76 |
return "Please upload a dataset first"
|
|
|
98 |
os.remove(report_path)
|
99 |
return html_with_table
|
100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
def generate_autoviz_report(self, df):
|
102 |
if df is None:
|
103 |
return "Please upload a dataset first"
|
104 |
|
|
|
|
|
|
|
|
|
|
|
105 |
try:
|
106 |
# Preprocess the dataframe
|
107 |
+
df = df.copy()
|
108 |
+
|
109 |
+
# Convert 'value' column to numeric if possible
|
110 |
+
if 'value' in df.columns:
|
111 |
+
df['value'] = pd.to_numeric(df['value'].replace('[\$,]', '', regex=True), errors='coerce')
|
112 |
|
113 |
# Sample if needed
|
114 |
if len(df) > 5000:
|
115 |
df = df.sample(n=5000, random_state=42)
|
116 |
|
117 |
+
# Generate basic plots
|
118 |
+
self.generate_basic_plots(df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
# Generate summary statistics
|
121 |
numeric_cols = df.select_dtypes(include=['number']).columns
|
122 |
categorical_cols = df.select_dtypes(include=['category', 'object']).columns
|
|
|
143 |
.table th {
|
144 |
background-color: #f8f9fa;
|
145 |
}
|
146 |
+
.plot-container {
|
|
|
|
|
|
|
|
|
|
|
147 |
margin: 20px 0;
|
148 |
+
padding: 10px;
|
149 |
border: 1px solid #ddd;
|
150 |
border-radius: 5px;
|
151 |
}
|
152 |
+
.plot-container img {
|
153 |
+
max-width: 100%;
|
154 |
+
height: auto;
|
155 |
+
}
|
156 |
</style>
|
157 |
"""
|
158 |
|
159 |
+
# Add summary statistics
|
160 |
html_content += f"""
|
161 |
<div class="viz-container">
|
162 |
<h2 style="text-align: center;">Data Analysis Report</h2>
|
|
|
175 |
<div style="overflow-x: auto;">
|
176 |
{categorical_stats.to_html(classes='table table-striped')}
|
177 |
</div>
|
|
|
|
|
|
|
178 |
</div>
|
179 |
"""
|
180 |
+
|
181 |
+
# Add plots from memory
|
182 |
+
for plot_name, plot_data in self.plots_memory.items():
|
183 |
+
html_content += f"""
|
184 |
+
<div class="plot-container">
|
185 |
+
<h3>{plot_name.replace('_', ' ').title()}</h3>
|
186 |
+
<img src="{plot_data}" alt="{plot_name}">
|
187 |
+
</div>
|
188 |
+
"""
|
189 |
+
|
|
|
|
|
|
|
|
|
|
|
190 |
html_content += "</div>"
|
191 |
return html_content
|
192 |
|
|
|
194 |
import traceback
|
195 |
error_message = f"""
|
196 |
<div style="padding: 20px; border: 1px solid red; border-radius: 5px;">
|
197 |
+
<h3>Error in Analysis</h3>
|
198 |
<p>Error details: {str(e)}</p>
|
199 |
<p>Stack trace:</p>
|
200 |
<pre>{traceback.format_exc()}</pre>
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
</div>
|
202 |
"""
|
203 |
return error_message
|
|
|
|
|
|
|
|
|
|
|
204 |
def create_interface():
|
205 |
analyzer = DataAnalyzer()
|
206 |
|
|
|
300 |
outputs=[sweetviz_output]
|
301 |
)
|
302 |
|
303 |
+
# Third Tab: Visual Analysis
|
304 |
+
with gr.TabItem("Visual Analysis"):
|
305 |
with gr.Row():
|
306 |
with gr.Column(scale=2):
|
307 |
+
viz_button = gr.Button(
|
308 |
+
"Generate Visualizations",
|
309 |
variant="primary"
|
310 |
)
|
311 |
with gr.Column(scale=1):
|
312 |
gr.Markdown("""
|
313 |
+
### Visualization Features
|
314 |
+
- Distribution plots
|
315 |
+
- Correlation analysis
|
316 |
+
- Categorical variable analysis
|
317 |
+
- Statistical summaries
|
|
|
318 |
""")
|
319 |
|
320 |
with gr.Row():
|
321 |
+
viz_output = gr.HTML(
|
322 |
+
label="Visualization Report",
|
323 |
+
value="Click the button above to generate visualizations"
|
324 |
)
|
325 |
|
326 |
+
def generate_viz(df):
|
327 |
if df is None:
|
328 |
return "Please upload a dataset first"
|
329 |
try:
|
330 |
return analyzer.generate_autoviz_report(df)
|
331 |
except Exception as e:
|
332 |
+
return f"Error generating visualizations: {str(e)}"
|
333 |
|
334 |
+
viz_button.click(
|
335 |
+
fn=generate_viz,
|
336 |
inputs=[current_df],
|
337 |
+
outputs=[viz_output]
|
338 |
)
|
339 |
|
340 |
return demo
|