Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -45,6 +45,40 @@ class DataAnalyzer:
|
|
45 |
os.remove(report_path)
|
46 |
return html_with_table
|
47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
def generate_autoviz_report(self, df):
|
49 |
if df is None:
|
50 |
return "Please upload a dataset first"
|
@@ -55,40 +89,13 @@ class DataAnalyzer:
|
|
55 |
os.makedirs(viz_temp_dir)
|
56 |
|
57 |
try:
|
58 |
-
#
|
59 |
-
df =
|
60 |
-
|
61 |
-
# Handle datetime columns
|
62 |
-
for col in df.columns:
|
63 |
-
try:
|
64 |
-
df[col] = pd.to_datetime(df[col], errors='ignore')
|
65 |
-
except:
|
66 |
-
pass
|
67 |
-
|
68 |
-
datetime_columns = df.select_dtypes(include=['datetime64']).columns
|
69 |
-
for col in datetime_columns:
|
70 |
-
df[f'{col}_year'] = df[col].dt.year
|
71 |
-
df[f'{col}_month'] = df[col].dt.month
|
72 |
-
df = df.drop(columns=[col])
|
73 |
-
|
74 |
-
# Try to convert string columns to numeric where possible
|
75 |
-
for col in df.select_dtypes(include=['object']).columns:
|
76 |
-
try:
|
77 |
-
df[col] = pd.to_numeric(df[col], errors='ignore')
|
78 |
-
except:
|
79 |
-
pass
|
80 |
-
|
81 |
-
# Convert remaining string columns to categorical if cardinality is low
|
82 |
-
object_columns = df.select_dtypes(include=['object']).columns
|
83 |
-
for col in object_columns:
|
84 |
-
if df[col].nunique() < 50:
|
85 |
-
df[col] = df[col].astype('category')
|
86 |
|
87 |
-
# Sample
|
88 |
if len(df) > 5000:
|
89 |
df = df.sample(n=5000, random_state=42)
|
90 |
|
91 |
-
# Print data info for debugging
|
92 |
print("\nDataset Info:")
|
93 |
print(df.info())
|
94 |
print("\nColumn Types:")
|
@@ -96,16 +103,16 @@ class DataAnalyzer:
|
|
96 |
|
97 |
plt.close('all')
|
98 |
|
99 |
-
# Run AutoViz
|
100 |
dfte = self.AV.AutoViz(
|
101 |
filename='',
|
102 |
sep=',',
|
103 |
-
depVar='',
|
104 |
dfte=df,
|
105 |
header=0,
|
106 |
verbose=1,
|
107 |
lowess=False,
|
108 |
-
chart_format='
|
109 |
max_rows_analyzed=5000,
|
110 |
max_cols_analyzed=30,
|
111 |
save_plot_dir=viz_temp_dir
|
@@ -115,7 +122,7 @@ class DataAnalyzer:
|
|
115 |
html_parts = []
|
116 |
if os.path.exists(viz_temp_dir):
|
117 |
for file in sorted(os.listdir(viz_temp_dir)):
|
118 |
-
if file.endswith('.html')
|
119 |
file_path = os.path.join(viz_temp_dir, file)
|
120 |
try:
|
121 |
with open(file_path, 'r', encoding='utf-8') as f:
|
@@ -125,22 +132,23 @@ class DataAnalyzer:
|
|
125 |
except Exception as e:
|
126 |
print(f"Error reading file {file}: {str(e)}")
|
127 |
|
|
|
|
|
|
|
|
|
128 |
if not html_parts:
|
129 |
return f"""
|
130 |
<div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
|
131 |
<h3>Data Summary</h3>
|
132 |
<p>Total Rows: {len(df)}</p>
|
133 |
<p>Total Columns: {len(df.columns)}</p>
|
134 |
-
<
|
135 |
-
|
|
|
|
|
136 |
<hr>
|
137 |
-
<h3>
|
138 |
-
<
|
139 |
-
<ul>
|
140 |
-
<li>All columns being categorical with high cardinality</li>
|
141 |
-
<li>No numeric columns for analysis</li>
|
142 |
-
<li>Data format not suitable for visualization</li>
|
143 |
-
</ul>
|
144 |
</div>
|
145 |
"""
|
146 |
|
@@ -151,8 +159,10 @@ class DataAnalyzer:
|
|
151 |
<h3>Dataset Summary</h3>
|
152 |
<p>Rows analyzed: {len(df)}</p>
|
153 |
<p>Columns: {len(df.columns)}</p>
|
154 |
-
<
|
155 |
-
|
|
|
|
|
156 |
</div>
|
157 |
<hr>
|
158 |
{'<hr>'.join(html_parts)}
|
@@ -197,6 +207,13 @@ def create_interface():
|
|
197 |
with gr.Row():
|
198 |
file_input = gr.File(label="Upload CSV")
|
199 |
data_preview = gr.Dataframe(label="Data Preview", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
|
201 |
def load_data(file):
|
202 |
if file is None:
|
@@ -218,6 +235,14 @@ def create_interface():
|
|
218 |
with gr.Row():
|
219 |
sweetviz_button = gr.Button("Generate Sweetviz Report")
|
220 |
sweetviz_output = gr.HTML(label="Sweetviz Report")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
|
222 |
def generate_sweetviz(df):
|
223 |
if df is None:
|
@@ -235,6 +260,14 @@ def create_interface():
|
|
235 |
with gr.Row():
|
236 |
autoviz_button = gr.Button("Generate AutoViz Report")
|
237 |
autoviz_output = gr.HTML(label="AutoViz Report")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
|
239 |
def generate_autoviz(df):
|
240 |
if df is None:
|
|
|
45 |
os.remove(report_path)
|
46 |
return html_with_table
|
47 |
|
48 |
+
def preprocess_dataframe(self, df):
    """Return a copy of ``df`` prepared for visualization.

    The caller's frame is never modified. The following steps are applied,
    in order, to the copy:

    1. If a ``value`` column exists, strip ``$`` and ``,`` from its entries
       and coerce it to numeric; entries that still cannot be parsed
       become NaN.
    2. Try to parse every object-dtype column as datetimes. A column that
       cannot be parsed as a whole is left unchanged.
    3. Replace each datetime column with ``<col>_year`` and ``<col>_month``
       columns and drop the original.
    4. Cast each remaining object-dtype column with fewer than 50 distinct
       values to ``category``.

    Args:
        df: The pandas DataFrame to preprocess.

    Returns:
        A new, preprocessed DataFrame.
    """
    df = df.copy()

    # Clean the 'value' column only when it is actually present, instead of
    # relying on a swallowed KeyError.
    if 'value' in df.columns:
        try:
            # Raw string: '\$' is an invalid escape in a normal string literal.
            cleaned = df['value'].replace(r'[\$,]', '', regex=True)
            df['value'] = pd.to_numeric(cleaned, errors='coerce')
        except (TypeError, ValueError):
            # Best effort: leave the column as-is if it cannot be cleaned.
            pass

    # Handle datetime columns. errors='ignore' is deprecated in pandas, so
    # parse strictly and keep the original column when parsing fails.
    for col in df.columns:
        if df[col].dtype == 'object':
            try:
                df[col] = pd.to_datetime(df[col])
            except (TypeError, ValueError, OverflowError):
                continue

    # 'datetimetz' also picks up timezone-aware columns, which the plain
    # 'datetime64' selector skips.
    datetime_columns = df.select_dtypes(
        include=['datetime64', 'datetimetz']
    ).columns
    for col in datetime_columns:
        df[f'{col}_year'] = df[col].dt.year
        df[f'{col}_month'] = df[col].dt.month
        df = df.drop(columns=[col])

    # Convert low-cardinality text columns to categorical.
    for col in df.select_dtypes(include=['object']).columns:
        if df[col].nunique() < 50:
            df[col] = df[col].astype('category')

    return df
|
81 |
+
|
82 |
def generate_autoviz_report(self, df):
|
83 |
if df is None:
|
84 |
return "Please upload a dataset first"
|
|
|
89 |
os.makedirs(viz_temp_dir)
|
90 |
|
91 |
try:
|
92 |
+
# Preprocess the dataframe
|
93 |
+
df = self.preprocess_dataframe(df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
+
# Sample if needed
|
96 |
if len(df) > 5000:
|
97 |
df = df.sample(n=5000, random_state=42)
|
98 |
|
|
|
99 |
print("\nDataset Info:")
|
100 |
print(df.info())
|
101 |
print("\nColumn Types:")
|
|
|
103 |
|
104 |
plt.close('all')
|
105 |
|
106 |
+
# Run AutoViz with modified settings
|
107 |
dfte = self.AV.AutoViz(
|
108 |
filename='',
|
109 |
sep=',',
|
110 |
+
depVar='value', # Set value as target variable
|
111 |
dfte=df,
|
112 |
header=0,
|
113 |
verbose=1,
|
114 |
lowess=False,
|
115 |
+
chart_format='html', # Changed back to html
|
116 |
max_rows_analyzed=5000,
|
117 |
max_cols_analyzed=30,
|
118 |
save_plot_dir=viz_temp_dir
|
|
|
122 |
html_parts = []
|
123 |
if os.path.exists(viz_temp_dir):
|
124 |
for file in sorted(os.listdir(viz_temp_dir)):
|
125 |
+
if file.endswith('.html'):
|
126 |
file_path = os.path.join(viz_temp_dir, file)
|
127 |
try:
|
128 |
with open(file_path, 'r', encoding='utf-8') as f:
|
|
|
132 |
except Exception as e:
|
133 |
print(f"Error reading file {file}: {str(e)}")
|
134 |
|
135 |
+
# Generate summary statistics
|
136 |
+
numeric_summary = df.describe().to_html() if not df.select_dtypes(include=['number']).empty else ""
|
137 |
+
categorical_summary = df.describe(include=['category', 'object']).to_html() if not df.select_dtypes(include=['category', 'object']).empty else ""
|
138 |
+
|
139 |
if not html_parts:
|
140 |
return f"""
|
141 |
<div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
|
142 |
<h3>Data Summary</h3>
|
143 |
<p>Total Rows: {len(df)}</p>
|
144 |
<p>Total Columns: {len(df.columns)}</p>
|
145 |
+
<h4>Numeric Summary:</h4>
|
146 |
+
{numeric_summary}
|
147 |
+
<h4>Categorical Summary:</h4>
|
148 |
+
{categorical_summary}
|
149 |
<hr>
|
150 |
+
<h3>Column Types:</h3>
|
151 |
+
<pre>{df.dtypes.to_string()}</pre>
|
|
|
|
|
|
|
|
|
|
|
152 |
</div>
|
153 |
"""
|
154 |
|
|
|
159 |
<h3>Dataset Summary</h3>
|
160 |
<p>Rows analyzed: {len(df)}</p>
|
161 |
<p>Columns: {len(df.columns)}</p>
|
162 |
+
<h4>Numeric Summary:</h4>
|
163 |
+
{numeric_summary}
|
164 |
+
<h4>Categorical Summary:</h4>
|
165 |
+
{categorical_summary}
|
166 |
</div>
|
167 |
<hr>
|
168 |
{'<hr>'.join(html_parts)}
|
|
|
207 |
with gr.Row():
|
208 |
file_input = gr.File(label="Upload CSV")
|
209 |
data_preview = gr.Dataframe(label="Data Preview", interactive=False)
|
210 |
+
with gr.Row():
|
211 |
+
gr.Markdown("""
|
212 |
+
### Data Preview Info
|
213 |
+
- Upload a CSV file to begin analysis
|
214 |
+
- First few rows will be shown here
|
215 |
+
- Data types and basic statistics will be displayed
|
216 |
+
""")
|
217 |
|
218 |
def load_data(file):
|
219 |
if file is None:
|
|
|
235 |
with gr.Row():
|
236 |
sweetviz_button = gr.Button("Generate Sweetviz Report")
|
237 |
sweetviz_output = gr.HTML(label="Sweetviz Report")
|
238 |
+
with gr.Row():
|
239 |
+
gr.Markdown("""
|
240 |
+
### Sweetviz Analysis Info
|
241 |
+
- Comprehensive data profiling
|
242 |
+
- Statistical analysis
|
243 |
+
- Feature correlations
|
244 |
+
- Missing value analysis
|
245 |
+
""")
|
246 |
|
247 |
def generate_sweetviz(df):
|
248 |
if df is None:
|
|
|
260 |
with gr.Row():
|
261 |
autoviz_button = gr.Button("Generate AutoViz Report")
|
262 |
autoviz_output = gr.HTML(label="AutoViz Report")
|
263 |
+
with gr.Row():
|
264 |
+
gr.Markdown("""
|
265 |
+
### AutoViz Analysis Info
|
266 |
+
- Automated visualization generation
|
267 |
+
- Distribution analysis
|
268 |
+
- Correlation plots
|
269 |
+
- Feature relationships
|
270 |
+
""")
|
271 |
|
272 |
def generate_autoviz(df):
|
273 |
if df is None:
|