# Spaces: Sleeping
import gradio as gr | |
import pandas as pd | |
import sweetviz as sv | |
import tempfile | |
import os | |
import category_encoders as ce | |
import umap | |
import matplotlib.pyplot as plt | |
from sklearn.preprocessing import StandardScaler | |
from autoviz.AutoViz_Class import AutoViz_Class | |
import shutil | |
import warnings | |
import io | |
import base64 | |
from pathlib import Path | |
import matplotlib | |
# Select the Agg backend before any figure is created; every plot in this
# file is written to an in-memory PNG buffer rather than shown in a window.
matplotlib.use(backend='Agg')

# Silence every warning for the whole process.
warnings.filterwarnings(action='ignore')
class DataAnalyzer:
    """Produce HTML analysis reports for a pandas DataFrame.

    Two reports are available: a Sweetviz profile
    (``generate_sweetviz_report``) and a summary-statistics page with
    matplotlib plots embedded as base64 PNGs (``generate_autoviz_report``).
    """

    # Stylesheet placed at the top of the visual-analysis report.
    _REPORT_STYLE = """
    <style>
    .table {
        width: 100%;
        margin-bottom: 1rem;
        color: #212529;
        border-collapse: collapse;
    }
    .table-striped tbody tr:nth-of-type(odd) {
        background-color: rgba(0,0,0,.05);
    }
    .table td, .table th {
        padding: .75rem;
        border: 1px solid #dee2e6;
    }
    .table th {
        background-color: #f8f9fa;
    }
    .plot-container {
        margin: 20px 0;
        padding: 10px;
        border: 1px solid #ddd;
        border-radius: 5px;
    }
    .plot-container img {
        max-width: 100%;
        height: auto;
    }
    </style>
    """

    def __init__(self):
        # Scratch directory; the Sweetviz HTML file is written here before
        # being read back into memory.
        self.temp_dir = tempfile.mkdtemp()
        # Last DataFrame handed to generate_sweetviz_report.
        self.df = None
        self.AV = AutoViz_Class()
        # Maps plot name -> "data:image/png;base64,..." URI.
        self.plots_memory = {}

    def save_plot_to_memory(self, fig, plot_name):
        """Render *fig* to PNG and store it in ``plots_memory`` as a data URI.

        Args:
            fig: matplotlib figure to render.
            plot_name: key under which the data URI is stored.

        The figure is always closed, even when rendering fails, so a failed
        plot does not stay registered with pyplot.
        """
        try:
            buf = io.BytesIO()
            fig.savefig(buf, format='png', bbox_inches='tight')
            img_str = base64.b64encode(buf.getvalue()).decode()
            self.plots_memory[plot_name] = f'data:image/png;base64,{img_str}'
        finally:
            plt.close(fig)

    def generate_basic_plots(self, df):
        """Create the standard plots for *df* and store them in ``plots_memory``.

        Per numeric column: a 30-bin histogram (``dist_<col>``) and a box plot
        (``box_<col>``). Per object/category column with fewer than 20
        distinct values: a bar chart of value counts (``cat_<col>``). When
        there is more than one numeric column: a correlation heat map
        (``correlation_matrix``).
        """
        numeric_cols = df.select_dtypes(include=['number']).columns
        for col in numeric_cols:
            fig, ax = plt.subplots(figsize=(10, 6))
            df[col].hist(bins=30, ax=ax)
            ax.set_title(f'Distribution of {col}')
            self.save_plot_to_memory(fig, f'dist_{col}')

            fig, ax = plt.subplots(figsize=(10, 6))
            df.boxplot(column=col, ax=ax)
            ax.set_title(f'Box Plot of {col}')
            self.save_plot_to_memory(fig, f'box_{col}')

        categorical_cols = df.select_dtypes(include=['category', 'object']).columns
        for col in categorical_cols:
            # Bar charts are only produced for a manageable number of categories.
            if df[col].nunique() < 20:
                fig, ax = plt.subplots(figsize=(12, 6))
                df[col].value_counts().plot(kind='bar', ax=ax)
                ax.set_title(f'Distribution of {col}')
                # Rotate labels on this Axes directly instead of going
                # through pyplot's "current axes" state.
                ax.tick_params(axis='x', labelrotation=45)
                self.save_plot_to_memory(fig, f'cat_{col}')

        if len(numeric_cols) > 1:
            fig, ax = plt.subplots(figsize=(10, 8))
            correlation_matrix = df[numeric_cols].corr()
            im = ax.imshow(correlation_matrix)
            ax.set_xticks(range(len(numeric_cols)))
            ax.set_yticks(range(len(numeric_cols)))
            ax.set_xticklabels(numeric_cols, rotation=45)
            ax.set_yticklabels(numeric_cols)
            # Attach the colorbar to this figure/axes explicitly.
            fig.colorbar(im, ax=ax)
            ax.set_title('Correlation Matrix')
            self.save_plot_to_memory(fig, 'correlation_matrix')

    def generate_sweetviz_report(self, df):
        """Return the Sweetviz report for *df* as an HTML fragment.

        The report is written to a file inside ``temp_dir``, read back, and
        wrapped in a scrollable table cell. The file is removed afterwards,
        including when writing or reading raises.

        Returns:
            The HTML string, or a plain prompt message when *df* is None.
        """
        if df is None:
            return "Please upload a dataset first"

        self.df = df
        report = sv.analyze(df)
        report_path = os.path.join(self.temp_dir, "report.html")
        try:
            report.show_html(report_path, open_browser=False)
            with open(report_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
        finally:
            # Do not leave the temporary report behind on failure.
            if os.path.exists(report_path):
                os.remove(report_path)

        return f"""
        <table width="100%" style="border-collapse: collapse;">
        <tr>
        <td style="padding: 20px; border: 1px solid #ddd;">
        <div style="height: 800px; overflow: auto;">
        {html_content}
        </div>
        </td>
        </tr>
        </table>
        """

    def generate_autoviz_report(self, df, max_rows=5000):
        """Return an HTML page with summary statistics and plots for *df*.

        Args:
            df: DataFrame to analyse; ``None`` yields a prompt message.
            max_rows: when *df* has more rows than this, a random sample of
                ``max_rows`` rows (fixed seed 42) is analysed instead.

        Returns:
            HTML string. Any exception raised during analysis is caught and
            rendered as an HTML error box containing the message and the
            traceback.
        """
        if df is None:
            return "Please upload a dataset first"

        # Function-scope imports keep this method self-contained.
        import html
        import traceback

        try:
            df = df.copy()
            total_rows = len(df)

            # A column named 'value' is stripped of '$' and ',' and coerced
            # to numbers; entries that cannot be parsed become NaN.
            if 'value' in df.columns:
                df['value'] = pd.to_numeric(
                    df['value'].replace(r'[\$,]', '', regex=True),
                    errors='coerce',
                )

            if total_rows > max_rows:
                df = df.sample(n=max_rows, random_state=42)

            # Drop plots left over from a previous call so they do not show
            # up in this report.
            self.plots_memory.clear()
            self.generate_basic_plots(df)

            numeric_cols = df.select_dtypes(include=['number']).columns
            categorical_cols = df.select_dtypes(include=['category', 'object']).columns
            numeric_stats = (
                df[numeric_cols].describe().round(2)
                if len(numeric_cols) > 0 else pd.DataFrame()
            )
            categorical_stats = (
                df[categorical_cols].describe()
                if len(categorical_cols) > 0 else pd.DataFrame()
            )

            # Report the real dataset size and say so when only a sample
            # was analysed.
            sample_note = ""
            if len(df) < total_rows:
                sample_note = (
                    "<p>Statistics and plots are based on a random sample "
                    f"of {len(df)} rows.</p>"
                )

            parts = [
                self._REPORT_STYLE,
                f"""
            <div class="viz-container">
            <h2 style="text-align: center;">Data Analysis Report</h2>
            <div style="margin: 20px;">
            <h3>Dataset Overview</h3>
            <p>Total Rows: {total_rows}</p>
            {sample_note}
            <p>Total Columns: {len(df.columns)}</p>
            <h3>Numeric Variables Summary</h3>
            <div style="overflow-x: auto;">
            {numeric_stats.to_html(classes='table table-striped')}
            </div>
            <h3>Categorical Variables Summary</h3>
            <div style="overflow-x: auto;">
            {categorical_stats.to_html(classes='table table-striped')}
            </div>
            </div>
            """,
            ]

            for plot_name, plot_data in self.plots_memory.items():
                # Plot names contain column names from the uploaded file, so
                # escape them before placing them in markup.
                title = html.escape(plot_name.replace('_', ' ').title())
                alt_text = html.escape(plot_name, quote=True)
                parts.append(f"""
            <div class="plot-container">
            <h3>{title}</h3>
            <img src="{plot_data}" alt="{alt_text}">
            </div>
            """)

            parts.append("</div>")
            return "".join(parts)
        except Exception as e:
            # Tracebacks contain text such as "<module>"; escape so it is
            # displayed literally instead of being parsed as markup.
            return f"""
            <div style="padding: 20px; border: 1px solid red; border-radius: 5px;">
            <h3>Error in Analysis</h3>
            <p>Error details: {html.escape(str(e))}</p>
            <p>Stack trace:</p>
            <pre>{html.escape(traceback.format_exc())}</pre>
            </div>
            """
def create_interface():
    """Build and return the Gradio Blocks app.

    The app has three tabs: CSV upload with a 10-row preview, a Sweetviz
    report, and a matplotlib-based visual report. The uploaded DataFrame is
    kept in a ``gr.State`` and passed to the report callbacks.

    Returns:
        The ``gr.Blocks`` instance (not yet launched).
    """
    # Function-scope import: used to escape error text sent to gr.HTML outputs.
    import html

    analyzer = DataAnalyzer()

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # Data Analysis Dashboard
        This dashboard provides comprehensive data analysis and visualization capabilities.
        """)

        # Holds the currently loaded DataFrame between events.
        current_df = gr.State(None)

        with gr.Tabs():
            # First tab: data upload and preview
            with gr.TabItem("Data Upload & Preview"):
                with gr.Row():
                    with gr.Column(scale=2):
                        file_input = gr.File(
                            label="Upload CSV File",
                            file_types=[".csv"],
                            file_count="single"
                        )
                    with gr.Column(scale=1):
                        gr.Markdown("""
                        ### Upload Instructions
                        1. Select a CSV file
                        2. File will be automatically loaded
                        3. Preview will appear below
                        """)
                with gr.Row():
                    data_info = gr.Markdown("No data uploaded yet")
                with gr.Row():
                    data_preview = gr.Dataframe(
                        label="Data Preview",
                        interactive=False,
                        wrap=True
                    )

                def load_data(file):
                    """Read the uploaded CSV; return (info markdown, preview, full frame)."""
                    if file is None:
                        return "No data uploaded yet", None, None
                    try:
                        # Accept either an object exposing the path as
                        # ``.name`` or a plain path string.
                        csv_path = getattr(file, "name", file)
                        df = pd.read_csv(csv_path)
                        # Plain strings/ints so the summary does not show
                        # raw numpy reprs such as dtype('int64').
                        dtype_counts = {
                            str(dtype): int(count)
                            for dtype, count in df.dtypes.value_counts().items()
                        }
                        info_text = f"""
                        ### Dataset Information
                        - Rows: {len(df)}
                        - Columns: {len(df.columns)}
                        - Memory Usage: {df.memory_usage(deep=True).sum() / 1024:.2f} KB
                        - Column Types: {dtype_counts}
                        """
                        return info_text, df.head(10), df
                    except Exception as e:
                        return f"Error loading file: {str(e)}", None, None

                file_input.change(
                    fn=load_data,
                    inputs=[file_input],
                    outputs=[data_info, data_preview, current_df]
                )

            # Second tab: Sweetviz analysis
            with gr.TabItem("Sweetviz Analysis"):
                with gr.Row():
                    with gr.Column(scale=2):
                        sweetviz_button = gr.Button(
                            "Generate Sweetviz Report",
                            variant="primary"
                        )
                    with gr.Column(scale=1):
                        gr.Markdown("""
                        ### Sweetviz Analysis Features
                        - Comprehensive data profiling
                        - Statistical analysis
                        - Feature correlations
                        - Missing value analysis
                        """)
                with gr.Row():
                    sweetviz_output = gr.HTML(
                        label="Sweetviz Report",
                        value="Click the button above to generate the report"
                    )

                def generate_sweetviz(df):
                    """Return the Sweetviz HTML, or a message when it cannot be built."""
                    if df is None:
                        return "Please upload a dataset first"
                    try:
                        return analyzer.generate_sweetviz_report(df)
                    except Exception as e:
                        # The output is an HTML component; escape the
                        # message so it is shown literally.
                        return f"Error generating Sweetviz report: {html.escape(str(e))}"

                sweetviz_button.click(
                    fn=generate_sweetviz,
                    inputs=[current_df],
                    outputs=[sweetviz_output]
                )

            # Third tab: visual analysis
            with gr.TabItem("Visual Analysis"):
                with gr.Row():
                    with gr.Column(scale=2):
                        viz_button = gr.Button(
                            "Generate Visualizations",
                            variant="primary"
                        )
                    with gr.Column(scale=1):
                        gr.Markdown("""
                        ### Visualization Features
                        - Distribution plots
                        - Correlation analysis
                        - Categorical variable analysis
                        - Statistical summaries
                        """)
                with gr.Row():
                    viz_output = gr.HTML(
                        label="Visualization Report",
                        value="Click the button above to generate visualizations"
                    )

                def generate_viz(df):
                    """Return the visual report HTML, or a message when it cannot be built."""
                    if df is None:
                        return "Please upload a dataset first"
                    try:
                        return analyzer.generate_autoviz_report(df)
                    except Exception as e:
                        return f"Error generating visualizations: {html.escape(str(e))}"

                viz_button.click(
                    fn=generate_viz,
                    inputs=[current_df],
                    outputs=[viz_output]
                )

    return demo
# Script entry point: build the interface and start the web server.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        # Set to True if you want to create a public link
        "share": False,
    }
    demo = create_interface()
    demo.launch(**launch_options)