csv-plus-plus / app.py
baconnier's picture
Update app.py
96823ba verified
import gradio as gr
import pandas as pd
import sweetviz as sv
import tempfile
import os
import category_encoders as ce
import umap
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from autoviz.AutoViz_Class import AutoViz_Class
import shutil
import warnings
import io
import base64
from pathlib import Path
import matplotlib
# Select matplotlib's 'Agg' backend: every figure created in this file is
# written with savefig() into an in-memory buffer and never shown in a window.
matplotlib.use('Agg')
# Suppress all warnings for the whole process (no category or module filter).
warnings.filterwarnings('ignore')
class DataAnalyzer:
    """Produce HTML reports for a pandas DataFrame.

    Two report flavours are offered:

    * ``generate_sweetviz_report`` -- a Sweetviz profile wrapped in a
      scrollable table cell.
    * ``generate_autoviz_report`` -- summary statistics followed by
      matplotlib plots embedded as base64 PNG data URIs.

    Attributes:
        temp_dir: Scratch directory created with ``tempfile.mkdtemp()``; the
            Sweetviz HTML file is written there before being read back.
        df: The frame most recently passed to ``generate_sweetviz_report``.
        AV: An ``AutoViz_Class`` instance; no method of this class uses it.
        plots_memory: Mapping of plot name -> ``data:image/png;base64,...``
            URI for the plots of the most recent visual report.
    """

    # Frames longer than this are randomly sampled down before plotting.
    MAX_PLOT_ROWS = 5000
    # Categorical columns with this many distinct values (or more) get no bar chart.
    MAX_BAR_CATEGORIES = 20

    # Stylesheet placed at the top of the visual-analysis report.
    _REPORT_CSS = """
    <style>
    .table {
    width: 100%;
    margin-bottom: 1rem;
    color: #212529;
    border-collapse: collapse;
    }
    .table-striped tbody tr:nth-of-type(odd) {
    background-color: rgba(0,0,0,.05);
    }
    .table td, .table th {
    padding: .75rem;
    border: 1px solid #dee2e6;
    }
    .table th {
    background-color: #f8f9fa;
    }
    .plot-container {
    margin: 20px 0;
    padding: 10px;
    border: 1px solid #ddd;
    border-radius: 5px;
    }
    .plot-container img {
    max-width: 100%;
    height: auto;
    }
    </style>
    """

    def __init__(self):
        self.temp_dir = tempfile.mkdtemp()
        self.df = None
        self.AV = AutoViz_Class()
        self.plots_memory = {}  # Store plots in memory

    def save_plot_to_memory(self, fig, plot_name):
        """Render ``fig`` to PNG and store it under ``plot_name`` as a data URI.

        The figure is always closed afterwards, even if rendering fails, so
        figures do not pile up in pyplot's registry.
        """
        try:
            buf = io.BytesIO()
            fig.savefig(buf, format='png', bbox_inches='tight')
            img_str = base64.b64encode(buf.getvalue()).decode()
            self.plots_memory[plot_name] = f'data:image/png;base64,{img_str}'
        finally:
            plt.close(fig)

    def generate_basic_plots(self, df):
        """Create the standard plot set for ``df`` and store it in ``plots_memory``.

        Plots produced:
            * a 30-bin histogram and a box plot per numeric column
              (``dist_<col>``, ``box_<col>``);
            * a bar chart of value counts per object/category column with
              fewer than ``MAX_BAR_CATEGORIES`` distinct values (``cat_<col>``);
            * a correlation heat map when there are at least two numeric
              columns (``correlation_matrix``).
        """
        numeric_cols = df.select_dtypes(include=['number']).columns
        for col in numeric_cols:
            # Distribution
            fig, ax = plt.subplots(figsize=(10, 6))
            df[col].hist(bins=30, ax=ax)
            ax.set_title(f'Distribution of {col}')
            self.save_plot_to_memory(fig, f'dist_{col}')

            # Box plot
            fig, ax = plt.subplots(figsize=(10, 6))
            df.boxplot(column=col, ax=ax)
            ax.set_title(f'Box Plot of {col}')
            self.save_plot_to_memory(fig, f'box_{col}')

        categorical_cols = df.select_dtypes(include=['category', 'object']).columns
        for col in categorical_cols:
            # Only for columns with a reasonable number of categories
            if df[col].nunique() >= self.MAX_BAR_CATEGORIES:
                continue
            fig, ax = plt.subplots(figsize=(12, 6))
            # rot=45 tilts the category labels on this axes directly instead
            # of going through pyplot's "current axes" state.
            df[col].value_counts().plot(kind='bar', ax=ax, rot=45)
            ax.set_title(f'Distribution of {col}')
            self.save_plot_to_memory(fig, f'cat_{col}')

        # Correlation matrix for numeric columns
        if len(numeric_cols) > 1:
            fig, ax = plt.subplots(figsize=(10, 8))
            correlation_matrix = df[numeric_cols].corr()
            im = ax.imshow(correlation_matrix)
            ticks = range(len(numeric_cols))
            ax.set_xticks(ticks)
            ax.set_yticks(ticks)
            ax.set_xticklabels(numeric_cols, rotation=45)
            ax.set_yticklabels(numeric_cols)
            fig.colorbar(im, ax=ax)
            ax.set_title('Correlation Matrix')
            self.save_plot_to_memory(fig, 'correlation_matrix')

    def generate_sweetviz_report(self, df):
        """Return the Sweetviz report for ``df`` as an HTML string.

        The report is written to ``temp_dir``, read back, wrapped in a
        fixed-height scrollable table cell, and the file is then deleted.
        Returns a plain prompt string when ``df`` is None. Exceptions from
        Sweetviz or file I/O propagate to the caller.
        """
        if df is None:
            return "Please upload a dataset first"
        self.df = df
        report = sv.analyze(df)
        report_path = os.path.join(self.temp_dir, "report.html")
        try:
            report.show_html(report_path, open_browser=False)
            with open(report_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
        finally:
            # Remove the scratch file even when writing or reading failed.
            if os.path.exists(report_path):
                os.remove(report_path)
        return f"""
        <table width="100%" style="border-collapse: collapse;">
        <tr>
        <td style="padding: 20px; border: 1px solid #ddd;">
        <div style="height: 800px; overflow: auto;">
        {html_content}
        </div>
        </td>
        </tr>
        </table>
        """

    def generate_autoviz_report(self, df):
        """Return an HTML page of summary statistics and plots for ``df``.

        Despite its name this method does not call AutoViz; it uses
        ``generate_basic_plots`` plus ``DataFrame.describe``.

        Steps: copy the frame; if a ``value`` column exists, strip ``$`` and
        ``,`` and coerce it to numeric (unparseable entries become NaN);
        sample down to ``MAX_PLOT_ROWS`` rows with a fixed seed; regenerate
        the plots; assemble the HTML.

        Returns a plain prompt string when ``df`` is None. Any exception is
        caught and returned as an HTML error box containing the (escaped)
        message and traceback instead of being raised.
        """
        if df is None:
            return "Please upload a dataset first"

        import html
        import traceback

        try:
            # Work on a copy so the caller's frame is never modified.
            df = df.copy()

            # Convert 'value' column to numeric if possible
            if 'value' in df.columns:
                df['value'] = pd.to_numeric(
                    df['value'].replace(r'[\$,]', '', regex=True),
                    errors='coerce',
                )

            # Sample if needed
            if len(df) > self.MAX_PLOT_ROWS:
                df = df.sample(n=self.MAX_PLOT_ROWS, random_state=42)

            # Drop plots left over from earlier reports; otherwise they would
            # be embedded again alongside the plots for this frame.
            self.plots_memory.clear()
            self.generate_basic_plots(df)

            # Summary statistics
            numeric_cols = df.select_dtypes(include=['number']).columns
            categorical_cols = df.select_dtypes(include=['category', 'object']).columns
            numeric_stats = (
                df[numeric_cols].describe().round(2)
                if len(numeric_cols) > 0 else pd.DataFrame()
            )
            categorical_stats = (
                df[categorical_cols].describe()
                if len(categorical_cols) > 0 else pd.DataFrame()
            )

            parts = [
                self._REPORT_CSS,
                f"""
                <div class="viz-container">
                <h2 style="text-align: center;">Data Analysis Report</h2>
                <div style="margin: 20px;">
                <h3>Dataset Overview</h3>
                <p>Total Rows: {len(df)}</p>
                <p>Total Columns: {len(df.columns)}</p>
                <h3>Numeric Variables Summary</h3>
                <div style="overflow-x: auto;">
                {numeric_stats.to_html(classes='table table-striped')}
                </div>
                <h3>Categorical Variables Summary</h3>
                <div style="overflow-x: auto;">
                {categorical_stats.to_html(classes='table table-striped')}
                </div>
                </div>
                """,
            ]

            # Plot names contain column names from the uploaded data, so they
            # are escaped before being placed in markup.
            for plot_name, plot_data in self.plots_memory.items():
                heading = html.escape(plot_name.replace('_', ' ').title())
                alt_text = html.escape(plot_name, quote=True)
                parts.append(f"""
                <div class="plot-container">
                <h3>{heading}</h3>
                <img src="{plot_data}" alt="{alt_text}">
                </div>
                """)

            parts.append("</div>")
            return "".join(parts)
        except Exception as e:
            # Deliberate catch-all: the UI shows the failure instead of crashing.
            return f"""
            <div style="padding: 20px; border: 1px solid red; border-radius: 5px;">
            <h3>Error in Analysis</h3>
            <p>Error details: {html.escape(str(e))}</p>
            <p>Stack trace:</p>
            <pre>{html.escape(traceback.format_exc())}</pre>
            </div>
            """
def _uploaded_file_path(file):
    """Return the filesystem path for a value delivered by ``gr.File``.

    The callback may receive either a path (string / path-like) or a
    file-like wrapper that exposes the path as ``.name``, depending on the
    Gradio version and the component's ``type`` setting; both are accepted.
    """
    if isinstance(file, (str, os.PathLike)):
        return file
    return file.name


def _dataset_info_markdown(df):
    """Return the Markdown summary (rows, columns, memory, dtypes) for ``df``."""
    # Plain str/int so the text does not contain dtype(...) / numpy scalar reprs.
    dtype_counts = {
        str(dtype): int(count)
        for dtype, count in df.dtypes.value_counts().items()
    }
    memory_kb = df.memory_usage(deep=True).sum() / 1024
    return "\n".join([
        "### Dataset Information",
        f"- Rows: {len(df)}",
        f"- Columns: {len(df.columns)}",
        f"- Memory Usage: {memory_kb:.2f} KB",
        f"- Column Types: {dtype_counts}",
    ])


def create_interface():
    """Build and return the Gradio ``Blocks`` app.

    The app has three tabs: CSV upload with a 10-row preview, a Sweetviz
    report, and a visual-analysis report. The uploaded DataFrame is kept in
    a ``gr.State`` and passed to the report callbacks. One ``DataAnalyzer``
    instance is created per call and shared by all callbacks.
    """
    import html

    analyzer = DataAnalyzer()

    def load_data(file):
        """Read the uploaded CSV; return (info markdown, preview frame, full frame)."""
        if file is None:
            return "No data uploaded yet", None, None
        try:
            df = pd.read_csv(_uploaded_file_path(file))
            return _dataset_info_markdown(df), df.head(10), df
        except Exception as e:
            # Any read/parse failure is reported in the info panel.
            return f"Error loading file: {str(e)}", None, None

    def generate_sweetviz(df):
        """Return the Sweetviz report HTML, or a message when it cannot be built."""
        if df is None:
            return "Please upload a dataset first"
        try:
            return analyzer.generate_sweetviz_report(df)
        except Exception as e:
            # The result is rendered as HTML, so the message is escaped.
            return f"Error generating Sweetviz report: {html.escape(str(e))}"

    def generate_viz(df):
        """Return the visual-analysis HTML, or a message when it cannot be built."""
        if df is None:
            return "Please upload a dataset first"
        try:
            return analyzer.generate_autoviz_report(df)
        except Exception as e:
            return f"Error generating visualizations: {html.escape(str(e))}"

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            "# Data Analysis Dashboard\n"
            "This dashboard provides comprehensive data analysis and visualization capabilities."
        )

        # Store the dataframe in a state variable
        current_df = gr.State(None)

        with gr.Tabs():
            # First Tab: Data Upload & Preview
            with gr.TabItem("Data Upload & Preview"):
                with gr.Row():
                    with gr.Column(scale=2):
                        file_input = gr.File(
                            label="Upload CSV File",
                            file_types=[".csv"],
                            file_count="single",
                        )
                    with gr.Column(scale=1):
                        gr.Markdown(
                            "### Upload Instructions\n"
                            "1. Select a CSV file\n"
                            "2. File will be automatically loaded\n"
                            "3. Preview will appear below"
                        )
                with gr.Row():
                    data_info = gr.Markdown("No data uploaded yet")
                with gr.Row():
                    data_preview = gr.Dataframe(
                        label="Data Preview",
                        interactive=False,
                        wrap=True,
                    )
                file_input.change(
                    fn=load_data,
                    inputs=[file_input],
                    outputs=[data_info, data_preview, current_df],
                )

            # Second Tab: Sweetviz Analysis
            with gr.TabItem("Sweetviz Analysis"):
                with gr.Row():
                    with gr.Column(scale=2):
                        sweetviz_button = gr.Button(
                            "Generate Sweetviz Report",
                            variant="primary",
                        )
                    with gr.Column(scale=1):
                        gr.Markdown(
                            "### Sweetviz Analysis Features\n"
                            "- Comprehensive data profiling\n"
                            "- Statistical analysis\n"
                            "- Feature correlations\n"
                            "- Missing value analysis"
                        )
                with gr.Row():
                    sweetviz_output = gr.HTML(
                        label="Sweetviz Report",
                        value="Click the button above to generate the report",
                    )
                sweetviz_button.click(
                    fn=generate_sweetviz,
                    inputs=[current_df],
                    outputs=[sweetviz_output],
                )

            # Third Tab: Visual Analysis
            with gr.TabItem("Visual Analysis"):
                with gr.Row():
                    with gr.Column(scale=2):
                        viz_button = gr.Button(
                            "Generate Visualizations",
                            variant="primary",
                        )
                    with gr.Column(scale=1):
                        gr.Markdown(
                            "### Visualization Features\n"
                            "- Distribution plots\n"
                            "- Correlation analysis\n"
                            "- Categorical variable analysis\n"
                            "- Statistical summaries"
                        )
                with gr.Row():
                    viz_output = gr.HTML(
                        label="Visualization Report",
                        value="Click the button above to generate visualizations",
                    )
                viz_button.click(
                    fn=generate_viz,
                    inputs=[current_df],
                    outputs=[viz_output],
                )

    return demo
if __name__ == "__main__":
    # Script entry point: build the dashboard and serve it.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        "share": False,  # Set to True if you want to create a public link
    }
    demo = create_interface()
    demo.launch(**launch_options)