# csv-plus-plus / app.py
# Source: Hugging Face Space file view (uploader: baconnier).
# Commit 771365f (verified): "Update app.py". File size: 5.37 kB.
import gradio as gr
import pandas as pd
import sweetviz as sv
import tempfile
import os
import category_encoders as ce
import umap
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import numpy as np
import io
import base64
class DataAnalyzer:
    """Build a Sweetviz report and UMAP plots of encoded categorical columns.

    The DataFrame handed to :meth:`generate_sweetviz_report` is stored on the
    instance so that :meth:`encode_and_visualize` can reuse it later.
    """

    # Encoder names accepted by ``encode_and_visualize``.
    SUPPORTED_ENCODERS = ('binary', 'onehot', 'catboost', 'count')

    def __init__(self):
        # Scratch directory that holds the intermediate Sweetviz HTML file.
        self.temp_dir = tempfile.mkdtemp()
        # Most recently analysed DataFrame; None until a report is generated.
        self.df = None

    def generate_sweetviz_report(self, df):
        """Run Sweetviz on *df* and return the report wrapped in an HTML table.

        Args:
            df: DataFrame to analyse. It is also stored on ``self.df``.

        Returns:
            A string containing the generated report HTML embedded in a
            single-cell table with a scrollable 800px-high container.
        """
        self.df = df  # Store DataFrame for other analyses
        report = sv.analyze(df)
        report_path = os.path.join(self.temp_dir, "report.html")
        try:
            report.show_html(report_path, open_browser=False)
            with open(report_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
        finally:
            # Remove the intermediate file even when rendering or reading
            # fails; guard the removal so a missing file does not mask the
            # original exception.
            if os.path.exists(report_path):
                os.remove(report_path)

        return (
            '\n<table width="100%" style="border-collapse: collapse;">\n'
            '<tr>\n'
            '<td style="padding: 20px; border: 1px solid #ddd;">\n'
            '<div style="height: 800px; overflow: auto;">\n'
            f'{html_content}\n'
            '</div>\n'
            '</td>\n'
            '</tr>\n'
            '</table>\n'
        )

    def encode_and_visualize(self, column_name, encoder_type='binary'):
        """Encode one column, project it with UMAP and render a scatter plot.

        Args:
            column_name: Name of the column of ``self.df`` to encode.
            encoder_type: One of ``SUPPORTED_ENCODERS``.

        Returns:
            An ``io.BytesIO`` positioned at offset 0 that contains the plot as
            PNG bytes, or ``None`` when no DataFrame has been stored yet or
            *column_name* is not one of its columns.

        Raises:
            ValueError: If *encoder_type* is not a supported encoder name.
        """
        if self.df is None or column_name not in self.df.columns:
            return None

        # Fail with a clear message instead of an AttributeError on None.
        if encoder_type not in self.SUPPORTED_ENCODERS:
            raise ValueError(
                f"Unknown encoder type {encoder_type!r}; "
                f"expected one of {self.SUPPORTED_ENCODERS}"
            )

        # Map names to classes so only the selected encoder is instantiated.
        encoder_classes = {
            'binary': ce.BinaryEncoder,
            'onehot': ce.OneHotEncoder,
            'catboost': ce.CatBoostEncoder,
            'count': ce.CountEncoder,
        }

        # Create DataFrame with only the selected column
        df_subset = self.df[[column_name]].copy()

        # NOTE(review): CatBoostEncoder is a target-based encoder and no
        # target is passed here - confirm that the 'catboost' option can work
        # without one.
        encoded_df = encoder_classes[encoder_type]().fit_transform(df_subset)

        # Scale the encoded features
        scaled_data = StandardScaler().fit_transform(encoded_df)

        # Apply UMAP
        reducer = umap.UMAP(
            n_neighbors=15,
            min_dist=0.1,
            n_components=2,
            random_state=42,
        )
        embedding = reducer.fit_transform(scaled_data)

        # Use an explicit figure rather than pyplot's implicit "current
        # figure", and always close it so a failure while drawing or saving
        # does not leak the figure.
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            scatter = ax.scatter(
                embedding[:, 0],
                embedding[:, 1],
                # One integer code per distinct value drives the colour map.
                c=pd.factorize(df_subset[column_name])[0],
                cmap='viridis',
                alpha=0.6,
            )
            fig.colorbar(scatter, ax=ax)
            ax.set_title(
                f'UMAP visualization of {column_name}\nusing {encoder_type} encoding'
            )
            ax.set_xlabel('UMAP1')
            ax.set_ylabel('UMAP2')

            # Save plot to bytes
            buf = io.BytesIO()
            fig.savefig(buf, format='png', bbox_inches='tight')
        finally:
            plt.close(fig)

        buf.seek(0)
        return buf
def create_interface():
    """Build the Gradio dashboard and wire up its event handlers.

    The UI has two tabs: one that renders a Sweetviz report for an uploaded
    CSV, and one that plots a UMAP projection of an encoded categorical
    column from the same CSV.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    # NOTE(review): a single analyzer is created per interface, so the stored
    # DataFrame looks shared by every visitor of the app - confirm that this
    # is intended.
    analyzer = DataAnalyzer()

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Data Analysis Dashboard")
        with gr.Tabs():
            with gr.TabItem("Sweetviz Analysis"):
                file_input = gr.File(label="Upload CSV")
                report_html = gr.HTML()
            with gr.TabItem("Categorical Analysis"):
                with gr.Row():
                    column_dropdown = gr.Dropdown(
                        label="Select Categorical Column",
                        choices=[],
                        interactive=True
                    )
                    encoder_dropdown = gr.Dropdown(
                        label="Select Encoder",
                        choices=['binary', 'onehot', 'catboost', 'count'],
                        value='binary',
                        interactive=True
                    )
                plot_output = gr.Image(label="UMAP Visualization")

        def process_file(file):
            """Return (report HTML, dropdown update) for the uploaded CSV."""
            if file is None:
                return None, gr.Dropdown(choices=[])
            try:
                # Accept either an object exposing ``.name`` or a plain path.
                csv_path = getattr(file, "name", file)
                df = pd.read_csv(csv_path)
                # Get categorical columns
                cat_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
                return analyzer.generate_sweetviz_report(df), gr.Dropdown(choices=cat_columns)
            except Exception as e:
                return f"Error generating report: {str(e)}", gr.Dropdown(choices=[])

        def update_plot(column, encoder_type):
            """Return the UMAP plot as an image array, or None if unavailable."""
            if column is None:
                return None
            try:
                plot_buffer = analyzer.encode_and_visualize(column, encoder_type)
                if plot_buffer is None:
                    return None
                if not hasattr(plot_buffer, "read"):
                    # Already something other than a byte stream; pass through.
                    return plot_buffer
                # gr.Image cannot display a BytesIO object, so decode the PNG
                # bytes into a pixel array first.
                pixels = plt.imread(plot_buffer, format="png")
                if pixels.dtype.kind == "f":
                    # Decoded PNGs come back as floats in [0, 1]; convert to
                    # 8-bit so the image component gets an unambiguous range.
                    pixels = (pixels * 255).round().astype(np.uint8)
                return pixels
            except Exception as e:
                # Surface the failure to the user instead of silently showing
                # an empty image.
                gr.Warning(f"Could not generate plot: {e}")
                return None

        file_input.change(
            fn=process_file,
            inputs=[file_input],
            outputs=[report_html, column_dropdown]
        )
        column_dropdown.change(
            fn=update_plot,
            inputs=[column_dropdown, encoder_dropdown],
            outputs=[plot_output]
        )
        encoder_dropdown.change(
            fn=update_plot,
            inputs=[column_dropdown, encoder_dropdown],
            outputs=[plot_output]
        )
    return demo
if __name__ == "__main__":
    # Build the dashboard and start the Gradio server when run as a script.
    # The guarded statements must be indented under the ``if``; without the
    # indentation the module does not parse.
    demo = create_interface()
    demo.launch(show_error=True)