"""Gradio dashboard: Sweetviz report plus UMAP view of encoded categorical columns."""

import base64
import html
import io
import os
import shutil
import tempfile
import weakref

import category_encoders as ce
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sweetviz as sv
import umap
from sklearn.preprocessing import StandardScaler

# Figures are only ever rendered to an in-memory PNG, so use the
# non-interactive Agg backend; GUI backends are not meant to be driven from
# the worker threads that UI event handlers run on.
plt.switch_backend("Agg")


class DataAnalyzer:
    """Hold the uploaded DataFrame and produce reports/plots from it.

    Attributes:
        temp_dir: Scratch directory used for intermediate files. It is removed
            when the analyzer is garbage-collected or at interpreter exit.
        df: The most recently analysed DataFrame, or ``None`` before any upload.
    """

    # Encoder name -> encoder class. Classes (not instances) are stored so
    # that only the requested encoder is ever constructed.
    ENCODERS = {
        'binary': ce.BinaryEncoder,
        'onehot': ce.OneHotEncoder,
        'catboost': ce.CatBoostEncoder,
        'count': ce.CountEncoder,
    }

    def __init__(self):
        self.temp_dir = tempfile.mkdtemp()
        self.df = None
        # mkdtemp() never cleans up after itself; tie removal of the
        # directory to the lifetime of this object.
        self._finalizer = weakref.finalize(
            self, shutil.rmtree, self.temp_dir, ignore_errors=True
        )

    def generate_sweetviz_report(self, df):
        """Run Sweetviz on *df* and return the report HTML as a string.

        The DataFrame is also stored on ``self.df`` so that
        ``encode_and_visualize`` can use it afterwards.

        Args:
            df: DataFrame to analyse.

        Returns:
            The generated HTML document surrounded by a leading and a
            trailing newline.
        """
        self.df = df  # Store DataFrame for other analyses
        report = sv.analyze(df)
        report_path = os.path.join(self.temp_dir, "report.html")

        try:
            report.show_html(report_path, open_browser=False)
            with open(report_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
        finally:
            # Remove the intermediate file even if rendering or reading failed.
            if os.path.exists(report_path):
                os.remove(report_path)

        # NOTE(review): the wrapper only adds surrounding newlines; presumably
        # additional markup was intended here - confirm.
        html_with_table = f"""
{html_content}
"""
        return html_with_table

    def encode_and_visualize(self, column_name, encoder_type='binary'):
        """Encode one column, project it to 2-D with UMAP and plot the result.

        Args:
            column_name: Name of the column in the stored DataFrame to encode.
            encoder_type: Key of ``ENCODERS`` selecting the encoder.

        Returns:
            An ``io.BytesIO`` positioned at offset 0 containing the plot as
            PNG, or ``None`` if no DataFrame is loaded or the column does not
            exist in it.

        Raises:
            ValueError: If *encoder_type* is not a key of ``ENCODERS``.
        """
        if self.df is None or column_name not in self.df.columns:
            return None

        encoder_cls = self.ENCODERS.get(encoder_type)
        if encoder_cls is None:
            raise ValueError(
                f"Unknown encoder type {encoder_type!r}; "
                f"expected one of {sorted(self.ENCODERS)}"
            )

        # Create DataFrame with only the selected column
        df_subset = self.df[[column_name]].copy()

        # NOTE(review): no target is passed to the encoder; encoders that
        # require one (presumably 'catboost') will raise here - confirm.
        encoded_df = encoder_cls().fit_transform(df_subset)

        # Scale the encoded features
        scaled_data = StandardScaler().fit_transform(encoded_df)

        # Apply UMAP
        reducer = umap.UMAP(
            n_neighbors=15,
            min_dist=0.1,
            n_components=2,
            random_state=42,
        )
        embedding = reducer.fit_transform(scaled_data)

        # Draw on an explicit figure/axes pair rather than pyplot's implicit
        # "current figure", and always close the figure so a failure while
        # plotting or saving does not leak it.
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            scatter = ax.scatter(
                embedding[:, 0],
                embedding[:, 1],
                c=pd.factorize(df_subset[column_name])[0],
                cmap='viridis',
                alpha=0.6,
            )
            fig.colorbar(scatter, ax=ax)
            ax.set_title(
                f'UMAP visualization of {column_name}\nusing {encoder_type} encoding'
            )
            ax.set_xlabel('UMAP1')
            ax.set_ylabel('UMAP2')

            # Save plot to bytes
            buf = io.BytesIO()
            fig.savefig(buf, format='png', bbox_inches='tight')
        finally:
            plt.close(fig)

        buf.seek(0)
        return buf


def create_interface():
    """Build the Gradio Blocks app and wire its event handlers.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    analyzer = DataAnalyzer()

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Data Analysis Dashboard")

        with gr.Tabs():
            with gr.TabItem("Sweetviz Analysis"):
                file_input = gr.File(label="Upload CSV")
                report_html = gr.HTML()

            with gr.TabItem("Categorical Analysis"):
                with gr.Row():
                    column_dropdown = gr.Dropdown(
                        label="Select Categorical Column",
                        choices=[],
                        interactive=True,
                    )
                    encoder_dropdown = gr.Dropdown(
                        label="Select Encoder",
                        choices=list(DataAnalyzer.ENCODERS),
                        value='binary',
                        interactive=True,
                    )
                plot_output = gr.Image(label="UMAP Visualization")

        def process_file(file):
            """Load the uploaded CSV; return (report HTML, column dropdown update)."""
            if file is None:
                return None, gr.Dropdown(choices=[])

            # Accept either an object exposing ``.name`` or a plain path string.
            csv_path = getattr(file, "name", file)
            try:
                df = pd.read_csv(csv_path)
                # Get categorical columns
                cat_columns = df.select_dtypes(
                    include=['object', 'category']
                ).columns.tolist()
                return (
                    analyzer.generate_sweetviz_report(df),
                    gr.Dropdown(choices=cat_columns),
                )
            except Exception as e:
                # The message is rendered by an HTML component, so escape it.
                message = html.escape(str(e))
                return f"Error generating report: {message}", gr.Dropdown(choices=[])

        def update_plot(column, encoder_type):
            """Return a PNG file path for the selected column/encoder, or None."""
            if column is None:
                return None

            try:
                plot_bytes = analyzer.encode_and_visualize(column, encoder_type)
            except Exception as e:
                # Surface the failure in the UI instead of silently showing
                # an empty image.
                raise gr.Error(f"Could not generate UMAP plot: {e}") from e

            if plot_bytes is None:
                return None

            # gr.Image takes a numpy array, PIL image or file path - not a
            # BytesIO - so persist the PNG in the analyzer's scratch directory
            # and hand over the path.
            fd, plot_path = tempfile.mkstemp(suffix=".png", dir=analyzer.temp_dir)
            with os.fdopen(fd, "wb") as f:
                f.write(plot_bytes.getvalue())
            return plot_path

        file_input.change(
            fn=process_file,
            inputs=[file_input],
            outputs=[report_html, column_dropdown],
        )

        column_dropdown.change(
            fn=update_plot,
            inputs=[column_dropdown, encoder_dropdown],
            outputs=[plot_output],
        )

        encoder_dropdown.change(
            fn=update_plot,
            inputs=[column_dropdown, encoder_dropdown],
            outputs=[plot_output],
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch(show_error=True)