# Hugging Face Spaces status: Running
import base64
import io
import logging
import os
import tempfile

import category_encoders as ce
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sweetviz as sv
import umap
from sklearn.preprocessing import StandardScaler
class DataAnalyzer:
    """Hold an uploaded DataFrame and derive a report and plots from it.

    ``generate_sweetviz_report`` stores the DataFrame it receives so that
    ``encode_and_visualize`` can later operate on single columns of it.
    """

    # Encoder selector -> class name inside ``category_encoders``.  The class
    # is resolved lazily so that only the requested encoder is instantiated.
    _ENCODER_CLASS_NAMES = {
        'binary': 'BinaryEncoder',
        'onehot': 'OneHotEncoder',
        'catboost': 'CatBoostEncoder',
        'count': 'CountEncoder',
    }

    def __init__(self):
        # Scratch directory used for the intermediate Sweetviz HTML file.
        self.temp_dir = tempfile.mkdtemp()
        # Most recently analysed DataFrame; None until a report is generated.
        self.df = None

    def generate_sweetviz_report(self, df):
        """Run Sweetviz on *df* and return the report as an HTML fragment.

        The DataFrame is remembered on ``self.df`` for later column analyses.
        The report is written to a temporary file, read back, and the file is
        removed again even if writing or reading fails.

        Args:
            df: DataFrame to analyse.

        Returns:
            A string containing the report HTML wrapped in a scrollable
            single-cell table.
        """
        self.df = df  # Store DataFrame for other analyses
        report = sv.analyze(df)
        report_path = os.path.join(self.temp_dir, "report.html")
        try:
            report.show_html(report_path, open_browser=False)
            with open(report_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
        finally:
            # Do not leave the report behind when show_html/read raises.
            if os.path.exists(report_path):
                os.remove(report_path)
        return f"""
        <table width="100%" style="border-collapse: collapse;">
        <tr>
        <td style="padding: 20px; border: 1px solid #ddd;">
        <div style="height: 800px; overflow: auto;">
        {html_content}
        </div>
        </td>
        </tr>
        </table>
        """

    def encode_and_visualize(self, column_name, encoder_type='binary'):
        """Encode one column, project it to 2-D with UMAP and plot it.

        Args:
            column_name: Name of the column in the stored DataFrame.
            encoder_type: One of ``'binary'``, ``'onehot'``, ``'catboost'``
                or ``'count'``.

        Returns:
            An ``io.BytesIO`` positioned at offset 0 that holds the PNG plot,
            or ``None`` when no DataFrame is stored or the column is missing.

        Raises:
            ValueError: If *encoder_type* is not a supported selector.
        """
        if self.df is None or column_name not in self.df.columns:
            return None

        encoder_class_name = self._ENCODER_CLASS_NAMES.get(encoder_type)
        if encoder_class_name is None:
            supported = ', '.join(sorted(self._ENCODER_CLASS_NAMES))
            raise ValueError(
                f"Unknown encoder_type {encoder_type!r}; expected one of: {supported}"
            )

        # Create DataFrame with only the selected column
        df_subset = self.df[[column_name]].copy()

        # NOTE(review): CatBoostEncoder is a target-based encoder and no
        # target is passed below, so the 'catboost' option presumably fails
        # at fit time -- confirm against the category_encoders docs.
        encoder = getattr(ce, encoder_class_name)()
        encoded_df = encoder.fit_transform(df_subset)

        # Scale the encoded features
        scaled_data = StandardScaler().fit_transform(encoded_df)

        # Apply UMAP
        reducer = umap.UMAP(
            n_neighbors=15,
            min_dist=0.1,
            n_components=2,
            random_state=42,
        )
        embedding = reducer.fit_transform(scaled_data)

        # Use an explicit figure instead of pyplot's implicit "current
        # figure", and close it even if drawing or saving raises.
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            scatter = ax.scatter(
                embedding[:, 0],
                embedding[:, 1],
                # Colour each point by the integer code of its category.
                c=pd.factorize(df_subset[column_name])[0],
                cmap='viridis',
                alpha=0.6,
            )
            fig.colorbar(scatter, ax=ax)
            ax.set_title(f'UMAP visualization of {column_name}\nusing {encoder_type} encoding')
            ax.set_xlabel('UMAP1')
            ax.set_ylabel('UMAP2')

            # Save plot to bytes
            buf = io.BytesIO()
            fig.savefig(buf, format='png', bbox_inches='tight')
        finally:
            plt.close(fig)
        buf.seek(0)
        return buf
def create_interface():
    """Build the Gradio dashboard and wire up its event handlers.

    The app has two tabs: one that renders a Sweetviz report for an uploaded
    CSV, and one that shows a UMAP plot of a selected categorical column
    under a selected encoder.

    Returns:
        The assembled ``gr.Blocks`` app (not launched).
    """
    logger = logging.getLogger(__name__)
    analyzer = DataAnalyzer()

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Data Analysis Dashboard")
        with gr.Tabs():
            with gr.TabItem("Sweetviz Analysis"):
                file_input = gr.File(label="Upload CSV")
                report_html = gr.HTML()
            with gr.TabItem("Categorical Analysis"):
                with gr.Row():
                    column_dropdown = gr.Dropdown(
                        label="Select Categorical Column",
                        choices=[],
                        interactive=True,
                    )
                    encoder_dropdown = gr.Dropdown(
                        label="Select Encoder",
                        choices=['binary', 'onehot', 'catboost', 'count'],
                        value='binary',
                        interactive=True,
                    )
                plot_output = gr.Image(label="UMAP Visualization")

        def process_file(file):
            """Return (report HTML, column-dropdown update) for an upload."""
            if file is None:
                return None, gr.Dropdown(choices=[], value=None)
            try:
                # The upload may arrive as a plain path string or as an
                # object exposing the path via ``.name``; accept both.
                csv_path = getattr(file, "name", file)
                df = pd.read_csv(csv_path)
                # Get categorical columns
                cat_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
                report = analyzer.generate_sweetviz_report(df)
            except Exception as e:
                logger.exception("Failed to generate report")
                return f"Error generating report: {str(e)}", gr.Dropdown(choices=[], value=None)
            # Reset the selection so a column name from a previous upload
            # does not linger against the new data.
            return report, gr.Dropdown(choices=cat_columns, value=None)

        def update_plot(column, encoder_type):
            """Return the UMAP plot for *column* as an image array, or None."""
            if column is None:
                return None
            try:
                plot_bytes = analyzer.encode_and_visualize(column, encoder_type)
            except Exception as e:
                # Surface the failure in the UI instead of showing a blank
                # image with no explanation.
                logger.exception("Failed to build UMAP plot for column %r", column)
                raise gr.Error(f"Could not generate UMAP plot: {e}") from e
            if plot_bytes is None:
                return None
            # gr.Image does not accept file-like objects; decode the PNG
            # buffer into an array.  imread yields floats in [0, 1] for PNG,
            # so convert to 8-bit explicitly.
            image = plt.imread(plot_bytes, format='png')
            return (image * 255).round().astype(np.uint8)

        file_input.change(
            fn=process_file,
            inputs=[file_input],
            outputs=[report_html, column_dropdown],
        )
        # Either dropdown changing re-renders the plot from both selections.
        for trigger in (column_dropdown, encoder_dropdown):
            trigger.change(
                fn=update_plot,
                inputs=[column_dropdown, encoder_dropdown],
                outputs=[plot_output],
            )
    return demo
if __name__ == "__main__":
    # Script entry point: build the dashboard and serve it, letting Gradio
    # display handler errors in the UI (show_error=True).
    demo = create_interface()
    demo.launch(show_error=True)