Spaces:

baconnier
/

csv-plus-plus

Sleeping

App Files Files Community

baconnier committed on Oct 26, 2024

Commit

96823ba

•

1 Parent(s): f421e23

Update app.py

Browse files

Files changed (1) hide show

app.py +93 -108

app.py CHANGED Viewed

@@ -12,6 +12,9 @@ import shutil
 import warnings
 import io
 import base64
 warnings.filterwarnings('ignore')
 class DataAnalyzer:
@@ -19,7 +22,55 @@ class DataAnalyzer:
         self.temp_dir = tempfile.mkdtemp()
         self.df = None
         self.AV = AutoViz_Class()
     def generate_sweetviz_report(self, df):
         if df is None:
             return "Please upload a dataset first"
@@ -47,71 +98,25 @@ class DataAnalyzer:
         os.remove(report_path)
         return html_with_table
-    def preprocess_dataframe(self, df):
-        df = df.copy()
-        # Convert 'value' column to numeric if possible
-        if 'value' in df.columns:
-            df['value'] = pd.to_numeric(df['value'].replace('[\$,]', '', regex=True), errors='coerce')
-        # Handle datetime columns
-        for col in df.columns:
-            if df[col].dtype == 'object':
-                try:
-                    df[col] = pd.to_datetime(df[col], errors='ignore')
-                except:
-                    pass
-        # Convert categorical columns with low cardinality
-        for col in df.select_dtypes(include=['object']).columns:
-            if df[col].nunique() < 50:
-                df[col] = df[col].astype('category')
-        return df
     def generate_autoviz_report(self, df):
         if df is None:
             return "Please upload a dataset first"
-        viz_temp_dir = os.path.join(self.temp_dir, "autoviz_output")
-        if os.path.exists(viz_temp_dir):
-            shutil.rmtree(viz_temp_dir)
-        os.makedirs(viz_temp_dir)
         try:
             # Preprocess the dataframe
-            df = self.preprocess_dataframe(df)
             # Sample if needed
             if len(df) > 5000:
                 df = df.sample(n=5000, random_state=42)
-            print("\nDataset Info:")
-            print(df.info())
-            print("\nColumn Types:")
-            print(df.dtypes)
-            plt.close('all')
-            # Create a directory for plots
-            plots_dir = os.path.join(viz_temp_dir, "plots")
-            os.makedirs(plots_dir, exist_ok=True)
-            # Run AutoViz
-            dfte = self.AV.AutoViz(
-                filename='',
-                sep=',',
-                depVar='value',  # Set value as target variable
-                dfte=df,
-                header=0,
-                verbose=1,
-                lowess=False,
-                chart_format='html',
-                max_rows_analyzed=5000,
-                max_cols_analyzed=30,
-                save_plot_dir=plots_dir
-            )
             # Generate summary statistics
             numeric_cols = df.select_dtypes(include=['number']).columns
             categorical_cols = df.select_dtypes(include=['category', 'object']).columns
@@ -138,20 +143,20 @@ class DataAnalyzer:
                 .table th {
                     background-color: #f8f9fa;
                 }
-                pre {
-                    background-color: #f8f9fa;
-                    padding: 1rem;
-                    border-radius: 4px;
-                }
-                .viz-container {
                     margin: 20px 0;
-                    padding: 20px;
                     border: 1px solid #ddd;
                     border-radius: 5px;
                 }
             </style>
             """
             html_content += f"""
             <div class="viz-container">
                 <h2 style="text-align: center;">Data Analysis Report</h2>
@@ -170,26 +175,18 @@ class DataAnalyzer:
                     <div style="overflow-x: auto;">
                         {categorical_stats.to_html(classes='table table-striped')}
                     </div>
-                    <h3>Column Types</h3>
-                    <pre>{df.dtypes.to_string()}</pre>
                 </div>
             """
-            # Add plots if they exist
-            if os.path.exists(plots_dir):
-                for file in sorted(os.listdir(plots_dir)):
-                    if file.endswith('.html'):
-                        with open(os.path.join(plots_dir, file), 'r', encoding='utf-8') as f:
-                            plot_content = f.read()
-                            if plot_content.strip():
-                                html_content += f"""
-                                <div class="viz-container">
-                                    <h3>{file.replace('.html', '').replace('_', ' ').title()}</h3>
-                                    {plot_content}
-                                </div>
-                                """
             html_content += "</div>"
             return html_content
@@ -197,24 +194,13 @@ class DataAnalyzer:
             import traceback
             error_message = f"""
             <div style="padding: 20px; border: 1px solid red; border-radius: 5px;">
-                <h3>Error in AutoViz Analysis</h3>
                 <p>Error details: {str(e)}</p>
                 <p>Stack trace:</p>
                 <pre>{traceback.format_exc()}</pre>
-                <p>Dataset Info:</p>
-                <pre>
-                Rows: {len(df)}
-                Columns: {len(df.columns)}
-                Types:\n{df.dtypes.to_string()}
-                </pre>
             </div>
             """
             return error_message
-        finally:
-            if os.path.exists(viz_temp_dir):
-                shutil.rmtree(viz_temp_dir)
 def create_interface():
     analyzer = DataAnalyzer()
@@ -314,42 +300,41 @@ def create_interface():
                     outputs=[sweetviz_output]
                 )
-            # Third Tab: AutoViz Analysis
-            with gr.TabItem("AutoViz Analysis"):
                 with gr.Row():
                     with gr.Column(scale=2):
-                        autoviz_button = gr.Button(
-                            "Generate AutoViz Report",
                             variant="primary"
                         )
                     with gr.Column(scale=1):
                         gr.Markdown("""
-                        ### AutoViz Analysis Features
-                        - Automated visualization generation
-                        - Distribution analysis
-                        - Correlation plots
-                        - Feature relationships
-                        - Time series analysis (if applicable)
                         """)
                 with gr.Row():
-                    autoviz_output = gr.HTML(
-                        label="AutoViz Report",
-                        value="Click the button above to generate the report"
                     )
-                def generate_autoviz(df):
                     if df is None:
                         return "Please upload a dataset first"
                     try:
                         return analyzer.generate_autoviz_report(df)
                     except Exception as e:
-                        return f"Error generating AutoViz report: {str(e)}"
-                autoviz_button.click(
-                    fn=generate_autoviz,
                     inputs=[current_df],
-                    outputs=[autoviz_output]
                 )
     return demo

 import warnings
 import io
 import base64
+from pathlib import Path
+import matplotlib
+matplotlib.use('Agg')
 warnings.filterwarnings('ignore')
 class DataAnalyzer:
         self.temp_dir = tempfile.mkdtemp()
         self.df = None
         self.AV = AutoViz_Class()
+        self.plots_memory = {}  # Store plots in memory
+    def save_plot_to_memory(self, fig, plot_name):
+        """Save matplotlib figure to memory as base64"""
+        buf = io.BytesIO()
+        fig.savefig(buf, format='png', bbox_inches='tight')
+        buf.seek(0)
+        img_str = base64.b64encode(buf.getvalue()).decode()
+        self.plots_memory[plot_name] = f'data:image/png;base64,{img_str}'
+        plt.close(fig)
+    def generate_basic_plots(self, df):
+        """Generate basic matplotlib plots"""
+        # Numeric columns distribution
+        numeric_cols = df.select_dtypes(include=['number']).columns
+        for col in numeric_cols:
+            fig, ax = plt.subplots(figsize=(10, 6))
+            df[col].hist(bins=30, ax=ax)
+            ax.set_title(f'Distribution of {col}')
+            self.save_plot_to_memory(fig, f'dist_{col}')
+            # Box plot
+            fig, ax = plt.subplots(figsize=(10, 6))
+            df.boxplot(column=col, ax=ax)
+            ax.set_title(f'Box Plot of {col}')
+            self.save_plot_to_memory(fig, f'box_{col}')
+        # Categorical columns
+        categorical_cols = df.select_dtypes(include=['category', 'object']).columns
+        for col in categorical_cols:
+            if df[col].nunique() < 20:  # Only for columns with reasonable number of categories
+                fig, ax = plt.subplots(figsize=(12, 6))
+                df[col].value_counts().plot(kind='bar', ax=ax)
+                ax.set_title(f'Distribution of {col}')
+                plt.xticks(rotation=45)
+                self.save_plot_to_memory(fig, f'cat_{col}')
+        # Correlation matrix for numeric columns
+        if len(numeric_cols) > 1:
+            fig, ax = plt.subplots(figsize=(10, 8))
+            correlation_matrix = df[numeric_cols].corr()
+            im = ax.imshow(correlation_matrix)
+            ax.set_xticks(range(len(numeric_cols)))
+            ax.set_yticks(range(len(numeric_cols)))
+            ax.set_xticklabels(numeric_cols, rotation=45)
+            ax.set_yticklabels(numeric_cols)
+            plt.colorbar(im)
+            ax.set_title('Correlation Matrix')
+            self.save_plot_to_memory(fig, 'correlation_matrix')
     def generate_sweetviz_report(self, df):
         if df is None:
             return "Please upload a dataset first"
         os.remove(report_path)
         return html_with_table
     def generate_autoviz_report(self, df):
         if df is None:
             return "Please upload a dataset first"
         try:
             # Preprocess the dataframe
+            df = df.copy()
+            # Convert 'value' column to numeric if possible
+            if 'value' in df.columns:
+                df['value'] = pd.to_numeric(df['value'].replace('[\$,]', '', regex=True), errors='coerce')
             # Sample if needed
             if len(df) > 5000:
                 df = df.sample(n=5000, random_state=42)
+            # Generate basic plots
+            self.generate_basic_plots(df)
             # Generate summary statistics
             numeric_cols = df.select_dtypes(include=['number']).columns
             categorical_cols = df.select_dtypes(include=['category', 'object']).columns
                 .table th {
                     background-color: #f8f9fa;
                 }
+                .plot-container {
                     margin: 20px 0;
+                    padding: 10px;
                     border: 1px solid #ddd;
                     border-radius: 5px;
                 }
+                .plot-container img {
+                    max-width: 100%;
+                    height: auto;
+                }
             </style>
             """
+            # Add summary statistics
             html_content += f"""
             <div class="viz-container">
                 <h2 style="text-align: center;">Data Analysis Report</h2>
                     <div style="overflow-x: auto;">
                         {categorical_stats.to_html(classes='table table-striped')}
                     </div>
                 </div>
             """
+            # Add plots from memory
+            for plot_name, plot_data in self.plots_memory.items():
+                html_content += f"""
+                <div class="plot-container">
+                    <h3>{plot_name.replace('_', ' ').title()}</h3>
+                    <img src="{plot_data}" alt="{plot_name}">
+                </div>
+                """
             html_content += "</div>"
             return html_content
             import traceback
             error_message = f"""
             <div style="padding: 20px; border: 1px solid red; border-radius: 5px;">
+                <h3>Error in Analysis</h3>
                 <p>Error details: {str(e)}</p>
                 <p>Stack trace:</p>
                 <pre>{traceback.format_exc()}</pre>
             </div>
             """
             return error_message
 def create_interface():
     analyzer = DataAnalyzer()
                     outputs=[sweetviz_output]
                 )
+            # Third Tab: Visual Analysis
+            with gr.TabItem("Visual Analysis"):
                 with gr.Row():
                     with gr.Column(scale=2):
+                        viz_button = gr.Button(
+                            "Generate Visualizations",
                             variant="primary"
                         )
                     with gr.Column(scale=1):
                         gr.Markdown("""
+                        ### Visualization Features
+                        - Distribution plots
+                        - Correlation analysis
+                        - Categorical variable analysis
+                        - Statistical summaries
                         """)
                 with gr.Row():
+                    viz_output = gr.HTML(
+                        label="Visualization Report",
+                        value="Click the button above to generate visualizations"
                     )
+                def generate_viz(df):
                     if df is None:
                         return "Please upload a dataset first"
                     try:
                         return analyzer.generate_autoviz_report(df)
                     except Exception as e:
+                        return f"Error generating visualizations: {str(e)}"
+                viz_button.click(
+                    fn=generate_viz,
                     inputs=[current_df],
+                    outputs=[viz_output]
                 )
     return demo