Spaces:

baconnier
/

csv-plus-plus

Running

App Files Community

baconnier committed on Oct 26, 2024

Commit

8c15039

verified ·

1 Parent(s): b2f41cc

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -45

app.py CHANGED Viewed

@@ -45,6 +45,40 @@ class DataAnalyzer:
         os.remove(report_path)
         return html_with_table
     def generate_autoviz_report(self, df):
         if df is None:
             return "Please upload a dataset first"
@@ -55,40 +89,13 @@ class DataAnalyzer:
         os.makedirs(viz_temp_dir)
         try:
-            # Data preprocessing
-            df = df.copy()
-            # Handle datetime columns
-            for col in df.columns:
-                try:
-                    df[col] = pd.to_datetime(df[col], errors='ignore')
-                except:
-                    pass
-            datetime_columns = df.select_dtypes(include=['datetime64']).columns
-            for col in datetime_columns:
-                df[f'{col}_year'] = df[col].dt.year
-                df[f'{col}_month'] = df[col].dt.month
-                df = df.drop(columns=[col])
-            # Try to convert string columns to numeric where possible
-            for col in df.select_dtypes(include=['object']).columns:
-                try:
-                    df[col] = pd.to_numeric(df[col], errors='ignore')
-                except:
-                    pass
-            # Convert remaining string columns to categorical if cardinality is low
-            object_columns = df.select_dtypes(include=['object']).columns
-            for col in object_columns:
-                if df[col].nunique() < 50:
-                    df[col] = df[col].astype('category')
-            # Sample data if needed
             if len(df) > 5000:
                 df = df.sample(n=5000, random_state=42)
-            # Print data info for debugging
             print("\nDataset Info:")
             print(df.info())
             print("\nColumn Types:")
@@ -96,16 +103,16 @@ class DataAnalyzer:
             plt.close('all')
-            # Run AutoViz
             dfte = self.AV.AutoViz(
                 filename='',
                 sep=',',
-                depVar='',
                 dfte=df,
                 header=0,
                 verbose=1,
                 lowess=False,
-                chart_format='svg',
                 max_rows_analyzed=5000,
                 max_cols_analyzed=30,
                 save_plot_dir=viz_temp_dir
@@ -115,7 +122,7 @@ class DataAnalyzer:
             html_parts = []
             if os.path.exists(viz_temp_dir):
                 for file in sorted(os.listdir(viz_temp_dir)):
-                    if file.endswith('.html') or file.endswith('.svg'):
                         file_path = os.path.join(viz_temp_dir, file)
                         try:
                             with open(file_path, 'r', encoding='utf-8') as f:
@@ -125,22 +132,23 @@ class DataAnalyzer:
                         except Exception as e:
                             print(f"Error reading file {file}: {str(e)}")
             if not html_parts:
                 return f"""
                 <div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
                     <h3>Data Summary</h3>
                     <p>Total Rows: {len(df)}</p>
                     <p>Total Columns: {len(df.columns)}</p>
-                    <p>Column Types:</p>
-                    <pre>{df.dtypes.to_string()}</pre>
                     <hr>
-                    <h3>No visualizations were generated</h3>
-                    <p>This might be due to:</p>
-                    <ul>
-                        <li>All columns being categorical with high cardinality</li>
-                        <li>No numeric columns for analysis</li>
-                        <li>Data format not suitable for visualization</li>
-                    </ul>
                 </div>
                 """
@@ -151,8 +159,10 @@ class DataAnalyzer:
                     <h3>Dataset Summary</h3>
                     <p>Rows analyzed: {len(df)}</p>
                     <p>Columns: {len(df.columns)}</p>
-                    <p>Column Types:</p>
-                    <pre>{df.dtypes.to_string()}</pre>
                 </div>
                 <hr>
                 {'<hr>'.join(html_parts)}
@@ -197,6 +207,13 @@ def create_interface():
                 with gr.Row():
                     file_input = gr.File(label="Upload CSV")
                 data_preview = gr.Dataframe(label="Data Preview", interactive=False)
                 def load_data(file):
                     if file is None:
@@ -218,6 +235,14 @@ def create_interface():
                 with gr.Row():
                     sweetviz_button = gr.Button("Generate Sweetviz Report")
                 sweetviz_output = gr.HTML(label="Sweetviz Report")
                 def generate_sweetviz(df):
                     if df is None:
@@ -235,6 +260,14 @@ def create_interface():
                 with gr.Row():
                     autoviz_button = gr.Button("Generate AutoViz Report")
                 autoviz_output = gr.HTML(label="AutoViz Report")
                 def generate_autoviz(df):
                     if df is None:

         os.remove(report_path)
         return html_with_table
+    def preprocess_dataframe(self, df):
+        """Preprocess dataframe for visualization"""
+        df = df.copy()
+        # Convert 'value' column to numeric if possible
+        try:
+            # Remove any currency symbols and commas
+            df['value'] = df['value'].replace('[\$,]', '', regex=True)
+            # Convert to float
+            df['value'] = pd.to_numeric(df['value'], errors='coerce')
+        except:
+            pass
+        # Handle datetime columns
+        for col in df.columns:
+            if df[col].dtype == 'object':
+                try:
+                    df[col] = pd.to_datetime(df[col], errors='ignore')
+                except:
+                    pass
+        datetime_columns = df.select_dtypes(include=['datetime64']).columns
+        for col in datetime_columns:
+            df[f'{col}_year'] = df[col].dt.year
+            df[f'{col}_month'] = df[col].dt.month
+            df = df.drop(columns=[col])
+        # Convert categorical columns with low cardinality
+        for col in df.select_dtypes(include=['object']).columns:
+            if df[col].nunique() < 50:
+                df[col] = df[col].astype('category')
+        return df
     def generate_autoviz_report(self, df):
         if df is None:
             return "Please upload a dataset first"
         os.makedirs(viz_temp_dir)
         try:
+            # Preprocess the dataframe
+            df = self.preprocess_dataframe(df)
+            # Sample if needed
             if len(df) > 5000:
                 df = df.sample(n=5000, random_state=42)
             print("\nDataset Info:")
             print(df.info())
             print("\nColumn Types:")
             plt.close('all')
+            # Run AutoViz with modified settings
             dfte = self.AV.AutoViz(
                 filename='',
                 sep=',',
+                depVar='value',  # Set value as target variable
                 dfte=df,
                 header=0,
                 verbose=1,
                 lowess=False,
+                chart_format='html',  # Changed back to html
                 max_rows_analyzed=5000,
                 max_cols_analyzed=30,
                 save_plot_dir=viz_temp_dir
             html_parts = []
             if os.path.exists(viz_temp_dir):
                 for file in sorted(os.listdir(viz_temp_dir)):
+                    if file.endswith('.html'):
                         file_path = os.path.join(viz_temp_dir, file)
                         try:
                             with open(file_path, 'r', encoding='utf-8') as f:
                         except Exception as e:
                             print(f"Error reading file {file}: {str(e)}")
+            # Generate summary statistics
+            numeric_summary = df.describe().to_html() if not df.select_dtypes(include=['number']).empty else ""
+            categorical_summary = df.describe(include=['category', 'object']).to_html() if not df.select_dtypes(include=['category', 'object']).empty else ""
             if not html_parts:
                 return f"""
                 <div style="padding: 20px; border: 1px solid #ddd; border-radius: 5px;">
                     <h3>Data Summary</h3>
                     <p>Total Rows: {len(df)}</p>
                     <p>Total Columns: {len(df.columns)}</p>
+                    <h4>Numeric Summary:</h4>
+                    {numeric_summary}
+                    <h4>Categorical Summary:</h4>
+                    {categorical_summary}
                     <hr>
+                    <h3>Column Types:</h3>
+                    <pre>{df.dtypes.to_string()}</pre>
                 </div>
                 """
                     <h3>Dataset Summary</h3>
                     <p>Rows analyzed: {len(df)}</p>
                     <p>Columns: {len(df.columns)}</p>
+                    <h4>Numeric Summary:</h4>
+                    {numeric_summary}
+                    <h4>Categorical Summary:</h4>
+                    {categorical_summary}
                 </div>
                 <hr>
                 {'<hr>'.join(html_parts)}
                 with gr.Row():
                     file_input = gr.File(label="Upload CSV")
                 data_preview = gr.Dataframe(label="Data Preview", interactive=False)
+                with gr.Row():
+                    gr.Markdown("""
+                    ### Data Preview Info
+                    - Upload a CSV file to begin analysis
+                    - First few rows will be shown here
+                    - Data types and basic statistics will be displayed
+                    """)
                 def load_data(file):
                     if file is None:
                 with gr.Row():
                     sweetviz_button = gr.Button("Generate Sweetviz Report")
                 sweetviz_output = gr.HTML(label="Sweetviz Report")
+                with gr.Row():
+                    gr.Markdown("""
+                    ### Sweetviz Analysis Info
+                    - Comprehensive data profiling
+                    - Statistical analysis
+                    - Feature correlations
+                    - Missing value analysis
+                    """)
                 def generate_sweetviz(df):
                     if df is None:
                 with gr.Row():
                     autoviz_button = gr.Button("Generate AutoViz Report")
                 autoviz_output = gr.HTML(label="AutoViz Report")
+                with gr.Row():
+                    gr.Markdown("""
+                    ### AutoViz Analysis Info
+                    - Automated visualization generation
+                    - Distribution analysis
+                    - Correlation plots
+                    - Feature relationships
+                    """)
                 def generate_autoviz(df):
                     if df is None: