Upload 5 files
- app.py +93 -0
- clean.py +295 -0
- llm_prompts.py +123 -0
- manage_schema.py +51 -0
- report.py +208 -0
app.py
ADDED
@@ -0,0 +1,93 @@
import gradio as gr
import pandas as pd
from clean import clean_data
from report import create_full_report, REPORT_DIR
import os
import tempfile


def clean_and_visualize(file, progress=gr.Progress()):
    # Load the data
    df = pd.read_csv(file.name)

    # Clean the data
    cleaned_df = None
    nonconforming_cells_before = None
    process_times = None
    removed_columns = None
    removed_rows = None

    for progress_value, status_text in clean_data(df):
        if isinstance(status_text, tuple):
            cleaned_df, nonconforming_cells_before, process_times, removed_columns, removed_rows = status_text
            progress(progress_value, desc="Cleaning completed")
        else:
            progress(progress_value, desc=status_text)

    # Generate full visualization report
    create_full_report(
        df,
        cleaned_df,
        nonconforming_cells_before,
        process_times,
        removed_columns,
        removed_rows
    )

    # Save cleaned DataFrame to a temporary CSV file
    with tempfile.NamedTemporaryFile(delete=False, suffix='.csv') as tmp_file:
        cleaned_df.to_csv(tmp_file.name, index=False)
        cleaned_csv_path = tmp_file.name

    # Collect all generated images
    image_files = [os.path.join(REPORT_DIR, f) for f in os.listdir(REPORT_DIR) if f.endswith('.png')]

    return cleaned_csv_path, image_files


def launch_app():
    with gr.Blocks() as app:
        gr.Markdown("# Data Cleaning and Visualization App")

        with gr.Row():
            file_input = gr.File(label="Upload CSV File")

        with gr.Row():
            clean_button = gr.Button("Start Cleaning")

        with gr.Row():
            progress_bar = gr.Progress()

        with gr.Row():
            download_button = gr.Button("Download Cleaned CSV", visible=False)
            cleaned_file_output = gr.File(label="Cleaned CSV", visible=False)

        with gr.Row():
            output_gallery = gr.Gallery(label="Visualization Results", show_label=True, elem_id="gallery", columns=[2],
                                        rows=[2], object_fit="contain", height="auto")

        def process_and_show_download(file):
            cleaned_csv_path, image_files = clean_and_visualize(file, progress=progress_bar)
            return (
                gr.Button.update(visible=True),
                gr.File.update(value=cleaned_csv_path, visible=True),
                image_files
            )

        clean_button.click(
            fn=process_and_show_download,
            inputs=file_input,
            outputs=[download_button, cleaned_file_output, output_gallery]
        )

        def trigger_download():
            return gr.File.update(visible=True)

        download_button.click(
            fn=trigger_download,
            inputs=[],
            outputs=[cleaned_file_output]
        )

    app.launch()


if __name__ == "__main__":
    launch_app()

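The Gradio UI above is the only entry point in this upload. As a rough illustration of the same flow without the UI, the sketch below drives clean_data and create_full_report from the command line; the file name run_cleaning.py, the output name cleaned.csv, and the argument handling are assumptions, not part of the commit.

# run_cleaning.py -- hypothetical headless driver (not part of this upload).
# Assumes clean.py and report.py sit alongside it and that llm_config is importable.
import sys
import pandas as pd
from clean import clean_data
from report import create_full_report, REPORT_DIR

def run(csv_path):
    df = pd.read_csv(csv_path)
    final_payload = None
    for progress_value, status in clean_data(df):
        if isinstance(status, tuple):
            final_payload = status          # the final yield carries the results tuple
        else:
            print(f"[{progress_value:.0%}] {status}")
    cleaned_df, nonconforming_before, process_times, removed_columns, removed_rows = final_payload
    create_full_report(df, cleaned_df, nonconforming_before, process_times, removed_columns, removed_rows)
    cleaned_df.to_csv("cleaned.csv", index=False)   # assumed output name
    print(f"Cleaned CSV written to cleaned.csv; report images in {REPORT_DIR}")

if __name__ == "__main__":
    run(sys.argv[1])
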
clean.py
ADDED
@@ -0,0 +1,295 @@
import pandas as pd
import numpy as np
import json
import time
from tqdm import tqdm
from llm_config import generate_llm_response
from llm_prompts import (
    CHECK_HEADERS_PROMPT,
    NORMALIZE_HEADERS_PROMPT,
    CHECK_COLUMN_CONTENT_PROMPT,
    CHECK_TYPOS_PROMPT,
    TRANSFORM_STRING_PROMPT,
    CHECK_LOW_COUNT_VALUES_PROMPT
)

BATCH_SIZE = 50
EMPTY_THRESHOLD = 0.5


def print_dataframe_info(df, step=""):
    num_columns = df.shape[1]
    num_rows = df.shape[0]
    num_cells = num_columns * num_rows
    print(f"{step}Dataframe info:")
    print(f"  Number of columns: {num_columns}")
    print(f"  Number of rows: {num_rows}")
    print(f"  Total number of cells: {num_cells}")


def check_and_normalize_column_headers(df):
    print("Checking and normalizing column headers...")

    check_prompt = CHECK_HEADERS_PROMPT.format(columns=df.columns.tolist())
    check_response = generate_llm_response(check_prompt)
    try:
        invalid_columns = json.loads(check_response)
        if invalid_columns:
            print(f"Columns with invalid names (indices): {invalid_columns}")
            for idx in invalid_columns:
                new_name = f"column_{idx}"
                print(f"Renaming column at index {idx} to '{new_name}'")
                df.rename(columns={df.columns[idx]: new_name}, inplace=True)
        else:
            print("All column headers are valid or no invalid headers detected.")
    except json.JSONDecodeError:
        print("Error parsing LLM response for column headers check.")

    normalize_prompt = NORMALIZE_HEADERS_PROMPT.format(columns=df.columns.tolist())
    normalize_response = generate_llm_response(normalize_prompt)
    try:
        normalized_names = json.loads(normalize_response)
        if normalized_names:
            df.rename(columns=normalized_names, inplace=True)
            print("Column names have been normalized.")
        else:
            print("No column names were normalized. Proceeding with current names.")
    except json.JSONDecodeError:
        print("Error parsing LLM response for column name normalization.")

    # Fallback normalization (always applied so column names stay valid even if the LLM step fails)
    df.columns = [col.lower().replace(' ', '_') for col in df.columns]
    print("Applied fallback normalization to ensure valid column names.")

    return df


def process_column_batch(column_data, column_name):
    sample = column_data.sample(n=min(BATCH_SIZE, len(column_data)), random_state=42).tolist()
    prompt = CHECK_COLUMN_CONTENT_PROMPT.format(column_name=column_name, sample_values=str(sample))
    response = generate_llm_response(prompt)
    try:
        result = json.loads(response)
        if not all(key in result for key in ['data_type', 'empty_indices', 'invalid_indices']):
            raise ValueError("Missing required keys in LLM response")
        return result
    except (json.JSONDecodeError, ValueError) as e:
        print(f"Error parsing LLM response for column {column_name}: {str(e)}")
        print(f"LLM Response: {response}")
        return {'data_type': 'string', 'empty_indices': [], 'invalid_indices': []}


def check_typos(column_data, column_name):
    sample = column_data.sample(n=min(BATCH_SIZE, len(column_data)), random_state=42).tolist()
    prompt = CHECK_TYPOS_PROMPT.format(column_name=column_name, sample_values=str(sample))
    response = generate_llm_response(prompt)
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        print(f"Error parsing LLM response for typo check in column {column_name}")
        return {"typos": {}}


def transform_string_column(column_data, column_name):
    unique_values = column_data.unique().tolist()
    prompt = TRANSFORM_STRING_PROMPT.format(column_name=column_name, unique_values=unique_values)
    response = generate_llm_response(prompt)
    try:
        result = json.loads(response)
        return result
    except json.JSONDecodeError:
        print(f"Error parsing LLM response for string transformation in column {column_name}")
        return {}


def check_low_count_values(column_data, column_name):
    value_counts = column_data.value_counts().to_dict()
    prompt = CHECK_LOW_COUNT_VALUES_PROMPT.format(column_name=column_name, value_counts=value_counts)
    response = generate_llm_response(prompt)
    try:
        result = json.loads(response)
        return result
    except json.JSONDecodeError:
        print(f"Error parsing LLM response for low count values in column {column_name}")
        return []


def remove_empty_columns(df, threshold=EMPTY_THRESHOLD):
    print(f"Removing columns with less than {threshold * 100}% valid data...")
    valid_threshold = int(df.shape[0] * threshold)
    df = df.dropna(axis=1, thresh=valid_threshold)
    return df


def remove_empty_rows(df, threshold=EMPTY_THRESHOLD):
    print(f"Removing rows with less than {threshold * 100}% valid data...")
    valid_threshold = int(df.shape[1] * threshold)
    df = df.dropna(axis=0, thresh=valid_threshold)
    return df


def remove_low_count_categories(df):
    print("Removing strings with count below 2...")
    for col in df.select_dtypes(include=['object']).columns:
        value_counts = df[col].value_counts()
        to_remove = value_counts[value_counts < 2].index
        df[col] = df[col].replace(to_remove, np.nan)
    return df


def clean_column(df, column_name):
    print(f"Cleaning column: {column_name}")
    column_data = df[column_name]
    total_rows = len(column_data)
    empty_indices = []
    invalid_indices = []
    data_type = "string"
    nonconforming_cells = 0

    for i in range(0, total_rows, BATCH_SIZE):
        batch = column_data.iloc[i:i + BATCH_SIZE]
        result = process_column_batch(batch, column_name)

        valid_empty_indices = [idx for idx in result["empty_indices"] if idx + i < total_rows]
        valid_invalid_indices = [idx for idx in result["invalid_indices"] if idx + i < total_rows]

        empty_indices.extend([idx + i for idx in valid_empty_indices])
        invalid_indices.extend([idx + i for idx in valid_invalid_indices])

        if i == 0:  # Use the data type from the first batch
            data_type = result["data_type"]

    print(f"  Data type determined: {data_type}")
    print(f"  Empty cells: {len(empty_indices)}")
    print(f"  Invalid cells: {len(invalid_indices)}")

    # Convert column to determined data type
    if data_type == "float":
        df.loc[:, column_name] = pd.to_numeric(df[column_name], errors='coerce')
    elif data_type == "integer":
        df.loc[:, column_name] = pd.to_numeric(df[column_name], errors='coerce').astype('Int64')
    elif data_type == "date":
        df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
    elif data_type == "string" or data_type == "object":
        # Transform string values
        transform_result = transform_string_column(column_data, column_name)
        df[column_name] = df[column_name].map(transform_result).fillna(df[column_name])

        # Handle "nan" strings
        df[column_name] = df[column_name].replace({"nan": np.nan, "NaN": np.nan, "NAN": np.nan})

        # Check for low count values
        low_count_values = check_low_count_values(df[column_name], column_name)
        df.loc[df[column_name].isin(low_count_values), column_name] = np.nan

        # Check for typos
        typo_result = check_typos(df[column_name], column_name)
        if typo_result["typos"]:
            print(f"  Potential typos found: {typo_result['typos']}")

    # Set empty and invalid cells to NaN. The collected indices are positions
    # within the column, so translate them to index labels before using .loc
    # (earlier steps may have dropped rows, leaving a non-contiguous index).
    flagged_positions = empty_indices + invalid_indices
    if flagged_positions:
        df.loc[column_data.index[flagged_positions], column_name] = np.nan
    nonconforming_cells = len(flagged_positions)

    return df, nonconforming_cells


def remove_outliers(df):
    print("Removing rows with outliers from numeric/integer/float columns...")
    rows_to_remove = set()
    for column in df.select_dtypes(include=[np.number]).columns:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outlier_rows = df[(df[column] < lower_bound) | (df[column] > upper_bound)].index
        rows_to_remove.update(outlier_rows)

    initial_rows = len(df)
    df = df.drop(index=list(rows_to_remove))
    removed_rows = initial_rows - len(df)
    print(f"Removed {removed_rows} rows containing outliers.")
    return df, removed_rows


def calculate_nonconforming_cells(df):
    nonconforming_cells = {}
    for column in df.columns:
        # Count NaN values
        nan_count = df[column].isna().sum()

        # For numeric columns, count infinite values
        if np.issubdtype(df[column].dtype, np.number):
            inf_count = np.isinf(df[column]).sum()
        else:
            inf_count = 0

        # For object columns, count empty strings
        if df[column].dtype == 'object':
            empty_string_count = (df[column] == '').sum()
        else:
            empty_string_count = 0

        nonconforming_cells[column] = nan_count + inf_count + empty_string_count

    return nonconforming_cells


def clean_data(df):
    start_time = time.time()
    process_times = {}
    removed_rows = 0
    removed_columns = 0

    print("Starting data validation and cleaning...")
    print_dataframe_info(df, "Initial - ")

    # Calculate nonconforming cells before cleaning
    nonconforming_cells_before = calculate_nonconforming_cells(df)

    steps = ['Normalize headers', 'Remove empty columns', 'Remove empty rows', 'Remove low count strings', 'Clean columns', 'Remove outliers']
    total_steps = len(steps) + len(df.columns)  # Add column count for individual column cleaning

    # Step 1: Normalize column headers
    step_start_time = time.time()
    df = check_and_normalize_column_headers(df)
    process_times['Normalize headers'] = time.time() - step_start_time
    yield 1 / total_steps, "Normalized headers"

    # Step 2: Remove mostly empty columns (less than EMPTY_THRESHOLD, i.e. 50%, valid data)
    step_start_time = time.time()
    initial_columns = df.shape[1]
    df = remove_empty_columns(df)
    removed_columns = initial_columns - df.shape[1]  # track dropped columns for the report
    process_times['Remove empty columns'] = time.time() - step_start_time
    yield 2 / total_steps, "Removed empty columns"

    # Step 3: Remove mostly empty rows (less than EMPTY_THRESHOLD, i.e. 50%, valid data)
    step_start_time = time.time()
    df = remove_empty_rows(df)
    process_times['Remove empty rows'] = time.time() - step_start_time
    yield 3 / total_steps, "Removed empty rows"

    # Step 4: Remove low count categories
    step_start_time = time.time()
    df = remove_low_count_categories(df)
    process_times['Remove low count strings'] = time.time() - step_start_time
    yield 4 / total_steps, "Removed low count strings"

    # Step 5: Clean columns (in batches)
    column_cleaning_times = {}
    for i, column in enumerate(df.columns):
        column_start_time = time.time()
        df, nonconforming = clean_column(df, column)
        column_cleaning_times[f"Clean column: {column}"] = time.time() - column_start_time
        yield (5 + i) / total_steps, f"Cleaning column: {column}"
    process_times.update(column_cleaning_times)

    # Step 6: Remove outliers from numeric columns
    step_start_time = time.time()
    df, outlier_rows_removed = remove_outliers(df)
    removed_rows += outlier_rows_removed
    process_times['Remove outliers'] = time.time() - step_start_time
    yield 1.0, (df, nonconforming_cells_before, process_times, removed_columns, removed_rows)

    print("Cleaning process completed.")
    print_dataframe_info(df, "Final - ")

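clean.py and manage_schema.py import generate_llm_response from an llm_config module that is not among the five uploaded files. A minimal sketch of what such a helper could look like follows, assuming an OpenAI-compatible client; the client setup, model name, and temperature are assumptions and would need to match the Space's actual configuration.

# llm_config.py -- hypothetical stand-in for the module missing from this upload.
# Assumes the openai>=1.0 client and an OPENAI_API_KEY environment variable.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def generate_llm_response(prompt):
    # The cleaning prompts all ask for JSON-only answers, so return the raw
    # completion text from a deterministic, low-temperature call.
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # assumed model name
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    return response.choices[0].message.content
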
llm_prompts.py
ADDED
@@ -0,0 +1,123 @@
CHECK_HEADERS_PROMPT = """
Analyze the following DataFrame columns and identify any columns without names or with invalid names.
Return only a JSON list of column indices (0-based) that need attention, without any explanation.
Columns: {columns}
"""

NORMALIZE_HEADERS_PROMPT = """
Analyze the following DataFrame column names and normalize them according to these rules:
1. Convert to lowercase
2. Replace empty strings or spaces with underscores
3. Remove any invalid characters (keep only letters, numbers, and underscores)

Return only a JSON object where keys are the original column names and values are the normalized names, without any explanation.
Column names: {columns}
"""

CHECK_COLUMN_CONTENT_PROMPT = """
Analyze the following sample of values from the column '{column_name}' and determine:
1. The most appropriate data type (float, integer, string, or date)
2. Indices of empty or blank values
3. Indices of values that don't conform to the determined data type

Sample values:
{sample_values}

Return only a JSON object with the following structure, without any explanation:
{{
    "data_type": "detected_type",
    "empty_indices": [list of indices of empty or blank values],
    "invalid_indices": [list of indices of values that don't conform to the detected type]
}}
"""

CHECK_TYPOS_PROMPT = """
Analyze the following sample of values from the column '{column_name}' and identify any potential typos or misspellings.
For each identified typo, suggest a correction.

Sample values:
{sample_values}

Return only a JSON object with the following structure, without any explanation:
{{
    "typos": {{
        "original_value1": "corrected_value1",
        "original_value2": "corrected_value2",
        ...
    }}
}}

If no typos are found, return an empty object for "typos".
"""

ENCODE_STRING_PROMPT = """
Analyze the following unique values from the column '{column_name}' and create an encoding scheme.
Assign a unique integer to each unique string value, starting from 0.

Unique values:
{unique_values}

Return only a JSON object with the following structure, without any explanation:
{{
    "string_value1": 0,
    "string_value2": 1,
    "string_value3": 2,
    ...
}}

Ensure that each unique string value is assigned a unique integer.
"""

DETERMINE_DTYPE_PROMPT = """
Analyze the following sample values from a column and determine the most appropriate data type.
Possible types are: float, integer, string, or date.
If more than 80% of the values conform to a specific type, choose that type.
Otherwise, default to string.

Sample values:
{sample_values}

Return only a JSON object with the following structure, without any explanation:
{{
    "column_type": "detected_type",
    "invalid_indices": [list of indices that do not conform to the detected type]
}}
"""

TRANSFORM_STRING_PROMPT = """
Transform the following unique string values from the column '{column_name}' to lowercase.
If a value is a variation of "nan" (case-insensitive), map it to "nan".

Unique values:
{unique_values}

Return only a JSON object with the following structure, without any explanation:
{{
    "original_value1": "transformed_value1",
    "original_value2": "transformed_value2",
    ...
}}
"""

CHECK_LOW_COUNT_VALUES_PROMPT = """
Analyze the following value counts from the column '{column_name}' and identify values with a count lower than 2.

Value counts:
{value_counts}

Return only a JSON list of values that have a count lower than 2, without any explanation.
"""


CHECK_SCHEMA_CONFORMITY_PROMPT = """
Analyze the following sample of values from the column '{column_name}' and check if they conform to the determined data type '{data_type}'.

Sample values:
{sample_values}

Return only a JSON object with the following structure, without any explanation:
{{
    "conforming_indices": [list of indices of values that conform to the data type],
    "nonconforming_indices": [list of indices of values that do not conform to the data type]
}}
"""

manage_schema.py
ADDED
@@ -0,0 +1,51 @@
import pandas as pd
import numpy as np
import json
from llm_config import generate_llm_response
from llm_prompts import DETERMINE_DTYPE_PROMPT

SAMPLE_SIZE = 200


def determine_column_type(df, column):
    sample = df[column].sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
    prompt = DETERMINE_DTYPE_PROMPT.format(sample_values=str(sample.tolist()))
    response = generate_llm_response(prompt)

    try:
        result = json.loads(response)
        # The LLM returns positions within the sample; translate them to index
        # labels so they can be used safely with df.loc later.
        invalid_labels = [sample.index[i] for i in result['invalid_indices'] if i < len(sample)]
        return result['column_type'], invalid_labels
    except (json.JSONDecodeError, KeyError):
        print(f"Error parsing LLM response for column {column}")
        return 'string', []


def enforce_column_type(df, column, column_type, invalid_indices):
    if column_type == 'float':
        df[column] = pd.to_numeric(df[column], errors='coerce')
    elif column_type == 'integer':
        df[column] = pd.to_numeric(df[column], errors='coerce').astype('Int64')
    elif column_type == 'date':
        df[column] = pd.to_datetime(df[column], errors='coerce')

    # Set invalid values to NaN (invalid_indices are index labels)
    df.loc[invalid_indices, column] = np.nan

    return df


def process_dataframe(df):
    print("Determining and enforcing column data types...")

    for column in df.columns:
        print(f"\nProcessing column: {column}")
        column_type, invalid_indices = determine_column_type(df, column)
        print(f"  Detected type: {column_type}")
        print(f"  Number of invalid values: {len(invalid_indices)}")

        df = enforce_column_type(df, column, column_type, invalid_indices)

        valid_percentage = (df[column].count() / len(df)) * 100
        print(f"  Percentage of valid values after type enforcement: {valid_percentage:.2f}%")

    return df

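manage_schema.py is not imported by app.py or clean.py, so it appears to be a standalone utility. A possible standalone usage, assuming llm_config is available; "data.csv" and "data_typed.csv" are placeholder paths, not files from this upload.

# Hypothetical standalone use of manage_schema.py.
import pandas as pd
from manage_schema import process_dataframe

df = pd.read_csv("data.csv")                      # placeholder input path
typed_df = process_dataframe(df)                  # LLM-assisted dtype detection and enforcement
typed_df.to_csv("data_typed.csv", index=False)    # placeholder output path
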
report.py
ADDED
@@ -0,0 +1,208 @@
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

REPORT_DIR = f"cleaning_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
os.makedirs(REPORT_DIR, exist_ok=True)

def save_plot(fig, filename):
    fig.savefig(os.path.join(REPORT_DIR, filename), dpi=400, bbox_inches='tight')
    plt.close(fig)

def plot_heatmap(df, title):
    plt.figure(figsize=(12, 8))
    sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
    plt.title(title)
    plt.tight_layout()
    save_plot(plt.gcf(), f'{title.lower().replace(" ", "_")}.png')


def plot_valid_data_percentage(original_df, cleaned_df):
    original_valid = (original_df.notna().sum() / len(original_df)) * 100
    cleaned_valid = (cleaned_df.notna().sum() / len(cleaned_df)) * 100

    # Create a DataFrame with both original and cleaned percentages
    combined_df = pd.DataFrame({
        'Original': original_valid,
        'Cleaned': cleaned_valid
    }).fillna(0)  # Fill NaN with 0 for columns that were removed

    plt.figure(figsize=(15, 8))
    combined_df.plot(kind='bar', width=0.8, alpha=0.8)

    plt.xlabel('Columns')
    plt.ylabel('Percentage of Valid Data')
    plt.title('Percentage of Valid Data Before and After Cleaning')
    plt.xticks(rotation=90)
    plt.legend(['Before Cleaning', 'After Cleaning'])

    # Add percentage labels on the bars
    for i, (index, row) in enumerate(combined_df.iterrows()):
        plt.text(i, row['Original'], f'{row["Original"]:.1f}%', ha='center', va='bottom')
        if row['Cleaned'] > 0:  # Only add label if column exists in cleaned data
            plt.text(i, row['Cleaned'], f'{row["Cleaned"]:.1f}%', ha='center', va='bottom')

    plt.tight_layout()
    plt.savefig(os.path.join(REPORT_DIR, 'valid_data_percentage.png'))
    plt.close()

def plot_column_schemas(df):
    schemas = df.dtypes.astype(str).value_counts()
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=schemas.index, y=schemas.values, ax=ax)
    ax.set_title('Column Data Types')
    ax.set_xlabel('Data Type')
    ax.set_ylabel('Count')
    save_plot(fig, 'column_schemas.png')

def plot_nonconforming_cells(nonconforming_cells):
    # Ensure that nonconforming_cells is a dictionary
    if isinstance(nonconforming_cells, dict):
        # Proceed with plotting if it's a dictionary
        fig, ax = plt.subplots(figsize=(12, 6))
        sns.barplot(x=list(nonconforming_cells.keys()), y=list(nonconforming_cells.values()), ax=ax)
        ax.set_title('Nonconforming Cells by Column')
        ax.set_xlabel('Columns')
        ax.set_ylabel('Number of Nonconforming Cells')
        plt.xticks(rotation=90)
        save_plot(fig, 'nonconforming_cells.png')
    else:
        print(f"Expected nonconforming_cells to be a dictionary, but got {type(nonconforming_cells)}.")


def plot_column_distributions(original_df, cleaned_df):
    numeric_columns = original_df.select_dtypes(include=[np.number]).columns
    num_columns = len(numeric_columns)

    if num_columns == 0:
        print("No numeric columns found for distribution plots.")
        return

    # Create subplots for distributions. With ncols=3, plt.subplots always
    # returns an array of Axes, so flatten unconditionally.
    fig, axes = plt.subplots(nrows=(num_columns + 2) // 3, ncols=3, figsize=(18, 5 * ((num_columns + 2) // 3)))
    axes = axes.flatten()

    for i, column in enumerate(numeric_columns):
        if column in cleaned_df.columns:
            sns.histplot(original_df[column].dropna(), ax=axes[i], kde=True, color='blue', label='Before Cleaning', alpha=0.5)
            sns.histplot(cleaned_df[column].dropna(), ax=axes[i], kde=True, color='orange', label='After Cleaning', alpha=0.5)
            axes[i].set_title(f'{column} - Distribution Before & After Cleaning')
            axes[i].legend()

    # Remove any unused subplots
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    save_plot(fig, 'distributions_before_after_cleaning.png')


def plot_boxplot_with_outliers(df):
    print("Plotting boxplots with outliers...")
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    num_columns = len(numeric_columns)

    if num_columns == 0:
        print("No numeric columns found for boxplot.")
        return

    # Create subplots based on the number of numeric columns (always an array with ncols=3)
    fig, axes = plt.subplots(nrows=(num_columns + 2) // 3, ncols=3, figsize=(15, 5 * ((num_columns + 2) // 3)))
    axes = axes.flatten()

    for i, column in enumerate(numeric_columns):
        sns.boxplot(x=df[column], ax=axes[i])
        axes[i].set_title(f'Boxplot of {column} with Outliers')

    # Remove any unused subplots
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    save_plot(fig, 'boxplots_with_outliers.png')


def plot_correlation_heatmap(df):
    # Select only numeric, float, and integer columns
    numeric_df = df.select_dtypes(include=[np.number])

    # Compute the correlation matrix
    correlation_matrix = numeric_df.corr()

    # Plot the heatmap
    fig, ax = plt.subplots(figsize=(15, 10))
    sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax, cbar_kws={'label': 'Correlation'})
    ax.set_title('Correlation Heatmap')
    save_plot(fig, 'correlation_heatmap.png')


def plot_process_times(process_times):
    # Convert seconds to minutes
    process_times_minutes = {k: v / 60 for k, v in process_times.items()}

    # Separate main processes and column cleaning processes
    main_processes = {k: v for k, v in process_times_minutes.items() if not k.startswith("Clean column:")}
    column_processes = {k: v for k, v in process_times_minutes.items() if k.startswith("Clean column:")}

    # Create the plot
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

    # Plot main processes
    bars1 = ax1.bar(main_processes.keys(), main_processes.values())
    ax1.set_title('Main Process Times')
    ax1.set_ylabel('Time (minutes)')
    ax1.tick_params(axis='x', rotation=45)

    # Plot column cleaning processes
    bars2 = ax2.bar(column_processes.keys(), column_processes.values())
    ax2.set_title('Column Cleaning Times')
    ax2.set_ylabel('Time (minutes)')
    ax2.tick_params(axis='x', rotation=90)

    # Add value labels on top of each bar
    for ax, bars in zip([ax1, ax2], [bars1, bars2]):
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2., height,
                    f'{height:.2f}', ha='center', va='bottom')

    # Add total time to the plot
    total_time = sum(process_times_minutes.values())
    fig.suptitle(f'Process Times (Total: {total_time:.2f} minutes)', fontsize=16)

    plt.tight_layout()
    save_plot(fig, 'process_times.png')


def create_full_report(original_df, cleaned_df, nonconforming_cells_before, process_times, removed_columns, removed_rows):
    # removed_columns and removed_rows are accepted for symmetry with clean_data's
    # final yield but are not visualized yet.
    os.makedirs(REPORT_DIR, exist_ok=True)

    sns.set_style("whitegrid")
    plt.rcParams['figure.dpi'] = 400

    print("Plotting valid data percentages...")
    plot_valid_data_percentage(original_df, cleaned_df)

    print("Plotting column schemas...")
    plot_column_schemas(cleaned_df)

    print("Plotting nonconforming cells before cleaning...")
    plot_nonconforming_cells(nonconforming_cells_before)

    print("Plotting column distributions...")
    plot_column_distributions(original_df, cleaned_df)

    print("Plotting process times...")
    plot_process_times(process_times)

    print("Plotting heatmaps...")
    plot_heatmap(original_df, "Missing Values Before Cleaning")

    print("Plotting correlation heatmap...")
    plot_correlation_heatmap(cleaned_df)

    print(f"All visualization reports saved in directory: {REPORT_DIR}")