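"""Streamlit app for comparing automated test results across multiple environments.

Users upload one or more CSV files per environment; the app summarises PASSED/FAILED
counts per environment and highlights scenarios that are present in some environments
but missing in others.
"""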
import pandas as pd
import streamlit as st
import numpy as np
from pre import preprocess_uploaded_file
from difflib import SequenceMatcher
import time
def similar(a, b, threshold=0.9):
    """Return True when two strings are at least `threshold` similar (SequenceMatcher ratio)."""
    return SequenceMatcher(None, a, b).ratio() > threshold
def find_different_scenarios(grouped_data, area):
    """Return (scenario, env, 'Present', other_env, 'Missing') tuples for one functional area."""
    # Filter data for the specific functional area
    area_data = grouped_data[grouped_data['Functional area'] == area]
    # Get scenarios for each environment
    scenarios_by_env = {env: set(area_data[area_data['Environment'] == env]['Scenario name'])
                        for env in area_data['Environment'].unique()}
    # Find scenarios that are in one environment but not the other
    diff_scenarios = []
    envs = list(scenarios_by_env.keys())
    for i in range(len(envs)):
        for j in range(i + 1, len(envs)):
            env1, env2 = envs[i], envs[j]
            diff = scenarios_by_env[env1] ^ scenarios_by_env[env2]  # symmetric difference
            for scenario in diff:
                if scenario in scenarios_by_env[env1]:
                    diff_scenarios.append((scenario, env1, 'Present', env2, 'Missing'))
                else:
                    diff_scenarios.append((scenario, env2, 'Present', env1, 'Missing'))
    return diff_scenarios
def perform_multi_env_analysis(uploaded_dataframes):
    """Compare scenario results across the selected environments and render the analysis in Streamlit."""
    # Concatenate all dataframes into a single dataframe
    combined_data = pd.concat(uploaded_dataframes, ignore_index=True)

    # Get unique environments and functional areas
    unique_environments = combined_data['Environment'].unique()
    unique_areas = np.append(combined_data['Functional area'].unique(), "All")

    # Select environments to display
    selected_environments = st.multiselect("Select environments to display", unique_environments, default=unique_environments)

    # Initialize session state for selected functional areas if it doesn't exist
    if 'selected_functional_areas' not in st.session_state:
        st.session_state.selected_functional_areas = ["All"]

    # Select functional areas to display, using session state
    selected_functional_areas = st.multiselect(
        "Select functional areas",
        unique_areas,
        default=st.session_state.selected_functional_areas,
        key="functional_areas_multiselect"
    )

    # Add a button to confirm the selection
    if st.button("Confirm Functional Area Selection"):
        # Update session state with the new selection
        st.session_state.selected_functional_areas = selected_functional_areas
        st.success("Functional area selection updated!")
        time.sleep(0.5)  # Small delay so the success message stays visible
        st.rerun()  # Rerun the app to reflect the changes

    if "All" in selected_functional_areas:
        selected_functional_areas = combined_data['Functional area'].unique()

    # Filter data based on selected environments and functional areas
    filtered_data = combined_data[
        (combined_data['Environment'].isin(selected_environments)) &
        (combined_data['Functional area'].isin(selected_functional_areas))
    ]

    # Group data by Environment, Functional area, Scenario name, and Status
    grouped_data = filtered_data.groupby(['Environment', 'Functional area', 'Scenario name', 'Status']).size().unstack(fill_value=0)

    # Ensure 'PASSED' and 'FAILED' columns exist
    if 'PASSED' not in grouped_data.columns:
        grouped_data['PASSED'] = 0
    if 'FAILED' not in grouped_data.columns:
        grouped_data['FAILED'] = 0

    # Calculate total scenarios
    grouped_data['Total'] = grouped_data['PASSED'] + grouped_data['FAILED']

    # Reset index to make Environment, Functional area, and Scenario name regular columns
    grouped_data = grouped_data.reset_index()

    # Reorder columns
    grouped_data = grouped_data[['Environment', 'Functional area', 'Scenario name', 'Total', 'PASSED', 'FAILED']]

    # Display summary statistics
    st.write("### Summary Statistics")
    summary = grouped_data.groupby('Environment').agg({
        'Total': 'sum',
        'PASSED': 'sum',
        'FAILED': 'sum'
    }).reset_index()
    # Add column names as the first row
    summary_with_headers = pd.concat([pd.DataFrame([summary.columns], columns=summary.columns), summary], ignore_index=True)
    # Display the DataFrame
    st.dataframe(summary_with_headers)

    # Scenario sets per environment (recomputed per functional area in the detailed analysis below)
    scenarios_by_env = {env: set(grouped_data[grouped_data['Environment'] == env]['Scenario name']) for env in selected_environments}
    missing_scenarios = []
    mismatched_scenarios = []

    st.write("### Inconsistent Scenario Count Analysis by Functional Area")
    if len(selected_environments) > 1:
        # Group data by Environment and Functional area, count scenarios
        scenario_counts = filtered_data.groupby(['Environment', 'Functional area'])['Scenario name'].nunique().unstack(fill_value=0)

        # Calculate the difference between max and min counts for each functional area
        count_diff = scenario_counts.max() - scenario_counts.min()

        # Sort functional areas by count difference, descending
        inconsistent_areas = count_diff.sort_values(ascending=False)

        st.write("Functional areas with inconsistent scenario counts across environments:")
        for area, diff in inconsistent_areas.items():
            if diff > 0:
                st.write(f"- {area}: Difference of {diff} scenarios")
                st.write(scenario_counts[area])
                st.write("\n")

        # Option to show detailed breakdown with a unique key
        if st.checkbox("Show detailed scenario count breakdown", key="show_detailed_breakdown"):
            st.write(scenario_counts)

        # Add a selectbox for choosing the functional area to analyze
        selected_area = st.selectbox("Select a functional area to analyze:",
                                     options=[area for area, diff in inconsistent_areas.items() if diff > 0])

        if selected_area:
            st.write(f"### Detailed Analysis of Different Scenarios for '{selected_area}'")
            # Get scenarios for each environment
            scenarios_by_env = {env: set(filtered_data[(filtered_data['Environment'] == env) &
                                                       (filtered_data['Functional area'] == selected_area)]['Scenario name'])
                                for env in selected_environments}

            # Find scenarios that are different between environments
            all_scenarios = set.union(*scenarios_by_env.values())
            diff_scenarios = [scenario for scenario in all_scenarios
                              if any(scenario not in env_scenarios for env_scenarios in scenarios_by_env.values())]

            # Create a DataFrame to show presence/absence of scenarios
            diff_df = pd.DataFrame(index=diff_scenarios, columns=selected_environments)
            for scenario in diff_scenarios:
                for env in selected_environments:
                    diff_df.at[scenario, env] = 'Present' if scenario in scenarios_by_env[env] else 'Missing'

            diff_df.reset_index(inplace=True)
            diff_df.rename(columns={'index': 'Scenario'}, inplace=True)

            # Sort the DataFrame to show scenarios with differences first
            diff_df['has_diff'] = diff_df.apply(lambda row: len(set(row.iloc[1:])) > 1, axis=1)
            diff_df = diff_df.sort_values('has_diff', ascending=False).drop('has_diff', axis=1)

            st.write(f"Number of scenarios that differ between environments: {len(diff_scenarios)}")
            # Display the DataFrame
            st.dataframe(diff_df)

            # Provide a download button for the DataFrame
            csv = diff_df.to_csv(index=False)
            st.download_button(
                label="Download CSV",
                data=csv,
                file_name=f"{selected_area}_scenario_comparison.csv",
                mime="text/csv",
            )
    else:
        st.write("Please select at least two environments for comparison.")
def multi_env_compare_main():
    """Entry point: collect CSV uploads per environment and run the comparison."""
    st.title("Multi-Environment Comparison")

    # Get the number of environments from the user
    num_environments = st.number_input("Enter the number of environments", min_value=1, value=1, step=1)

    # Initialize list to store uploaded dataframes
    uploaded_dataframes = []

    # Loop through the number of environments and create file uploaders
    for i in range(num_environments):
        uploaded_files = st.file_uploader(f"Upload CSV files for Environment {i + 1}", type="csv", accept_multiple_files=True)
        for uploaded_file in uploaded_files:
            # Preprocess the uploaded CSV file
            data = preprocess_uploaded_file(uploaded_file)
            # Append the dataframe to the list
            uploaded_dataframes.append(data)

    # Check if any files were uploaded
    if uploaded_dataframes:
        # Perform analysis for uploaded data
        perform_multi_env_analysis(uploaded_dataframes)
    else:
        st.write("Please upload at least one CSV file.")


if __name__ == "__main__":
    multi_env_compare_main()