import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score
import joblib
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor


# Compute the variance inflation factor (VIF) for every column and
# return only the features whose VIF is below 10 (low multicollinearity).
def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data['feature'] = df.columns
    vif_data['VIF'] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data[vif_data['VIF'] < 10]['feature'].tolist()


# Load the uploaded CSV, impute missing values, filter features by VIF,
# scale with the chosen scaler, and project to 2 components with PCA.
def process_data(file, scaler_option):
    df = pd.read_csv(file)
    features = ['RI4', 'RI5', 'RI7', 'RI9']
    # Impute missing values with the column means of the selected features
    df_selected = df[features].fillna(df[features].mean())

    # Calculate VIF and keep only features with VIF < 10
    selected_features = calculate_vif(df_selected)
    if not selected_features:
        st.error("No features with VIF < 10 found. Please review the data.")
        return None, None
    df_filtered = df_selected[selected_features]

    # Apply the chosen scaler
    if scaler_option == 'StandardScaler':
        scaler = StandardScaler()
    else:  # MinMaxScaler
        scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(df_filtered)

    # PCA transformation (2 components for visualization)
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(scaled_data)
    return pca_data, selected_features


# Set up the Streamlit page
st.title("Clustering Analysis with K-means, Hierarchical, and DBSCAN Models")

# Upload the detectors report CSV file
data_file = st.file_uploader("Upload the detectors report file (.csv)", type="csv")

# Upload the serialized models
kmeans_model = st.file_uploader("Upload the K-means model (.sav)", type="sav")
hierarchical_model = st.file_uploader("Upload the Hierarchical Clustering model (.sav)", type="sav")
dbscan_model = st.file_uploader("Upload the DBSCAN model (.sav)", type="sav")

# Parameter selection for K-means, Hierarchical Clustering, and DBSCAN
if data_file is not None:
    st.sidebar.header("Adjust Clustering Parameters")

    # Scaler selection
    scaler_option = st.sidebar.selectbox("Choose Scaler", ("StandardScaler", "MinMaxScaler"))

    # K-means parameters
    kmeans_clusters = st.sidebar.slider("K-means: Number of Clusters", min_value=2, max_value=10, value=3)

    # Hierarchical Clustering parameters
    hierarchical_clusters = st.sidebar.slider("Hierarchical: Number of Clusters", min_value=2, max_value=10, value=3)
    linkage = st.sidebar.selectbox("Hierarchical: Linkage Method", ["ward", "complete", "average", "single"])

    # DBSCAN parameters
    dbscan_eps = st.sidebar.number_input("DBSCAN: Epsilon", min_value=0.1, max_value=10.0, value=0.5, step=0.1)
    dbscan_min_samples = st.sidebar.slider("DBSCAN: Minimum Samples", min_value=1, max_value=20, value=5)

    # Load and process the data
    pca_data, selected_features = process_data(data_file, scaler_option)
    if pca_data is not None:
        st.write(f"Selected features after VIF filtering: {selected_features}")

        # Prepare one subplot per model
        fig, ax = plt.subplots(1, 3, figsize=(15, 5))
        ax = ax.flatten()

        # K-means Clustering
        if kmeans_model is not None:
            kmeans = joblib.load(kmeans_model)
            kmeans.set_params(n_clusters=kmeans_clusters)
            kmeans_labels = kmeans.fit_predict(pca_data)
            ax[0].scatter(pca_data[:, 0], pca_data[:, 1], c=kmeans_labels, cmap='viridis')
            ax[0].set_title(f"K-means Clustering (n_clusters={kmeans_clusters})")
        else:
            ax[0].set_title("K-means Model Missing")

        # Hierarchical Clustering
        if hierarchical_model is not None:
            hierarchical = joblib.load(hierarchical_model)
            hierarchical.set_params(n_clusters=hierarchical_clusters, linkage=linkage)
            hierarchical_labels = hierarchical.fit_predict(pca_data)
            ax[1].scatter(pca_data[:, 0], pca_data[:, 1], c=hierarchical_labels, cmap='viridis')
            ax[1].set_title(f"Hierarchical Clustering (n_clusters={hierarchical_clusters}, linkage={linkage})")
        else:
            ax[1].set_title("Hierarchical Model Missing")

        # DBSCAN Clustering
        if dbscan_model is not None:
            dbscan = joblib.load(dbscan_model)
            dbscan.set_params(eps=dbscan_eps, min_samples=dbscan_min_samples)
            dbscan_labels = dbscan.fit_predict(pca_data)
            ax[2].scatter(pca_data[:, 0], pca_data[:, 1], c=dbscan_labels, cmap='viridis')
            ax[2].set_title(f"DBSCAN Clustering (eps={dbscan_eps}, min_samples={dbscan_min_samples})")
        else:
            ax[2].set_title("DBSCAN Model Missing")

        # Display the plots
        st.pyplot(fig)
    else:
        st.warning("Data processing failed due to VIF constraints.")
else:
    st.info("Please upload the detectors report file to proceed.")
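
# Optional helper (never called by the app): a minimal sketch of how the .sav files
# expected by the uploaders above could be produced. The filenames and initial
# parameters are illustrative assumptions; the app overrides them with set_params
# and refits on the PCA-transformed data, so unfitted estimators are sufficient.
def save_example_models(prefix="."):
    """Serialize unfitted K-means, Agglomerative, and DBSCAN estimators with joblib."""
    joblib.dump(KMeans(n_clusters=3, n_init=10), f"{prefix}/kmeans_model.sav")
    joblib.dump(AgglomerativeClustering(n_clusters=3, linkage="ward"), f"{prefix}/hierarchical_model.sav")
    joblib.dump(DBSCAN(eps=0.5, min_samples=5), f"{prefix}/dbscan_model.sav")
# Usage (e.g. from a Python shell): save_example_models() writes kmeans_model.sav,
# hierarchical_model.sav, and dbscan_model.sav to the current directory.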