Spencer525 committed: Update app.py
app.py
CHANGED
@@ -7,26 +7,43 @@ from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
 from sklearn.metrics import silhouette_score
 import joblib
 import matplotlib.pyplot as plt
+from statsmodels.stats.outliers_influence import variance_inflation_factor
 
-# Function to load and process data
+# Function to calculate VIF and filter features with VIF < 10
+def calculate_vif(df):
+    vif_data = pd.DataFrame()
+    vif_data['feature'] = df.columns
+    vif_data['VIF'] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
+    return vif_data[vif_data['VIF'] < 10]['feature'].tolist()
+
+# Function to load and process data (including VIF and PCA)
 def process_data(file, scaler_option):
     df = pd.read_csv(file)
     features = ['RI4', 'RI5', 'RI7', 'RI9']
     df_selected = df[features].fillna(df.mean())
+
+    # Calculate VIF and filter features with VIF < 10
+    selected_features = calculate_vif(df_selected)
 
+    if not selected_features:
+        st.error("No features with VIF < 10 found. Please review the data.")
+        return None
+
+    df_filtered = df[selected_features]
+
     # Apply chosen scaler
     if scaler_option == 'StandardScaler':
         scaler = StandardScaler()
     elif scaler_option == 'MinMaxScaler':
         scaler = MinMaxScaler()
 
-    scaled_data = scaler.fit_transform(df_selected)
+    scaled_data = scaler.fit_transform(df_filtered)
 
     # PCA Transformation (2 components for visualization)
     pca = PCA(n_components=2)
     pca_data = pca.fit_transform(scaled_data)
 
-    return pca_data
+    return pca_data, selected_features
 
 # Set up the Streamlit page
 st.title("Clustering Analysis with K-means, Hierarchical, and DBSCAN Models")
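Review note on the new process_data: the success path now returns a (pca_data, selected_features) pair, but the VIF failure path returns a bare None, so the tuple unpacking at the call site below would raise a TypeError before the "if pca_data is not None" guard ever runs. Separately, df_filtered = df[selected_features] re-selects columns from the raw dataframe, which reintroduces the NaNs that the fillna call just imputed. A minimal sketch of the tail of the function with both points addressed; this is a suggested variant, not code from the commit:

    # Suggested variant (not part of this commit): keep the failure return the
    # same arity as the success return, and filter the imputed frame, not the raw df.
    if not selected_features:
        st.error("No features with VIF < 10 found. Please review the data.")
        return None, None

    df_filtered = df_selected[selected_features]

With a two-element failure return, the existing "if pca_data is not None" guard in the Streamlit section keeps working unchanged.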
@@ -58,43 +75,48 @@ if data_file is not None:
     dbscan_min_samples = st.sidebar.slider("DBSCAN: Minimum Samples", min_value=1, max_value=20, value=5)
 
     # Load and process the data
-    pca_data = process_data(data_file, scaler_option)
-
-    # Prepare the plot
-    fig, ax = plt.subplots(1, 3, figsize=(15, 5))
-    ax = ax.flatten()
-
-    # K-means Clustering
-    if kmeans_model is not None:
-        kmeans = joblib.load(kmeans_model)
-        kmeans.set_params(n_clusters=kmeans_clusters)
-        kmeans_labels = kmeans.fit_predict(pca_data)
-        ax[0].scatter(pca_data[:, 0], pca_data[:, 1], c=kmeans_labels, cmap='viridis')
-        ax[0].set_title(f"K-means Clustering (n_clusters={kmeans_clusters})")
-    else:
-        ax[0].set_title("K-means Model Missing")
-
-    # Hierarchical Clustering
-    if hierarchical_model is not None:
-        hierarchical = joblib.load(hierarchical_model)
-        hierarchical.set_params(n_clusters=hierarchical_clusters, linkage=linkage)
-        hierarchical_labels = hierarchical.fit_predict(pca_data)
-        ax[1].scatter(pca_data[:, 0], pca_data[:, 1], c=hierarchical_labels, cmap='viridis')
-        ax[1].set_title(f"Hierarchical Clustering (n_clusters={hierarchical_clusters}, linkage={linkage})")
-    else:
-        ax[1].set_title("Hierarchical Model Missing")
-
-    # DBSCAN Clustering
-    if dbscan_model is not None:
-        dbscan = joblib.load(dbscan_model)
-        dbscan.set_params(eps=dbscan_eps, min_samples=dbscan_min_samples)
-        dbscan_labels = dbscan.fit_predict(pca_data)
-        ax[2].scatter(pca_data[:, 0], pca_data[:, 1], c=dbscan_labels, cmap='viridis')
-        ax[2].set_title(f"DBSCAN Clustering (eps={dbscan_eps}, min_samples={dbscan_min_samples})")
-    else:
-        ax[2].set_title("DBSCAN Model Missing")
+    pca_data, selected_features = process_data(data_file, scaler_option)
+
+    if pca_data is not None:
+        st.write(f"Selected features after VIF filtering: {selected_features}")
 
-
-
+        # Prepare the plot
+        fig, ax = plt.subplots(1, 3, figsize=(15, 5))
+        ax = ax.flatten()
+
+        # K-means Clustering
+        if kmeans_model is not None:
+            kmeans = joblib.load(kmeans_model)
+            kmeans.set_params(n_clusters=kmeans_clusters)
+            kmeans_labels = kmeans.fit_predict(pca_data)
+            ax[0].scatter(pca_data[:, 0], pca_data[:, 1], c=kmeans_labels, cmap='viridis')
+            ax[0].set_title(f"K-means Clustering (n_clusters={kmeans_clusters})")
+        else:
+            ax[0].set_title("K-means Model Missing")
+
+        # Hierarchical Clustering
+        if hierarchical_model is not None:
+            hierarchical = joblib.load(hierarchical_model)
+            hierarchical.set_params(n_clusters=hierarchical_clusters, linkage=linkage)
+            hierarchical_labels = hierarchical.fit_predict(pca_data)
+            ax[1].scatter(pca_data[:, 0], pca_data[:, 1], c=hierarchical_labels, cmap='viridis')
+            ax[1].set_title(f"Hierarchical Clustering (n_clusters={hierarchical_clusters}, linkage={linkage})")
+        else:
+            ax[1].set_title("Hierarchical Model Missing")
+
+        # DBSCAN Clustering
+        if dbscan_model is not None:
+            dbscan = joblib.load(dbscan_model)
+            dbscan.set_params(eps=dbscan_eps, min_samples=dbscan_min_samples)
+            dbscan_labels = dbscan.fit_predict(pca_data)
+            ax[2].scatter(pca_data[:, 0], pca_data[:, 1], c=dbscan_labels, cmap='viridis')
+            ax[2].set_title(f"DBSCAN Clustering (eps={dbscan_eps}, min_samples={dbscan_min_samples})")
+        else:
+            ax[2].set_title("DBSCAN Model Missing")
+
+        # Display the plots
+        st.pyplot(fig)
+    else:
+        st.warning("Data processing failed due to VIF constraints.")
 else:
     st.info("Please upload the detectors report file to proceed.")
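Two review notes on the clustering block. First, because fit_predict is called after joblib.load and set_params, each uploaded model is re-fit on the 2-component PCA data, so the .pkl files only contribute whichever hyperparameters the sidebar sliders do not override. Second, silhouette_score is imported at the top of app.py but never called; a small helper along these lines could report it for each labeling (the helper name and wording are illustrative, not part of the commit):

    def report_silhouette(name, data, labels):
        # Silhouette needs at least two clusters; treat DBSCAN's -1 labels as noise.
        mask = labels != -1
        if len(set(labels[mask])) >= 2:
            st.write(f"{name} silhouette score: {silhouette_score(data[mask], labels[mask]):.3f}")
        else:
            st.write(f"{name}: silhouette score is undefined (fewer than two clusters).")

For example, report_silhouette("K-means", pca_data, kmeans_labels) could follow the kmeans.fit_predict call, and likewise for the other two models.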
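A closing note on calculate_vif: it computes VIF once and keeps everything under 10 in a single pass, but VIF values change as columns are removed, so a common refinement is to drop the worst offender and recompute until every remaining feature passes. A sketch of that variant, using the same statsmodels call as the commit; the function name and threshold argument are illustrative:

    def calculate_vif_iterative(df, threshold=10.0):
        # Repeatedly drop the highest-VIF column and recompute until all remaining pass.
        cols = list(df.columns)
        while len(cols) > 1:
            vifs = [variance_inflation_factor(df[cols].values, i) for i in range(len(cols))]
            worst = max(range(len(cols)), key=lambda i: vifs[i])
            if vifs[worst] < threshold:
                break
            cols.pop(worst)
        return cols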