Spencer525 committed on
Commit
40bd51a
·
verified ·
1 Parent(s): bb598be

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -40
app.py CHANGED
@@ -7,26 +7,43 @@ from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
7
  from sklearn.metrics import silhouette_score
8
  import joblib
9
  import matplotlib.pyplot as plt
 
10
 
11
# Function to load and process data (including PCA)
def process_data(file, scaler_option):
    """Load a CSV, scale four fixed feature columns and project them to 2-D.

    Args:
        file: Path or file-like object accepted by ``pd.read_csv``.
        scaler_option: ``'StandardScaler'`` or ``'MinMaxScaler'``.

    Returns:
        The output of ``PCA(n_components=2).fit_transform`` on the scaled
        features.

    Raises:
        ValueError: If ``scaler_option`` is not one of the supported names.
        KeyError: If any of the expected feature columns is missing
            (raised by the pandas column selection).
    """
    df = pd.read_csv(file)
    features = ['RI4', 'RI5', 'RI7', 'RI9']

    # Impute with the means of the selected columns only. Averaging the whole
    # frame (df.mean()) fails on non-numeric columns in recent pandas, and only
    # these four columns are ever filled anyway.
    df_selected = df[features]
    df_selected = df_selected.fillna(df_selected.mean())

    # Apply chosen scaler; reject unknown names explicitly instead of hitting
    # an unbound ``scaler`` variable further down.
    scalers = {'StandardScaler': StandardScaler, 'MinMaxScaler': MinMaxScaler}
    if scaler_option not in scalers:
        raise ValueError(f"Unsupported scaler option: {scaler_option!r}")
    scaler = scalers[scaler_option]()

    scaled_data = scaler.fit_transform(df_selected)

    # PCA Transformation (2 components for visualization)
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(scaled_data)

    return pca_data
30
 
31
  # Set up the Streamlit page
32
  st.title("Clustering Analysis with K-means, Hierarchical, and DBSCAN Models")
@@ -58,43 +75,48 @@ if data_file is not None:
58
  dbscan_min_samples = st.sidebar.slider("DBSCAN: Minimum Samples", min_value=1, max_value=20, value=5)
59
 
60
  # Load and process the data
61
- pca_data = process_data(data_file, scaler_option)
62
-
63
- # Prepare the plot
64
- fig, ax = plt.subplots(1, 3, figsize=(15, 5))
65
- ax = ax.flatten()
66
-
67
- # K-means Clustering
68
- if kmeans_model is not None:
69
- kmeans = joblib.load(kmeans_model)
70
- kmeans.set_params(n_clusters=kmeans_clusters)
71
- kmeans_labels = kmeans.fit_predict(pca_data)
72
- ax[0].scatter(pca_data[:, 0], pca_data[:, 1], c=kmeans_labels, cmap='viridis')
73
- ax[0].set_title(f"K-means Clustering (n_clusters={kmeans_clusters})")
74
- else:
75
- ax[0].set_title("K-means Model Missing")
76
-
77
- # Hierarchical Clustering
78
- if hierarchical_model is not None:
79
- hierarchical = joblib.load(hierarchical_model)
80
- hierarchical.set_params(n_clusters=hierarchical_clusters, linkage=linkage)
81
- hierarchical_labels = hierarchical.fit_predict(pca_data)
82
- ax[1].scatter(pca_data[:, 0], pca_data[:, 1], c=hierarchical_labels, cmap='viridis')
83
- ax[1].set_title(f"Hierarchical Clustering (n_clusters={hierarchical_clusters}, linkage={linkage})")
84
- else:
85
- ax[1].set_title("Hierarchical Model Missing")
86
-
87
- # DBSCAN Clustering
88
- if dbscan_model is not None:
89
- dbscan = joblib.load(dbscan_model)
90
- dbscan.set_params(eps=dbscan_eps, min_samples=dbscan_min_samples)
91
- dbscan_labels = dbscan.fit_predict(pca_data)
92
- ax[2].scatter(pca_data[:, 0], pca_data[:, 1], c=dbscan_labels, cmap='viridis')
93
- ax[2].set_title(f"DBSCAN Clustering (eps={dbscan_eps}, min_samples={dbscan_min_samples})")
94
- else:
95
- ax[2].set_title("DBSCAN Model Missing")
96
 
97
- # Display the plots
98
- st.pyplot(fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  else:
100
  st.info("Please upload the detectors report file to proceed.")
 
7
  from sklearn.metrics import silhouette_score
8
  import joblib
9
  import matplotlib.pyplot as plt
10
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
11
 
12
+ # Function to calculate VIF and filter features with VIF < 10
13
def calculate_vif(df, threshold=10):
    """Return the names of the columns of ``df`` whose VIF is below ``threshold``.

    The variance inflation factor of every column is computed once, against
    all other columns, with ``variance_inflation_factor``; columns are not
    re-evaluated after others have been filtered out.

    Args:
        df: DataFrame with one column per feature. Its ``.values`` array is
            handed to ``variance_inflation_factor`` unchanged (no extra
            column is added here).
        threshold: Exclusive upper bound on the VIF for a feature to be kept.
            Defaults to 10, the cut-off that used to be hard-coded.

    Returns:
        List of column names with ``VIF < threshold``, in column order.
    """
    design = df.values
    vif_data = pd.DataFrame({
        'feature': df.columns,
        'VIF': [variance_inflation_factor(design, i) for i in range(df.shape[1])],
    })
    # The strict "<" comparison is False for NaN and inf, so columns with such
    # a VIF are excluded as well.
    keep = vif_data['VIF'] < threshold
    return vif_data.loc[keep, 'feature'].tolist()
18
+
19
# Function to load and process data (including VIF and PCA)
def process_data(file, scaler_option):
    """Load a CSV, keep low-VIF features, scale them and project them to 2-D.

    Args:
        file: Path or file-like object accepted by ``pd.read_csv``.
        scaler_option: ``'StandardScaler'`` or ``'MinMaxScaler'``.

    Returns:
        A ``(pca_data, selected_features)`` tuple. ``selected_features`` is
        the list returned by ``calculate_vif``. ``pca_data`` is the output of
        ``PCA(n_components=2).fit_transform`` on the scaled selected
        features, or ``None`` when fewer than two features pass the VIF
        filter (an error is shown through ``st.error`` in that case).

    Raises:
        ValueError: If ``scaler_option`` is not one of the supported names.
        KeyError: If any of the expected feature columns is missing
            (raised by the pandas column selection).
    """
    df = pd.read_csv(file)
    features = ['RI4', 'RI5', 'RI7', 'RI9']

    # Impute with the means of the selected columns only. Averaging the whole
    # frame (df.mean()) fails on non-numeric columns in recent pandas, and only
    # these four columns are ever filled anyway.
    df_selected = df[features]
    df_selected = df_selected.fillna(df_selected.mean())

    # Calculate VIF and filter features with VIF < 10
    selected_features = calculate_vif(df_selected)

    # Always return a 2-tuple: the caller unpacks two values, so a bare
    # ``None`` would fail there before its ``is not None`` check runs.
    if not selected_features:
        st.error("No features with VIF < 10 found. Please review the data.")
        return None, selected_features

    # A 2-component PCA cannot be fitted on a single column.
    if len(selected_features) < 2:
        st.error(
            "Only one feature with VIF < 10 found; at least two are needed "
            "for the 2-component PCA. Please review the data."
        )
        return None, selected_features

    # Take the columns from the imputed frame, not the raw one, so the filled
    # values are kept and no NaN reaches the scaler or PCA.
    df_filtered = df_selected[selected_features]

    # Apply chosen scaler; reject unknown names explicitly instead of hitting
    # an unbound ``scaler`` variable further down.
    scalers = {'StandardScaler': StandardScaler, 'MinMaxScaler': MinMaxScaler}
    if scaler_option not in scalers:
        raise ValueError(f"Unsupported scaler option: {scaler_option!r}")
    scaler = scalers[scaler_option]()

    scaled_data = scaler.fit_transform(df_filtered)

    # PCA Transformation (2 components for visualization)
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(scaled_data)

    return pca_data, selected_features
47
 
48
  # Set up the Streamlit page
49
  st.title("Clustering Analysis with K-means, Hierarchical, and DBSCAN Models")
 
75
  dbscan_min_samples = st.sidebar.slider("DBSCAN: Minimum Samples", min_value=1, max_value=20, value=5)
76
 
77
  # Load and process the data
78
+ pca_data, selected_features = process_data(data_file, scaler_option)
79
+
80
+ if pca_data is not None:
81
+ st.write(f"Selected features after VIF filtering: {selected_features}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
+ # Prepare the plot
84
+ fig, ax = plt.subplots(1, 3, figsize=(15, 5))
85
+ ax = ax.flatten()
86
+
87
+ # K-means Clustering
88
+ if kmeans_model is not None:
89
+ kmeans = joblib.load(kmeans_model)
90
+ kmeans.set_params(n_clusters=kmeans_clusters)
91
+ kmeans_labels = kmeans.fit_predict(pca_data)
92
+ ax[0].scatter(pca_data[:, 0], pca_data[:, 1], c=kmeans_labels, cmap='viridis')
93
+ ax[0].set_title(f"K-means Clustering (n_clusters={kmeans_clusters})")
94
+ else:
95
+ ax[0].set_title("K-means Model Missing")
96
+
97
+ # Hierarchical Clustering
98
+ if hierarchical_model is not None:
99
+ hierarchical = joblib.load(hierarchical_model)
100
+ hierarchical.set_params(n_clusters=hierarchical_clusters, linkage=linkage)
101
+ hierarchical_labels = hierarchical.fit_predict(pca_data)
102
+ ax[1].scatter(pca_data[:, 0], pca_data[:, 1], c=hierarchical_labels, cmap='viridis')
103
+ ax[1].set_title(f"Hierarchical Clustering (n_clusters={hierarchical_clusters}, linkage={linkage})")
104
+ else:
105
+ ax[1].set_title("Hierarchical Model Missing")
106
+
107
+ # DBSCAN Clustering
108
+ if dbscan_model is not None:
109
+ dbscan = joblib.load(dbscan_model)
110
+ dbscan.set_params(eps=dbscan_eps, min_samples=dbscan_min_samples)
111
+ dbscan_labels = dbscan.fit_predict(pca_data)
112
+ ax[2].scatter(pca_data[:, 0], pca_data[:, 1], c=dbscan_labels, cmap='viridis')
113
+ ax[2].set_title(f"DBSCAN Clustering (eps={dbscan_eps}, min_samples={dbscan_min_samples})")
114
+ else:
115
+ ax[2].set_title("DBSCAN Model Missing")
116
+
117
+ # Display the plots
118
+ st.pyplot(fig)
119
+ else:
120
+ st.warning("Data processing failed due to VIF constraints.")
121
  else:
122
  st.info("Please upload the detectors report file to proceed.")