Spencer525 committed on
Commit
3ca4800
·
verified ·
1 Parent(s): 54f1463

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -0
app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app: cluster the numeric columns of an uploaded CSV file.

The app compares K-Means, hierarchical (agglomerative) clustering and DBSCAN
using the silhouette score, plots the first two numeric columns coloured by
cluster, and, when the file contains a date-like column, fits an ARIMA model
to a numeric column indexed by that date column.

Run with ``streamlit run app.py``.
"""

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_predict
import matplotlib.pyplot as plt
import seaborn as sns

# Range of cluster counts offered by the sliders and used in the score table.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 7


def _safe_silhouette(features, labels):
    """Return the silhouette score, or ``None`` when it is undefined.

    scikit-learn only defines the score when the number of distinct labels is
    between 2 and ``n_samples - 1`` and raises ``ValueError`` otherwise, so
    the label count is checked first instead of letting the app crash.
    """
    n_labels = np.unique(labels).size
    if 2 <= n_labels <= len(labels) - 1:
        return silhouette_score(features, labels)
    return None


def _find_datetime_column(frame):
    """Return ``(name, timestamps)`` for the first date-like column.

    ``pd.read_csv`` loads dates as plain text unless told otherwise, so text
    columns are tentatively parsed with ``pd.to_datetime``. A column counts
    as date-like only when every non-missing value parses. Returns
    ``(None, None)`` when no column qualifies.
    """
    for col in frame.columns:
        column = frame[col]
        if pd.api.types.is_datetime64_any_dtype(column):
            return col, column
        if pd.api.types.is_numeric_dtype(column):
            # Plain numbers would be read as offsets from the epoch.
            continue
        try:
            parsed = pd.to_datetime(column, errors="coerce")
        except (ValueError, TypeError):
            continue
        n_present = column.notna().sum()
        if n_present > 0 and parsed.notna().sum() == n_present:
            return col, parsed
    return None, None


def _run_clustering(features):
    """Render the controls for the chosen algorithm and fit it.

    Returns the array of cluster labels, or ``None`` when the requested
    number of clusters exceeds the number of rows.
    """
    method = st.selectbox(
        "Choose a clustering method",
        ["K-Means", "Hierarchical Clustering", "DBSCAN"],
    )

    if method == "DBSCAN":
        eps_value = st.slider(
            "Select eps value for DBSCAN", min_value=0.1, max_value=2.0, value=0.5
        )
        min_samples_value = st.slider(
            "Select minimum samples for DBSCAN", min_value=1, max_value=10, value=5
        )
        model = DBSCAN(eps=eps_value, min_samples=min_samples_value)
        # DBSCAN marks noise points with the label -1; as in the original
        # script, that label is counted like any other group below.
        labels = model.fit_predict(features)
        score = _safe_silhouette(features, labels)
        if score is None:
            st.write(
                "DBSCAN did not form valid clusters. Try adjusting eps or min_samples."
            )
        else:
            st.write(f"DBSCAN Silhouette Score: {score}")
        return labels

    n_clusters = st.slider(
        f"Select number of clusters for {method}",
        min_value=MIN_CLUSTERS,
        max_value=MAX_CLUSTERS,
        value=3,
    )
    n_samples = features.shape[0]
    if n_clusters > n_samples:
        st.warning(
            f"Cannot form {n_clusters} clusters from only {n_samples} row(s)."
        )
        return None

    if method == "K-Means":
        # n_init is pinned so results do not change with the scikit-learn
        # version (its default value changed between releases).
        model = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    else:
        model = AgglomerativeClustering(n_clusters=n_clusters)
    labels = model.fit_predict(features)

    score = _safe_silhouette(features, labels)
    if score is None:
        st.warning("Silhouette score is undefined for this clustering.")
    else:
        st.write(f"{method} Silhouette Score for {n_clusters} clusters: {score}")
    return labels


def _plot_clusters(features, labels, column_names):
    """Show the distinct labels and a scatter plot of the first two features."""
    st.write("Cluster Labels:", np.unique(labels))
    if features.shape[1] < 2:
        st.info("At least two numerical columns are needed for the scatter plot.")
        return

    # Draw on an explicit figure: the script is re-run on every interaction,
    # so relying on pyplot's global state would stack plots on top of each
    # other.
    fig, ax = plt.subplots()
    sns.scatterplot(
        x=features[:, 0], y=features[:, 1], hue=labels, palette='Set1', ax=ax
    )
    ax.set_xlabel(column_names[0])
    ax.set_ylabel(column_names[1])
    st.pyplot(fig)
    plt.close(fig)


def _run_arima(data, numerical_cols):
    """Fit and plot an ARIMA model when the data has a date-like column.

    The date-like column becomes the index and the user chooses which
    numeric column to model. Nothing is rendered when no date-like column
    exists.
    """
    time_col, timestamps = _find_datetime_column(data)
    if time_col is None:
        return

    target_col = st.selectbox("Column to model with ARIMA", numerical_cols)
    st.write("Time Series Analysis (ARIMA) on column:", target_col)
    st.write("Time index column:", time_col)

    series = pd.Series(
        data[target_col].to_numpy(),
        index=pd.DatetimeIndex(timestamps),
        name=target_col,
    )
    series = series[series.index.notna()].dropna().sort_index()
    if series.empty:
        st.warning("No usable observations for the time series analysis.")
        return

    # ARIMA model order
    p = st.number_input("ARIMA p value", min_value=0, max_value=5, value=1)
    d = st.number_input("ARIMA d value", min_value=0, max_value=2, value=1)
    q = st.number_input("ARIMA q value", min_value=0, max_value=5, value=1)

    try:
        arima_result = ARIMA(series, order=(p, d, q)).fit()
    except (ValueError, np.linalg.LinAlgError) as err:
        st.error(f"ARIMA model could not be fitted: {err}")
        return

    # Display ARIMA result summary
    st.write(arima_result.summary())

    # Results from statsmodels.tsa.arima.model.ARIMA have no plot_predict
    # method; the plotting helper lives in statsmodels.graphics.tsaplots.
    fig, ax = plt.subplots()
    plot_predict(arima_result, dynamic=False, ax=ax)
    st.pyplot(fig)
    plt.close(fig)


def _show_silhouette_table(features):
    """Tabulate K-Means and hierarchical silhouette scores per cluster count."""
    st.write("### Silhouette Score Table for 2-7 Clusters")

    # The silhouette score needs at most n_samples - 1 clusters.
    largest = min(MAX_CLUSTERS, features.shape[0] - 1)
    rows = []
    for n_clusters in range(MIN_CLUSTERS, largest + 1):
        kmeans_labels = KMeans(
            n_clusters=n_clusters, n_init=10, random_state=42
        ).fit_predict(features)
        hierarchical_labels = AgglomerativeClustering(
            n_clusters=n_clusters
        ).fit_predict(features)
        rows.append(
            {
                'Number of Clusters': n_clusters,
                'K-Means Silhouette Score': _safe_silhouette(
                    features, kmeans_labels
                ),
                'Hierarchical Silhouette Score': _safe_silhouette(
                    features, hierarchical_labels
                ),
            }
        )

    if not rows:
        st.warning("Not enough rows to compute silhouette scores.")
        return

    silhouette_df = pd.DataFrame(
        rows,
        columns=[
            'Number of Clusters',
            'K-Means Silhouette Score',
            'Hierarchical Silhouette Score',
        ],
    )
    st.write(silhouette_df)


def main():
    """Render the whole app: upload, preprocessing, clustering, ARIMA, table."""
    # Streamlit app title
    st.title('Clustering and Time Series Analysis')

    # Step 1: Upload CSV file
    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if uploaded_file is None:
        return

    try:
        data = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as err:
        st.error(f"Could not read the CSV file: {err}")
        return
    st.write("Dataset Preview:", data.head())

    # Step 2: Data Preprocessing
    # Selecting only numerical columns for clustering
    numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    st.write("Numerical columns for clustering:", numerical_cols)
    if not numerical_cols:
        st.error("The uploaded file has no numerical columns to cluster.")
        return

    # The scaler and the clustering estimators reject missing values, so
    # incomplete rows are left out of the clustering steps.
    numeric_frame = data[numerical_cols]
    complete_rows = numeric_frame.dropna()
    n_dropped = len(numeric_frame) - len(complete_rows)
    if n_dropped:
        st.warning(f"Ignoring {n_dropped} row(s) with missing numerical values.")
    if complete_rows.empty:
        st.error("No complete rows are left to cluster.")
        return

    # Option to scale data or not
    scale_data = st.checkbox("Scale Data", value=True)
    if scale_data:
        data_scaled = StandardScaler().fit_transform(complete_rows)
    else:
        data_scaled = complete_rows.values

    # Step 3: Clustering Algorithm Selection
    cluster_labels = _run_clustering(data_scaled)

    # Step 4: Visualize the clusters if valid
    if cluster_labels is not None and np.unique(cluster_labels).size > 1:
        _plot_clusters(data_scaled, cluster_labels, numerical_cols)

    # Step 5: ARIMA Time Series Analysis
    _run_arima(data, numerical_cols)

    # Step 6: Create Silhouette Score Table for K-Means and Hierarchical Clustering
    _show_silhouette_table(data_scaled)


if __name__ == "__main__":
    main()