Spencer525
committed on
Commit
•
d80e71f
1
Parent(s):
86685cd
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Streamlit app for exploratory data analysis on an uploaded CSV:
# standardization, PCA projection, KMeans/DBSCAN clustering, and
# outlier detection with LOF and Isolation Forest.
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  # NOTE(review): imported but unused below — kept for compatibility
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

st.title('Data Analysis with Streamlit')

# File uploader: the rest of the app only runs once a CSV is provided.
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    # Read the uploaded CSV file
    df = pd.read_csv(uploaded_file)
    st.write("Data loaded successfully.")
    st.write(df.head())

    # Exclude non-numeric columns: every step below requires numeric input.
    numeric_df = df.select_dtypes(include=[np.number])

    # BUG FIX: PCA(n_components=2) raises when fewer than 2 numeric columns
    # are available — fail with a clear message instead of a traceback.
    if numeric_df.shape[1] < 2:
        st.error("The file needs at least two numeric columns for this analysis.")
        st.stop()

    # BUG FIX: StandardScaler/PCA raise on NaN — drop incomplete rows first,
    # and require enough rows for 3-cluster KMeans + a silhouette score.
    numeric_df = numeric_df.dropna()
    if len(numeric_df) < 4:
        st.error("Not enough complete (NaN-free) numeric rows to analyze.")
        st.stop()

    # Standardize the data so distance-based methods treat columns equally.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(numeric_df)

    # PCA: project to 2 components purely for visualization.
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(scaled_data)
    fig, ax = plt.subplots()
    ax.scatter(pca_result[:, 0], pca_result[:, 1], c='blue', edgecolor='k', s=50)
    ax.set_title('PCA Result')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    st.pyplot(fig)

    # KMeans Clustering.
    # FIX: pin n_init explicitly — the sklearn default changed (10 -> 'auto')
    # and older versions emit a FutureWarning without it.
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(scaled_data)
    fig, ax = plt.subplots()
    ax.scatter(pca_result[:, 0], pca_result[:, 1], c=clusters, cmap='viridis',
               edgecolor='k', s=50)
    ax.set_title('KMeans Clustering')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    st.pyplot(fig)

    # Silhouette score for the KMeans labels (valid: >=4 rows guaranteed above).
    silhouette_avg = silhouette_score(scaled_data, clusters)
    st.write('Silhouette Score:', silhouette_avg)

    # Local Outlier Factor (LOF).
    # BUG FIX: n_neighbors must be < n_samples; clamp for small datasets.
    lof = LocalOutlierFactor(n_neighbors=min(20, len(scaled_data) - 1))
    lof_labels = lof.fit_predict(scaled_data)
    lof_outliers = np.sum(lof_labels == -1)
    st.write("Number of outliers detected by LOF:", lof_outliers)

    # Isolation Forest (contamination fixed at 10%, as in the original design).
    isolation_forest = IsolationForest(contamination=0.1, random_state=42)
    isolation_labels = isolation_forest.fit_predict(scaled_data)
    isolation_outliers = np.sum(isolation_labels == -1)
    st.write("Number of outliers detected by Isolation Forest:", isolation_outliers)

    # DBSCAN.
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    dbscan_labels = dbscan.fit_predict(scaled_data)
    # BUG FIX: silhouette_score requires 2 <= n_labels <= n_samples - 1;
    # DBSCAN can return a single label (e.g. all points flagged as noise),
    # which previously crashed the app with a ValueError.
    n_dbscan_labels = len(set(dbscan_labels))
    if 1 < n_dbscan_labels < len(scaled_data):
        silhouette_dbscan = silhouette_score(scaled_data, dbscan_labels)
        st.write("DBSCAN Silhouette Score:", silhouette_dbscan)
    else:
        st.write("DBSCAN produced a single cluster/noise group; "
                 "silhouette score is undefined.")

# To run this Streamlit app, save it as a .py file and execute:
#   streamlit run <filename>.py
# FIX: removed the trailing print() — stdout output is invisible/noise
# in a Streamlit app; user-facing messages go through st.write/st.error.
|