"""Streamlit app: exploratory analysis of an uploaded CSV file.

The numeric columns of the uploaded file are standardised and then:

* projected onto two principal components (PCA) and plotted,
* clustered with KMeans (plotted on the PCA plane, silhouette score shown),
* screened for outliers with Local Outlier Factor and Isolation Forest,
* clustered with DBSCAN (silhouette score shown).
"""

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Number of KMeans clusters; also the minimum number of usable rows.
N_CLUSTERS = 3
# Upper bound for the LOF neighbourhood size (clamped for small datasets).
LOF_NEIGHBORS = 20
# PCA is run with two components, so at least two numeric columns are needed.
MIN_FEATURES = 2


def _prepare_numeric(df):
    """Return the numeric columns of *df* with unusable values removed.

    Infinite values are treated as missing, columns that are entirely
    missing are dropped, and then every row that still contains a missing
    value is dropped, because PCA and KMeans reject NaN/inf input.
    """
    numeric_df = df.select_dtypes(include=[np.number])
    numeric_df = numeric_df.replace([np.inf, -np.inf], np.nan)
    numeric_df = numeric_df.dropna(axis=1, how="all")
    return numeric_df.dropna(axis=0, how="any")


def _silhouette_or_none(data, labels):
    """Return the silhouette score, or ``None`` when it is undefined.

    ``silhouette_score`` raises ``ValueError`` unless the number of distinct
    labels lies between 2 and ``n_samples - 1``; DBSCAN in particular often
    yields a single label (everything marked as noise).
    """
    n_labels = len(np.unique(labels))
    if 2 <= n_labels <= len(labels) - 1:
        return silhouette_score(data, labels)
    return None


def _show_scatter(points, title, **scatter_kwargs):
    """Render a 2-D scatter plot of *points* on the PCA plane in the app."""
    fig, ax = plt.subplots()
    ax.scatter(points[:, 0], points[:, 1], edgecolor='k', s=50,
               **scatter_kwargs)
    ax.set_title(title)
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    st.pyplot(fig)
    # pyplot keeps a global reference to every figure; close it so figures
    # do not accumulate across Streamlit reruns.
    plt.close(fig)


def _run_analysis(df):
    """Run and display the full analysis pipeline for *df*.

    Shows an error and returns early when the data cannot support the
    analysis (too few numeric columns or too few complete rows).
    """
    numeric_df = _prepare_numeric(df)
    n_samples, n_features = numeric_df.shape

    if n_features < MIN_FEATURES:
        st.error(
            f"At least {MIN_FEATURES} numeric columns containing data are "
            "required for the analysis."
        )
        return
    if n_samples < N_CLUSTERS:
        st.error(
            f"At least {N_CLUSTERS} rows without missing numeric values are "
            "required for the analysis."
        )
        return

    dropped = len(df) - n_samples
    if dropped:
        st.warning(
            f"Ignored {dropped} row(s) containing missing or infinite "
            "numeric values."
        )

    scaled_data = StandardScaler().fit_transform(numeric_df)

    # PCA projection, reused as the plotting plane for the clustering result.
    pca_result = PCA(n_components=2).fit_transform(scaled_data)
    _show_scatter(pca_result, 'PCA Result', c='blue')

    # KMeans clustering.
    clusters = KMeans(n_clusters=N_CLUSTERS, random_state=42).fit_predict(
        scaled_data
    )
    _show_scatter(pca_result, 'KMeans Clustering', c=clusters, cmap='viridis')

    silhouette_avg = _silhouette_or_none(scaled_data, clusters)
    if silhouette_avg is None:
        st.warning(
            "Silhouette Score is undefined for the KMeans result "
            "(it needs between 2 and n_samples - 1 distinct clusters)."
        )
    else:
        st.write('Silhouette Score:', silhouette_avg)

    # Local Outlier Factor; n_neighbors must be smaller than the sample count.
    lof = LocalOutlierFactor(n_neighbors=min(LOF_NEIGHBORS, n_samples - 1))
    lof_labels = lof.fit_predict(scaled_data)
    lof_outliers = np.sum(lof_labels == -1)
    st.write("Number of outliers detected by LOF:", lof_outliers)

    # Isolation Forest.
    isolation_forest = IsolationForest(contamination=0.1, random_state=42)
    isolation_labels = isolation_forest.fit_predict(scaled_data)
    isolation_outliers = np.sum(isolation_labels == -1)
    st.write("Number of outliers detected by Isolation Forest:",
             isolation_outliers)

    # DBSCAN; noise points (label -1) are scored as a group of their own.
    dbscan_labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(scaled_data)
    silhouette_dbscan = _silhouette_or_none(scaled_data, dbscan_labels)
    if silhouette_dbscan is None:
        st.warning(
            "DBSCAN Silhouette Score is undefined: DBSCAN did not produce "
            "between 2 and n_samples - 1 distinct labels."
        )
    else:
        st.write("DBSCAN Silhouette Score:", silhouette_dbscan)


st.title('Data Analysis with Streamlit')

uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError,
            UnicodeDecodeError) as exc:
        # Report unreadable uploads in the UI instead of a raw traceback.
        st.error(f"Could not read the CSV file: {exc}")
    else:
        st.write("Data loaded successfully.")
        st.write(df.head())
        _run_analysis(df)

# NOTE(review): this prints to the server console on every script run and
# looks like a leftover from code generation -- confirm whether it is needed.
print("Streamlit app code generated.")