# Create a Streamlit app for data analysis
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Streamlit app
st.title('Data Analysis with Streamlit')

# File uploader
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    # Read the uploaded CSV file
    df = pd.read_csv(uploaded_file)
    st.write("Data loaded successfully.")
    st.write(df.head())

    # Keep only numeric columns and drop rows with missing values,
    # since StandardScaler and the estimators below cannot handle NaNs
    numeric_df = df.select_dtypes(include=[np.number]).dropna()

    # Standardize the data
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(numeric_df)

    # PCA: project the standardized data onto its first two principal components
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(scaled_data)
    fig, ax = plt.subplots()
    ax.scatter(pca_result[:, 0], pca_result[:, 1], c='blue', edgecolor='k', s=50)
    ax.set_title('PCA Result')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    st.pyplot(fig)

    # KMeans clustering on the standardized data, visualized in PCA space
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(scaled_data)
    fig, ax = plt.subplots()
    ax.scatter(pca_result[:, 0], pca_result[:, 1], c=clusters, cmap='viridis', edgecolor='k', s=50)
    ax.set_title('KMeans Clustering')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    st.pyplot(fig)

    # Silhouette score for the KMeans labels
    silhouette_avg = silhouette_score(scaled_data, clusters)
    st.write('KMeans Silhouette Score:', silhouette_avg)

    # Local Outlier Factor (LOF): points labeled -1 are outliers
    lof = LocalOutlierFactor(n_neighbors=20)
    lof_labels = lof.fit_predict(scaled_data)
    lof_outliers = np.sum(lof_labels == -1)
    st.write("Number of outliers detected by LOF:", lof_outliers)

    # Isolation Forest: contamination is the assumed fraction of outliers
    isolation_forest = IsolationForest(contamination=0.1, random_state=42)
    isolation_labels = isolation_forest.fit_predict(scaled_data)
    isolation_outliers = np.sum(isolation_labels == -1)
    st.write("Number of outliers detected by Isolation Forest:", isolation_outliers)

    # DBSCAN: -1 marks noise points; the silhouette score is only defined
    # when DBSCAN finds at least two distinct labels
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    dbscan_labels = dbscan.fit_predict(scaled_data)
    if len(set(dbscan_labels)) > 1:
        silhouette_dbscan = silhouette_score(scaled_data, dbscan_labels)
        st.write("DBSCAN Silhouette Score:", silhouette_dbscan)
    else:
        st.write("DBSCAN found fewer than two clusters; silhouette score is undefined.")

# To run this Streamlit app, save it as a .py file and execute:
#   streamlit run <filename>.py
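
# Optional extension (a minimal sketch, not part of the original script): the KMeans
# cluster count above is hard-coded to 3. Streamlit's st.slider can expose it as an
# interactive control instead; the widget label, bounds, and variable names below are
# illustrative assumptions. The block reuses scaled_data, which is defined above when
# a file has been uploaded.
if uploaded_file is not None:
    k = st.slider('Number of KMeans clusters', min_value=2, max_value=10, value=3)
    interactive_kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    interactive_clusters = interactive_kmeans.fit_predict(scaled_data)
    st.write('Silhouette Score for k =', k, ':', silhouette_score(scaled_data, interactive_clusters))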