import base64
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


# Function to create a download link
def get_download_link(data, filename, text):
    """Return an HTML anchor that downloads *data* as *filename*.

    Parameters
    ----------
    data : bytes
        Raw file content to embed in the link.
    filename : str
        Name suggested to the browser for the downloaded file.
    text : str
        Visible link text.
    """
    b64 = base64.b64encode(data).decode()
    # BUG FIX: the original returned only f'{text}' — no <a> tag at all,
    # so the "download link" was plain text and b64/filename were unused.
    href = f'<a href="data:application/octet-stream;base64,{b64}" download="{filename}">{text}</a>'
    return href


# Function to plot correlation matrix
def plot_correlation_matrix(data):
    """Render a correlation heatmap of *data*'s numeric columns in Streamlit."""
    plt.figure(figsize=(12, 10))
    # numeric_only=True avoids an error/deprecation warning when the frame
    # contains non-numeric (object) columns.
    sns.heatmap(data.corr(numeric_only=True), annot=True, cmap='coolwarm', linewidths=0.5)
    plt.title('Correlation Matrix')
    plt.tight_layout()
    # Pass the current figure explicitly; passing the pyplot module to
    # st.pyplot is deprecated. The figure is left open so callers may still
    # save it via plt.savefig afterwards.
    st.pyplot(plt.gcf())


# Function to calculate feature importance
def calculate_feature_importance(X, y):
    """Compute feature importances for (X, y) using several methods.

    Returns
    -------
    dict
        Maps method name -> importance array aligned with X's columns:
        impurity-based importances from Decision Tree / Random Forest /
        XGBoost, permutation importance (Random Forest, held-out split),
        and mutual-information scores (training split).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    methods = {
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "XGBoost": XGBClassifier(random_state=42),
    }

    importance_dict = {}
    for name, model in methods.items():
        model.fit(X_train_scaled, y_train)
        importance_dict[name] = model.feature_importances_

    # Permutation importance is scored on the held-out split so it is not
    # inflated by training-set memorization.
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train_scaled, y_train)
    perm_importance = permutation_importance(
        rf, X_test_scaled, y_test, n_repeats=10, random_state=42
    )
    importance_dict["Permutation"] = perm_importance.importances_mean

    # Mutual Information
    mi_scores = mutual_info_classif(X_train_scaled, y_train)
    importance_dict["Mutual Information"] = mi_scores

    return importance_dict


# Streamlit app
st.title('Heart Disease Feature Analysis')

# File upload
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    data = pd.read_csv(uploaded_file)
    st.write("Data Preview:")
    st.write(data.head())

    # Select target variable
    target_col = st.selectbox("Select the target variable", data.columns)

    if st.button('Analyze'):
        X = data.drop(target_col, axis=1)
        y = data[target_col]

        # Correlation Matrix
        st.subheader('Correlation Matrix')
        plot_correlation_matrix(data)

        # Download correlation matrix as PNG.
        # NOTE(review): this relies on pyplot's current figure still being
        # the heatmap drawn inside plot_correlation_matrix — confirm that
        # helper does not close the figure.
        buf = io.BytesIO()
        plt.savefig(buf, format='png')
        buf.seek(0)
        st.markdown(
            get_download_link(
                buf.getvalue(),
                "correlation_matrix.png",
                "Download Correlation Matrix as PNG",
            ),
            unsafe_allow_html=True,
        )

        # Feature Importance
        st.subheader('Feature Importance')
        importance_dict = calculate_feature_importance(X, y)

        # Create a DataFrame with all feature importances:
        # one column per method, one row per feature.
        importance_df = pd.DataFrame(importance_dict, index=X.columns)
        st.write(importance_df)

        # Download feature importance as XLSX
        excel_buffer = io.BytesIO()
        with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
            importance_df.to_excel(writer, sheet_name='Feature Importance')
        excel_buffer.seek(0)
        st.markdown(
            get_download_link(
                excel_buffer.getvalue(),
                "feature_importance.xlsx",
                "Download Feature Importance as XLSX",
            ),
            unsafe_allow_html=True,
        )
else:
    st.write("Please upload a CSV file to begin the analysis.")