import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.preprocessing import StandardScaler

from src.clustering import perform_clustering, plot_clusters
from src.data_preparation import preprocess_data
from src.feature_selection import (
    select_features_pca,
    select_features_rf,
    select_features_rfe,
)

# Dataset name shown in the UI -> (pandas reader, file path).
_DATASET_SOURCES = {
    "Insurance": (pd.read_sas, 'a2z_insurance.sas7bdat'),
    "Retail": (pd.read_csv, 'retaildata.csv'),
    "Banking": (pd.read_csv, 'bankingdata.csv'),
}

# Feature-selection method shown in the UI -> selector function.
_FEATURE_SELECTORS = {
    'PCA': select_features_pca,
    'RFE': select_features_rfe,
    'Random Forest': select_features_rf,
}

# Session-state keys that must all be present before the prediction form
# can be rendered; they are written together after a clustering run.
_PREDICTION_STATE_KEYS = (
    'selected_features',
    'model',
    'clustered_data',
    'labels',
    'df_value_scaled',
    'input_scaler',
)

# Markdown for the "Business Objective" page.  Kept at column 0 so that no
# line is indented far enough to be rendered as a code block.
_BUSINESS_OBJECTIVE_MD = """
###### Customer segmentation is a fundamental task in marketing and customer relationship management. With the advancements in data analytics and machine learning, it is now possible to group customers into distinct segments with a high degree of precision, allowing businesses to tailor their marketing strategies and offerings to each segment's unique needs and preferences.

###### Through this customer segmentation, businesses can achieve:
- **Personalization**: Tailoring marketing strategies to meet the unique needs of each segment.
- **Optimization**: Efficient allocation of marketing resources.
- **Insight**: Gaining a deeper understanding of the customer base.
- **Engagement**: Enhancing customer engagement and satisfaction.

###### => Problem/Requirement: Utilize machine learning and data analysis techniques in Python to perform customer segmentation.
"""


def load_data(dataset_choice):
    """Load the raw dataset that corresponds to *dataset_choice*.

    Args:
        dataset_choice: One of ``"Insurance"``, ``"Retail"`` or ``"Banking"``.

    Returns:
        The ``pandas.DataFrame`` read from the matching file (latin1 encoded).

    Raises:
        ValueError: If *dataset_choice* is not a known dataset name.
    """
    try:
        reader, path = _DATASET_SOURCES[dataset_choice]
    except KeyError:
        # Previously an unknown name fell through every branch and surfaced
        # as an UnboundLocalError on ``return data``.
        known = ", ".join(_DATASET_SOURCES)
        raise ValueError(
            f"Unknown dataset {dataset_choice!r}; expected one of: {known}"
        ) from None
    return reader(path, encoding='latin1')


def summarize_cluster_characteristics(clustered_data, labels, cluster_number):
    """Return the per-column mean of the rows assigned to one cluster.

    Args:
        clustered_data: DataFrame holding one row per clustered sample.
        labels: Cluster label per row, aligned positionally with
            *clustered_data*.
        cluster_number: The cluster whose rows should be summarised.

    Returns:
        A ``dict`` mapping column name to the mean of that column within the
        cluster.  Only numeric columns are included.
    """
    members = clustered_data[labels == cluster_number]
    # numeric_only avoids a TypeError on pandas >= 2.0 when the frame still
    # carries non-numeric columns (ids, categories, ...).
    return members.mean(numeric_only=True).to_dict()


def display_business_understanding():
    """Render the "Business Understanding" page (objective text and image)."""
    st.subheader("Business Objective")
    st.write(_BUSINESS_OBJECTIVE_MD)
    st.image(
        "Customer-Segmentation.png",
        caption="Customer Segmentation",
        use_column_width=True,
    )


def display_dataset_selection():
    """Render the "Dataset" page: pick a dataset and show its size and head.

    Returns:
        The DataFrame loaded for the selected dataset.
    """
    dataset_choice = st.selectbox(
        "Select Dataset", ("Insurance", "Retail", "Banking")
    )
    data = load_data(dataset_choice)

    n_rows, n_columns = data.shape[0], data.shape[1]
    st.write(f"Dataset: {dataset_choice}")
    st.write("Number of rows:", n_rows)
    st.write("Number of columns:", n_columns)
    st.write("First five rows of the data:")
    st.write(data.head())
    return data


def _fit_input_scaler(selected_data, selected_features):
    """Fit a StandardScaler on the training features used for clustering.

    The scaler is later applied to user-entered values so that they are
    standardised with the training mean/variance rather than their own.

    NOTE(review): this assumes ``perform_clustering`` standardises
    ``selected_data`` with a plain ``StandardScaler`` - confirm, or better,
    have ``perform_clustering`` return the scaler it actually fitted.
    """
    frame = pd.DataFrame(selected_data)
    feature_names = list(selected_features)
    # Align column order with the order of the prediction form inputs when
    # the selected feature names are actual columns of the training frame.
    if feature_names and all(name in frame.columns for name in feature_names):
        frame = frame[feature_names]
    # Fit on a bare array so later transforms are not rejected because of
    # feature-name mismatches.
    return StandardScaler().fit(np.asarray(frame, dtype=float))


def _run_clustering(data, feature_selection_method, n_selected, n_clusters):
    """Select features, cluster, display the results and cache them.

    Everything the prediction form needs afterwards is stored in
    ``st.session_state`` so that it survives Streamlit reruns.
    """
    selector = _FEATURE_SELECTORS[feature_selection_method]
    selected_data, selected_features = selector(data, n_selected)
    st.write(f"Selected Features: {selected_features}")

    clustered_data, score, df_value_scaled, labels, model = perform_clustering(
        selected_data, n_clusters
    )
    st.write(f"Number of Clusters: {n_clusters}")
    st.write(f"Silhouette Score: {score}")
    st.write("Clustered Data")
    st.write(clustered_data)
    st.write("Cluster Visualization")
    plot_clusters(df_value_scaled, labels)

    # Store selected features and model in session state
    st.session_state.selected_features = selected_features
    st.session_state.model = model
    st.session_state.clustered_data = clustered_data
    st.session_state.labels = labels
    st.session_state.df_value_scaled = df_value_scaled
    st.session_state.input_scaler = _fit_input_scaler(
        selected_data, selected_features
    )


def _render_prediction_form():
    """Render the "Predict Cluster" form and show the prediction on submit."""
    st.write("### Predict Cluster")

    # Use st.form so the inputs are only submitted together on "Predict".
    with st.form(key='prediction_form'):
        user_input = {
            feature: st.number_input(f'Enter {feature}', value=0.0)
            for feature in st.session_state.selected_features
        }
        submit_button = st.form_submit_button(label='Predict')

    if not submit_button:
        return

    user_df = pd.DataFrame(user_input, index=[0])
    # Reuse the scaler fitted on the training features.  Fitting a fresh
    # StandardScaler on this single row would map every input to all zeros
    # and therefore always predict the same cluster.
    user_df_scaled = st.session_state.input_scaler.transform(
        user_df.to_numpy(dtype=float)
    )
    cluster = st.session_state.model.predict(user_df_scaled)
    st.write(f'The predicted cluster for the input data is: {cluster[0]}')

    # Get the clustered data and labels from session state
    clustered_data = st.session_state.clustered_data
    labels = st.session_state.labels
    df_value_scaled = st.session_state.df_value_scaled

    # Summarize cluster characteristics
    summary = summarize_cluster_characteristics(
        clustered_data, labels, cluster[0]
    )

    # Generate and display the inference
    inference = (
        f"Based on the input features, the customer belongs to Cluster "
        f"{cluster[0]}, which is characterized by the following average "
        f"values:\n"
    )
    inference += "".join(
        f"- {feature}: {value:.2f}\n" for feature, value in summary.items()
    )
    st.write(inference)

    plot_clusters(df_value_scaled, labels, new_data_point=user_df_scaled)


def display_modeling_evaluation():
    """Render the "Modeling & Prediction" page.

    Lets the user pick a dataset, a feature-selection method and a number of
    clusters in the sidebar, runs clustering when "Cluster" is pressed, and -
    once a clustering result exists in the session - offers a form to predict
    the cluster of a manually entered data point.
    """
    dataset_choice = st.selectbox(
        "Select Dataset", ("Insurance", "Retail", "Banking")
    )
    data = preprocess_data(load_data(dataset_choice))

    # Sidebar for feature selection and clustering method
    st.sidebar.header("Feature Selection and Clustering Method")
    feature_selection_method = st.sidebar.selectbox(
        "Select feature selection method", ('PCA', 'RFE', 'Random Forest')
    )
    n_clusters = st.sidebar.slider(
        "Number of clusters", min_value=2, max_value=10, value=3
    )
    if feature_selection_method == 'PCA':
        slider_label = "Number of PCA components"
    else:
        slider_label = "Number of features to select"
    n_selected = st.sidebar.slider(
        slider_label, min_value=2, max_value=10, value=5
    )

    # Perform clustering on button click
    if st.sidebar.button("Cluster"):
        _run_clustering(data, feature_selection_method, n_selected, n_clusters)

    # Predict new data based on selected features
    if all(key in st.session_state for key in _PREDICTION_STATE_KEYS):
        _render_prediction_form()


def main():
    """Entry point: draw the title and dispatch to the selected page."""
    st.title("Customer Segmentation Demo")
    st.header("Customer Segmentation")

    # Sidebar menu options
    pages = {
        "Business Understanding": display_business_understanding,
        "Dataset": display_dataset_selection,
        "Modeling & Prediction": display_modeling_evaluation,
    }
    choice = st.sidebar.selectbox('Menu', list(pages))
    render_page = pages.get(choice)
    if render_page is not None:
        render_page()


if __name__ == "__main__":
    main()