# app.py - Streamlit customer segmentation demo.
# Provenance (Hugging Face file viewer): uploaded by simran0608, commit 346c8ea
# ("Update app.py", verified), 7.05 kB.
import streamlit as st
import pandas as pd
import seaborn as sns
import numpy as np
import pickle
import matplotlib.pyplot as plt
from src.data_preparation import preprocess_data
from src.clustering import perform_clustering, plot_clusters
from src.feature_selection import select_features_pca, select_features_rfe, select_features_rf
import os
from sklearn.preprocessing import StandardScaler
def load_data(dataset_choice):
    """Load the raw dataset that corresponds to the selected business domain.

    Args:
        dataset_choice: One of "Insurance", "Retail" or "Banking".

    Returns:
        The pandas DataFrame read from the matching file. File names are
        relative, so they are resolved against the current working directory.

    Raises:
        ValueError: If ``dataset_choice`` is not one of the supported names.
    """
    # Dataset name -> (pandas reader, file name). Every file is read as latin1.
    sources = {
        "Insurance": (pd.read_sas, 'a2z_insurance.sas7bdat'),
        "Retail": (pd.read_csv, 'retaildata.csv'),
        "Banking": (pd.read_csv, 'bankingdata.csv'),
    }
    if dataset_choice not in sources:
        # Fail with a clear message instead of falling through to a return of
        # a variable that was never assigned.
        supported = ", ".join(sources)
        raise ValueError(
            f"Unknown dataset {dataset_choice!r}; expected one of: {supported}"
        )
    reader, file_name = sources[dataset_choice]
    return reader(file_name, encoding='latin1')
# Function to summarize cluster characteristics
def summarize_cluster_characteristics(clustered_data, labels, cluster_number):
    """Return the per-column mean of the rows assigned to one cluster.

    Args:
        clustered_data: Table holding the clustered observations.
        labels: Cluster label for each row of ``clustered_data``.
        cluster_number: Label of the cluster to summarise.

    Returns:
        A dict mapping each column name to its mean over the selected rows.
    """
    in_cluster = labels == cluster_number
    return clustered_data[in_cluster].mean().to_dict()
# Function to display Business Understanding section
def display_business_understanding():
    """Render the "Business Objective" page: heading, markdown text and image."""
    # Markdown body shown under the subheader; kept flush-left so Markdown does
    # not interpret the lines as an indented code block.
    objective_markdown = """
###### Customer segmentation is a fundamental task in marketing and customer relationship management. With the advancements in data analytics and machine learning, it is now possible to group customers into distinct segments with a high degree of precision, allowing businesses to tailor their marketing strategies and offerings to each segment's unique needs and preferences.
###### Through this customer segmentation, businesses can achieve:
- **Personalization**: Tailoring marketing strategies to meet the unique needs of each segment.
- **Optimization**: Efficient allocation of marketing resources.
- **Insight**: Gaining a deeper understanding of the customer base.
- **Engagement**: Enhancing customer engagement and satisfaction.
###### => Problem/Requirement: Utilize machine learning and data analysis techniques in Python to perform customer segmentation.
"""
    st.subheader("Business Objective")
    st.write(objective_markdown)
    st.image(
        "Customer-Segmentation.png",
        caption="Customer Segmentation",
        use_column_width=True,
    )
# Function to display Dataset section
def display_dataset_selection():
    """Let the user pick a dataset, show its size and a preview, and return it.

    Returns:
        The DataFrame produced by ``load_data`` for the selected dataset.
    """
    available_datasets = ("Insurance", "Retail", "Banking")
    dataset_choice = st.selectbox("Select Dataset", available_datasets)
    frame = load_data(dataset_choice)

    row_count, column_count = frame.shape
    st.write(f"Dataset: {dataset_choice}")
    st.write("Number of rows:", row_count)
    st.write("Number of columns:", column_count)

    st.write("First five rows of the data:")
    st.write(frame.head())
    return frame
# Function to display Modeling & Evaluation section
def _select_features(data, method, n_selected):
    """Run the feature-selection routine named by ``method``.

    Returns whatever the chosen ``select_features_*`` helper returns; the
    caller unpacks it as ``(selected_data, selected_features)``.
    """
    selectors = {
        'PCA': select_features_pca,
        'RFE': select_features_rfe,
        'Random Forest': select_features_rf,
    }
    return selectors[method](data, n_selected)


def _render_cluster_prediction():
    """Show the prediction form and, once submitted, the predicted cluster."""
    state = st.session_state
    st.write("### Predict Cluster")

    # A form batches the inputs so the script only acts on them on "Predict".
    with st.form(key='prediction_form'):
        user_input = {}
        for feature in state.selected_features:
            user_input[feature] = st.number_input(f'Enter {feature}', value=0.0)
        submit_button = st.form_submit_button(label='Predict')
    if not submit_button:
        return

    user_df = pd.DataFrame(user_input, index=[0])
    # Only *transform* with the scaler fitted at clustering time. Fitting a
    # fresh StandardScaler on this single row would centre it on itself and
    # turn every input into zeros, making the prediction input-independent.
    user_df_scaled = state.scaler.transform(user_df.to_numpy())
    cluster = state.model.predict(user_df_scaled)
    st.write(f'The predicted cluster for the input data is: {cluster[0]}')

    # Describe the predicted cluster by the mean of each of its columns.
    summary = summarize_cluster_characteristics(
        state.clustered_data, state.labels, cluster[0]
    )
    header = f"Based on the input features, the customer belongs to Cluster {cluster[0]}, which is characterized by the following average values:\n"
    inference = header + "".join(
        f"- {feature}: {value:.2f}\n" for feature, value in summary.items()
    )
    st.write(inference)
    plot_clusters(state.df_value_scaled, state.labels, new_data_point=user_df_scaled)


def display_modeling_evaluation():
    """Render the modeling page: configure, cluster, then predict new points.

    The sidebar collects the feature-selection method, the number of clusters
    and the number of components/features. Pressing "Cluster" runs feature
    selection and clustering, displays the results and stores them in
    ``st.session_state`` so they survive Streamlit reruns. Once results are
    stored, a form lets the user enter feature values and see the predicted
    cluster together with that cluster's average values.
    """
    dataset_choice = st.selectbox("Select Dataset", ("Insurance", "Retail", "Banking"))
    data = preprocess_data(load_data(dataset_choice))

    # Sidebar for feature selection and clustering method
    st.sidebar.header("Feature Selection and Clustering Method")
    feature_selection_method = st.sidebar.selectbox(
        "Select feature selection method", ('PCA', 'RFE', 'Random Forest')
    )
    n_clusters = st.sidebar.slider("Number of clusters", min_value=2, max_value=10, value=3)
    if feature_selection_method == 'PCA':
        count_label = "Number of PCA components"
    else:
        count_label = "Number of features to select"
    n_selected = st.sidebar.slider(count_label, min_value=2, max_value=10, value=5)

    # Perform clustering on button click
    if st.sidebar.button("Cluster"):
        selected_data, selected_features = _select_features(
            data, feature_selection_method, n_selected
        )
        st.write(f"Selected Features: {selected_features}")

        # Fit the scaler before clustering so that anything perform_clustering
        # might add to selected_data cannot leak into the scaling statistics.
        # NOTE(review): assumes perform_clustering standardises selected_data
        # with a StandardScaler to obtain df_value_scaled - confirm against
        # src.clustering; if it exposes its own scaler, store that instead.
        scaler = StandardScaler().fit(np.asarray(selected_data))

        clustered_data, score, df_value_scaled, labels, model = perform_clustering(
            selected_data, n_clusters
        )
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {score}")
        st.write("Clustered Data")
        st.write(clustered_data)
        st.write("Cluster Visualization")
        plot_clusters(df_value_scaled, labels)

        # Persist everything the prediction step needs across reruns.
        st.session_state.selected_features = selected_features
        st.session_state.model = model
        st.session_state.scaler = scaler
        st.session_state.clustered_data = clustered_data
        st.session_state.labels = labels
        st.session_state.df_value_scaled = df_value_scaled

    # Predict new data based on selected features
    required_keys = ('selected_features', 'model', 'scaler')
    if all(key in st.session_state for key in required_keys):
        _render_cluster_prediction()
# Main app structure
def main():
    """Draw the page titles and route to the section chosen in the sidebar."""
    st.title("Customer Segmentation Demo")
    st.header("Customer Segmentation")

    # Sidebar label -> function that renders that section, in menu order.
    pages = {
        "Business Understanding": display_business_understanding,
        "Dataset": display_dataset_selection,
        "Modeling & Prediction": display_modeling_evaluation,
    }
    choice = st.sidebar.selectbox('Menu', list(pages))
    render_page = pages.get(choice)
    if render_page is not None:
        render_page()


# Run the app only when this file is executed as the entry script.
if __name__ == "__main__":
    main()