from typing import Tuple, cast import pandas as pd import streamlit as st from common.data import Dataset, SplitDataset from common.util import ( undersample_training_data, ) from common.views import ( streamlit_2columns_metrics_df_shape, streamlit_2columns_metrics_series, streamlit_2columns_metrics_pct_series, streamlit_2columns_metrics_df, streamlit_2columns_metrics_pct_df, ) # Initialize dataframe session state def initialise_data() -> Tuple[Dataset, SplitDataset]: if "input_data_frame" not in st.session_state: st.session_state.input_data_frame = pd.read_csv( r"./data/processed/cr_loan_w2.csv" ) if "dataset" not in st.session_state: df = cast(pd.DataFrame, st.session_state.input_data_frame) dataset = Dataset( df=df, random_state=123235, test_size=40, ) st.session_state.dataset = dataset else: dataset = st.session_state.dataset st.write( "Assuming data is already cleaned and relevant features (predictors) added." ) with st.expander("Input Dataframe (X and y)"): st.dataframe(dataset.df) streamlit_2columns_metrics_df_shape(dataset.df) st.header("Predictors") possible_columns = dataset.x_values_column_names selected_columns = st.sidebar.multiselect( label="Select Predictors", options=possible_columns, default=possible_columns, ) selected_x_values = dataset.x_values_filtered_columns(selected_columns) st.sidebar.metric( label="# of Predictors Selected", value=selected_x_values.shape[1], delta=None, delta_color="normal", ) with st.expander("Predictors Dataframe (X)"): st.dataframe(selected_x_values) streamlit_2columns_metrics_df_shape(selected_x_values) # 40% of data used for training # 14321 as random seed for reproducability st.header("Split Testing and Training Data") test_size_slider_col, seed_col = st.columns(2) with test_size_slider_col: # Initialize test size dataset.test_size = st.slider( label="Test Size Percentage of Input Dataframe:", min_value=0, max_value=100, value=dataset.test_size, key="init_test_size", format="%f%%", ) with seed_col: dataset.random_state = int( st.number_input(label="Random State:", value=dataset.random_state) ) split_dataset = dataset.train_test_split(selected_x_values) # Series true_status = split_dataset.y_test.to_frame().value_counts() st.sidebar.metric( label="Testing Data # of Actual Default (=1)", value=true_status.get(1), ) st.sidebar.metric( label="Testing Data % of Actual Default", value="{:.0%}".format(true_status.get(1) / true_status.sum()), ) st.sidebar.metric( label="Testing Data # of Actual Non-Default (=0)", value=true_status.get(0), ) st.sidebar.metric( label="Testing Data % of Actual Non-Default", value="{:.0%}".format(true_status.get(0) / true_status.sum()), ) # Concat the testing sets X_y_test = split_dataset.X_y_test X_y_train = split_dataset.X_y_train with st.expander("Testing Dataframe (X and y)"): st.dataframe(X_y_test) streamlit_2columns_metrics_df_shape(X_y_test) streamlit_2columns_metrics_series( "# Defaults(=1) (Testing Data)", "# Non-Defaults(=0) (Testing Data)", true_status, ) streamlit_2columns_metrics_pct_series( "% Defaults (Testing Data)", "% Non-Defaults (Testing Data)", true_status, ) st.header("Training Data") with st.expander("Training Dataframe (X and y)"): st.dataframe(X_y_train) streamlit_2columns_metrics_df_shape(X_y_train) st.subheader("Class Count") streamlit_2columns_metrics_df( "# Defaults (Training Data Class Balance Check)", "# Non-Defaults (Training Data Class Balance Check)", split_dataset.y_train, ) streamlit_2columns_metrics_pct_df( "% Defaults (Training Data Class Balance Check)", "% Non-Defaults (Training Data Class Balance Check)", split_dataset.y_train, ) balance_the_classes = st.radio( label="Balance the Classes:", options=("Yes", "No") ) if balance_the_classes == "Yes": st.subheader("Balanced Classes (by Undersampling)") ( split_dataset.X_train, split_dataset.y_train, _X_y_train, class_balance_default, ) = undersample_training_data(X_y_train, "loan_status", split_dataset) streamlit_2columns_metrics_series( "# Defaults (Training Data with Class Balance)", "# Non-Defaults (Training Data with Class Balance)", class_balance_default, ) streamlit_2columns_metrics_pct_series( "% of Defaults (Training Data with Class Balance)", "% of Non-Defaults (Training Data with Class Balance)", class_balance_default, ) return dataset, split_dataset