from dataclasses import dataclass from typing import Union, cast import numpy as np import streamlit as st import plotly.express as px import pandas as pd from xgboost.sklearn import XGBClassifier from sklearn.linear_model import LogisticRegression from common.data import SplitDataset from common.util import ( model_probability_values_df, apply_threshold_to_probability_values, find_best_threshold_J_statistic, default_status_per_threshold, classification_report_per_threshold, thresh_classification_report_recall_accuracy, ) from common.views import ( streamlit_2columns_metrics_df, streamlit_2columns_metrics_pct_df, ) @dataclass(frozen=True) class Threshold: probability_threshold_selected: float predicted_default_status: pd.Series prediction_probability_df: pd.DataFrame def make_threshold_view( model_name_short: str, model_name: str, ): def view( clf_gbt_model: Union[XGBClassifier, LogisticRegression], split_dataset: SplitDataset, ) -> Threshold: st.subheader("Classification Probability Threshold - User Defined") st.write( f""" The {model_name} model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n Probabilities of defaulting of the loans are compared to a probability threshold.\n A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold. """ ) threshold_gbt_default = st.slider( label="Default Probability Threshold:", min_value=0.0, max_value=1.0, value=0.8, key=f"threshold_{model_name_short}_default", ) clf_prediction_prob_df_gbt = model_probability_values_df( clf_gbt_model, split_dataset.X_test, ) clf_thresh_predicted_default_status_user_gbt = ( apply_threshold_to_probability_values( clf_prediction_prob_df_gbt, threshold_gbt_default, ) ) streamlit_2columns_metrics_df( "# of Predicted Defaults", "# of Predicted Non-Default", clf_thresh_predicted_default_status_user_gbt, ) streamlit_2columns_metrics_pct_df( "% of Loans Predicted to Default", "% of Loans Predicted not to Default", clf_thresh_predicted_default_status_user_gbt, ) st.subheader("J Statistic Driven Classification Probability Threshold") J_statistic_best_threshold = find_best_threshold_J_statistic( split_dataset.y_test, clf_prediction_prob_df_gbt ) st.metric( label="Youden's J statistic calculated best threshold", value=J_statistic_best_threshold, ) clf_thresh_predicted_default_status_Jstatistic_gbt = ( apply_threshold_to_probability_values( clf_prediction_prob_df_gbt, J_statistic_best_threshold, ) ) streamlit_2columns_metrics_df( "# of Predicted Defaults", "# of Predicted Non-Default", clf_thresh_predicted_default_status_Jstatistic_gbt, ) streamlit_2columns_metrics_pct_df( "% of Loans Predicted to Default", "% of Loans Predicted not to Default", clf_thresh_predicted_default_status_Jstatistic_gbt, ) st.subheader( "Recall and Accuracy Tradeoff with given Probability Threshold" ) # Steps # Get list of thresholds # Get default status per threshold # Get classification report per threshold # Get recall, nondef recall, and accuracy per threshold threshold_list = np.arange(0, 1, 0.025).round(decimals=3).tolist() threshold_default_status_list = default_status_per_threshold( threshold_list, clf_prediction_prob_df_gbt["PROB_DEFAULT"] ) thresh_classification_report_dict = ( classification_report_per_threshold( threshold_list, threshold_default_status_list, split_dataset.y_test, ) ) ( thresh_def_recalls_list, thresh_nondef_recalls_list, thresh_accs_list, ) = thresh_classification_report_recall_accuracy( thresh_classification_report_dict ) namelist = [ "Default Recall", "Non Default Recall", "Accuracy", "Threshold", ] df = pd.DataFrame( [ thresh_def_recalls_list, thresh_nondef_recalls_list, thresh_accs_list, threshold_list, ], index=namelist, ) df = df.T fig2 = px.line( data_frame=df, y=["Default Recall", "Non Default Recall", "Accuracy"], x="Threshold", ) fig2.update_layout( title="Recall and Accuracy score Trade-off with Probability Threshold", xaxis_title="Probability Threshold", yaxis_title="Score", ) fig2.update_yaxes(range=[0.0, 1.0]) st.plotly_chart(fig2) st.subheader("Acceptance Rate Driven Probability Threshold") # Steps # Set acceptance rate # Get default status per threshold # Get classification report per threshold # Get recall, nondef recall, and accuracy per threshold acceptance_rate = ( st.slider( label="% of loans accepted (acceptance rate):", min_value=0, max_value=100, value=85, key=f"acceptance_rate_{model_name_short}", format="%f%%", ) / 100 ) acc_rate_thresh_gbt = np.quantile( clf_prediction_prob_df_gbt["PROB_DEFAULT"], acceptance_rate ) st.write( f"An acceptance rate of {acceptance_rate} results in probability threshold of {acc_rate_thresh_gbt}" ) figa = px.histogram(clf_prediction_prob_df_gbt["PROB_DEFAULT"]) figa.update_layout( title="Acceptance Rate Threshold vs. Loans Accepted", xaxis_title="Acceptance Rate Threshold", yaxis_title="Loans Accepted", ) figa.update_traces(marker_line_width=1, marker_line_color="white") figa.add_vline( x=acc_rate_thresh_gbt, line_width=3, line_dash="solid", line_color="red", ) st.plotly_chart(figa) clf_thresh_predicted_default_status_acceptance_gbt = ( apply_threshold_to_probability_values( clf_prediction_prob_df_gbt, acc_rate_thresh_gbt, ) ) st.write() st.subheader("Selected Probability Threshold") options = [ "User Defined", "J Statistic Driven", "Acceptance Rate Driven", ] prob_thresh_option = st.radio( label="Selected Probability Threshold", options=options, key=f"{model_name_short}_radio_thresh", ) if prob_thresh_option == "User Defined": prob_thresh_selected_gbt = threshold_gbt_default predicted_default_status_gbt = ( clf_thresh_predicted_default_status_user_gbt ) elif prob_thresh_option == "J Statistic Driven": prob_thresh_selected_gbt = J_statistic_best_threshold predicted_default_status_gbt = ( clf_thresh_predicted_default_status_Jstatistic_gbt ) else: prob_thresh_selected_gbt = acc_rate_thresh_gbt predicted_default_status_gbt = ( clf_thresh_predicted_default_status_acceptance_gbt ) st.write( f"Selected probability threshold is {prob_thresh_selected_gbt}" ) return Threshold( probability_threshold_selected=cast( float, prob_thresh_selected_gbt ), predicted_default_status=predicted_default_status_gbt, prediction_probability_df=clf_prediction_prob_df_gbt, ) return view decision_tree_threshold_view = make_threshold_view("gbt", "decision tree") logistic_threshold_view = make_threshold_view("lg", "logistic")