Spaces:
Build error
Build error
from dataclasses import dataclass | |
from typing import Union, cast | |
import numpy as np | |
import streamlit as st | |
import plotly.express as px | |
import pandas as pd | |
from xgboost.sklearn import XGBClassifier | |
from sklearn.linear_model import LogisticRegression | |
from common.data import SplitDataset | |
from common.util import ( | |
model_probability_values_df, | |
apply_threshold_to_probability_values, | |
find_best_threshold_J_statistic, | |
default_status_per_threshold, | |
classification_report_per_threshold, | |
thresh_classification_report_recall_accuracy, | |
) | |
from common.views import ( | |
streamlit_2columns_metrics_df, | |
streamlit_2columns_metrics_pct_df, | |
) | |
@dataclass
class Threshold:
    """Outcome of the threshold-selection view.

    Instances are built with keyword arguments by the view returned from
    ``make_threshold_view``; the ``@dataclass`` decorator generates the
    ``__init__`` that makes that construction possible.
    """

    # Probability threshold picked through the "Selected Probability
    # Threshold" radio button.
    probability_threshold_selected: float
    # Predicted default status obtained by applying the selected threshold
    # to the model's predicted probabilities.
    predicted_default_status: pd.Series
    # Predicted probabilities for the test set (indexed by the view with the
    # "PROB_DEFAULT" column).
    prediction_probability_df: pd.DataFrame
def make_threshold_view(
    model_name_short: str,
    model_name: str,
):
    """Create a Streamlit view for choosing a default-probability threshold.

    Args:
        model_name_short: Short model tag embedded in the key of every
            widget the view creates.
        model_name: Human-readable model name interpolated into the
            introductory text.

    Returns:
        A ``view(clf_gbt_model, split_dataset)`` callable that renders the
        threshold sections and returns the selected ``Threshold``.
    """

    def show_prediction_metrics(predicted_status) -> None:
        # The count and percentage summaries are always displayed as a pair.
        streamlit_2columns_metrics_df(
            "# of Predicted Defaults",
            "# of Predicted Non-Default",
            predicted_status,
        )
        streamlit_2columns_metrics_pct_df(
            "% of Loans Predicted to Default",
            "% of Loans Predicted not to Default",
            predicted_status,
        )

    def view(
        clf_gbt_model: Union[XGBClassifier, LogisticRegression],
        split_dataset: SplitDataset,
    ) -> Threshold:
        """Render the threshold sections and return the selected threshold.

        Three candidate thresholds are produced (user slider, J statistic,
        acceptance rate); a radio button decides which one is returned
        together with its predicted default status and the probability
        frame computed from ``split_dataset.X_test``.
        """
        # --- 1. Threshold typed in by the user -----------------------------
        st.subheader("Classification Probability Threshold - User Defined")
        intro_text = f"""
            The {model_name} model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n
            Probabilities of defaulting of the loans are compared to a probability threshold.\n
            A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
            """
        st.write(intro_text)

        user_threshold = st.slider(
            key=f"threshold_{model_name_short}_default",
            label="Default Probability Threshold:",
            min_value=0.0,
            max_value=1.0,
            value=0.8,
        )

        # Predicted probabilities on the test features; reused by every
        # section below and handed back to the caller.
        probability_df = model_probability_values_df(
            clf_gbt_model, split_dataset.X_test
        )

        user_status = apply_threshold_to_probability_values(
            probability_df, user_threshold
        )
        show_prediction_metrics(user_status)

        # --- 2. Threshold from Youden's J statistic ------------------------
        st.subheader("J Statistic Driven Classification Probability Threshold")
        j_threshold = find_best_threshold_J_statistic(
            split_dataset.y_test, probability_df
        )
        st.metric(
            label="Youden's J statistic calculated best threshold",
            value=j_threshold,
        )
        j_status = apply_threshold_to_probability_values(
            probability_df, j_threshold
        )
        show_prediction_metrics(j_status)

        # --- 3. Recall / accuracy trade-off chart --------------------------
        st.subheader(
            "Recall and Accuracy Tradeoff with given Probability Threshold"
        )
        # Candidate thresholds 0.0, 0.025, ... (1.0 excluded); for each one
        # derive the default status, then the classification report, then
        # the recall / accuracy scores that get plotted.
        candidate_thresholds = (
            np.arange(0, 1, 0.025).round(decimals=3).tolist()
        )
        status_per_threshold = default_status_per_threshold(
            candidate_thresholds, probability_df["PROB_DEFAULT"]
        )
        reports_by_threshold = classification_report_per_threshold(
            candidate_thresholds,
            status_per_threshold,
            split_dataset.y_test,
        )
        default_recalls, nondefault_recalls, accuracies = (
            thresh_classification_report_recall_accuracy(reports_by_threshold)
        )

        score_columns = ["Default Recall", "Non Default Recall", "Accuracy"]
        # Built row-wise and transposed so each score becomes a column.
        tradeoff_df = pd.DataFrame(
            [
                default_recalls,
                nondefault_recalls,
                accuracies,
                candidate_thresholds,
            ],
            index=score_columns + ["Threshold"],
        ).transpose()

        tradeoff_fig = px.line(
            data_frame=tradeoff_df,
            x="Threshold",
            y=score_columns,
        )
        tradeoff_fig.update_layout(
            title="Recall and Accuracy score Trade-off with Probability Threshold",
            xaxis_title="Probability Threshold",
            yaxis_title="Score",
        )
        tradeoff_fig.update_yaxes(range=[0.0, 1.0])
        st.plotly_chart(tradeoff_fig)

        # --- 4. Threshold from a target acceptance rate --------------------
        st.subheader("Acceptance Rate Driven Probability Threshold")
        acceptance_pct = st.slider(
            key=f"acceptance_rate_{model_name_short}",
            label="% of loans accepted (acceptance rate):",
            min_value=0,
            max_value=100,
            value=85,
            format="%f%%",
        )
        # The slider works in whole percent; np.quantile wants a fraction.
        acceptance_rate = acceptance_pct / 100
        acceptance_threshold = np.quantile(
            probability_df["PROB_DEFAULT"], acceptance_rate
        )
        st.write(
            f"An acceptance rate of {acceptance_rate} results in probability threshold of {acceptance_threshold}"
        )

        # Histogram of the predicted probabilities with the acceptance-rate
        # threshold marked by a red vertical line.
        histogram = px.histogram(probability_df["PROB_DEFAULT"])
        histogram.update_layout(
            title="Acceptance Rate Threshold vs. Loans Accepted",
            xaxis_title="Acceptance Rate Threshold",
            yaxis_title="Loans Accepted",
        )
        histogram.update_traces(
            marker_line_width=1, marker_line_color="white"
        )
        histogram.add_vline(
            x=acceptance_threshold,
            line_width=3,
            line_dash="solid",
            line_color="red",
        )
        st.plotly_chart(histogram)

        acceptance_status = apply_threshold_to_probability_values(
            probability_df, acceptance_threshold
        )
        st.write()

        # --- 5. Final choice among the three candidates --------------------
        st.subheader("Selected Probability Threshold")
        selected_option = st.radio(
            label="Selected Probability Threshold",
            options=[
                "User Defined",
                "J Statistic Driven",
                "Acceptance Rate Driven",
            ],
            key=f"{model_name_short}_radio_thresh",
        )
        named_choices = {
            "User Defined": (user_threshold, user_status),
            "J Statistic Driven": (j_threshold, j_status),
        }
        # Any other selection falls through to the acceptance-rate result.
        selected_threshold, selected_status = named_choices.get(
            selected_option, (acceptance_threshold, acceptance_status)
        )
        st.write(f"Selected probability threshold is {selected_threshold}")

        return Threshold(
            probability_threshold_selected=cast(float, selected_threshold),
            predicted_default_status=selected_status,
            prediction_probability_df=probability_df,
        )

    return view
# Ready-made threshold views, one per model; the short name is what
# make_threshold_view embeds in the Streamlit widget keys.
decision_tree_threshold_view = make_threshold_view(
    model_name_short="gbt",
    model_name="decision tree",
)
logistic_threshold_view = make_threshold_view(
    model_name_short="lg",
    model_name="logistic",
)