pkiage committed
Commit 232e5e5
0 parents

Initial commit
.gitignore ADDED
@@ -0,0 +1,7 @@
+ *.png
+ *.pyc
+ .env
+ .envrc
+ *.ipynb
+ __pycache__
+ .vs
README.md ADDED
@@ -0,0 +1,30 @@
+ # Credit Risk Modelling
+
+ # About
+
+ An interactive tool demonstrating credit risk modelling.
+
+ ## Built With
+
+ - [Streamlit](https://streamlit.io/)
+
+ # References
+
+ ## Inspiration
+
+ [Credit Risk Modeling in Python by Datacamp](https://www.datacamp.com/courses/credit-risk-modeling-in-python)
+
+ - General Methodology
+ - Data
+
+ [A Gentle Introduction to Threshold-Moving for Imbalanced Classification](https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/)
+
+ - Selecting the optimal threshold using Youden's J statistic (sketched below)
+
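As background for the threshold-moving reference above: Youden's J statistic scores each candidate threshold on the ROC curve as J = TPR − FPR, and the threshold with the largest J is chosen. A minimal sketch of the selection (the same idea as `find_best_threshold_J_statistic` in `common/util.py`; `y_test` and `prob_default` are placeholder names for the true labels and predicted default probabilities):

```python
import numpy as np
from sklearn.metrics import roc_curve

def best_threshold_youden_j(y_test, prob_default):
    # Evaluate every candidate threshold on the ROC curve
    fpr, tpr, thresholds = roc_curve(y_test, prob_default)
    J = tpr - fpr  # Youden's J statistic per threshold
    return thresholds[np.argmax(J)]
```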
+ ## Political, Economic, Social, Technological, Legal and Environmental (PESTLE)
+
+ [Europe fit for the Digital Age: Commission proposes new rules and actions for excellence and trust in Artificial Intelligence](https://ec.europa.eu/commission/presscorner/detail/en/ip_21_1682)
+
+ [LAYING DOWN HARMONISED RULES ON ARTIFICIAL INTELLIGENCE (ARTIFICIAL INTELLIGENCE ACT) AND AMENDING CERTAIN UNION LEGISLATIVE ACTS](https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:52021PC0206&from=EN)
+
+ "(37) Another area in which the use of AI systems deserves special consideration is the access to and enjoyment of certain essential private and public services and benefits necessary for people to fully participate in society or to improve one’s standard of living. In particular, AI systems used to evaluate the credit score or creditworthiness of natural persons should be classified as high-risk AI systems, since they determine those persons’ access to financial resources or essential services such as housing, electricity, and telecommunication services. AI systems used for this purpose may lead to discrimination of persons or groups and perpetuate historical patterns of discrimination, for example based on racial or ethnic origins, disabilities, age, sexual orientation, or create new forms of discriminatory impacts. Considering the very limited scale of the impact and the available alternatives on the market, it is appropriate to exempt AI systems for the purpose of creditworthiness assessment and credit scoring when put into service by small-scale providers for their own use. Natural persons applying for or receiving public assistance benefits and services from public authorities are typically dependent on those benefits and services and in a vulnerable position in relation to the responsible authorities. If AI systems are used for determining whether such benefits and services should be denied, reduced, revoked or reclaimed by authorities, they may have a significant impact on persons’ livelihood and may infringe their fundamental rights, such as the right to social protection, non-discrimination, human dignity or an effective remedy. Those systems should therefore be classified as high-risk. Nonetheless, this Regulation should not hamper the development and use of innovative approaches in the public administration, which would stand to benefit from a wider use of compliant and safe AI systems, provided that those systems do not entail a high risk to legal and natural persons."
app.py ADDED
@@ -0,0 +1,52 @@
+ from typing import OrderedDict
+ import streamlit as st
+ from data_setup import initialise_data
+ from views.decision_tree import decisiontree_view
+ from views.logistic import logistic_view
+ from views.model_comparison import model_comparison_view
+ from views.strategy_table import strategy_table_view
+
+
+ def main():
+     currency_options = ["USD", "KES", "GBP"]
+
+     currency = st.sidebar.selectbox(
+         label="What currency will you be using?", options=currency_options
+     )
+
+     st.title("GUI for Credit Risk Modelling")
+
+     st.title("Data")
+
+     (_dataset, split_dataset) = initialise_data()
+
+     st.title("Modelling")
+
+     model_options = ["Logistic Regression", "Decision Trees"]
+
+     # Multiselect returns the selected model names as a list
+     models_selected_list = st.sidebar.multiselect(
+         label="Select model", options=model_options, default=model_options
+     )
+
+     models_selected_set = set(models_selected_list)
+     model_views = OrderedDict()
+
+     if "Logistic Regression" in models_selected_set:
+         logistic_model_view = logistic_view(split_dataset, currency)
+         model_views["Logistic Regression"] = logistic_model_view
+
+     if "Decision Trees" in models_selected_set:
+         decision_tree_model_view = decisiontree_view(split_dataset, currency)
+         model_views["Decision Trees"] = decision_tree_model_view
+
+     if models_selected_list:
+         model_comparison_view(
+             split_dataset,
+             model_views,
+         )
+         strategy_table_view(currency, model_views)
+
+
+ if __name__ == "__main__":
+     main()
common/__init__.py ADDED
File without changes
common/data.py ADDED
@@ -0,0 +1,94 @@
+ from typing import List, Union, cast
+ from dataclasses import dataclass
+ from sklearn.model_selection import train_test_split
+ import pandas as pd
+
+ from common.util import drop_columns
+
+
+ @dataclass
+ class SplitDataset:
+     X_test: pd.DataFrame
+     X_train: pd.DataFrame
+     y_test: pd.Series
+     y_train: pd.Series
+
+     @property
+     def X_y_test(self) -> pd.DataFrame:
+         return pd.concat(
+             cast(
+                 List[Union[pd.DataFrame, pd.Series]],
+                 [
+                     self.X_test.reset_index(drop=True),
+                     self.y_test.reset_index(drop=True),
+                 ],
+             ),
+             axis=1,
+         )
+
+     @property
+     def X_y_train(self) -> pd.DataFrame:
+         return pd.concat(
+             cast(
+                 List[Union[pd.DataFrame, pd.Series]],
+                 [
+                     self.X_train.reset_index(drop=True),
+                     self.y_train.reset_index(drop=True),
+                 ],
+             ),
+             axis=1,
+         )
+
+
+ @dataclass
+ class Dataset:
+     df: pd.DataFrame
+     random_state: int
+     test_size: int  # stored as a percentage (0-100)
+
+     @property
+     def y_value(self) -> pd.Series:
+         return self.df["loan_status"]
+
+     @property
+     def x_values(self) -> pd.DataFrame:
+         return cast(
+             pd.DataFrame,
+             drop_columns(
+                 self.df,
+                 [
+                     "loan_status",
+                     "loan_grade_A",
+                     "loan_grade_B",
+                     "loan_grade_C",
+                     "loan_grade_D",
+                     "loan_grade_E",
+                     "loan_grade_F",
+                     "loan_grade_G",
+                 ],
+             ),
+         )
+
+     @property
+     def x_values_column_names(self):
+         return self.x_values.columns.tolist()
+
+     def x_values_filtered_columns(self, columns: List[str]) -> pd.DataFrame:
+         return self.df.filter(columns)
+
+     def train_test_split(
+         self, selected_x_values: pd.DataFrame
+     ) -> SplitDataset:
+         X_train, X_test, y_train, y_test = train_test_split(
+             selected_x_values,
+             self.y_value,
+             test_size=self.test_size / 100,  # test_size is stored as a percentage
+             random_state=self.random_state,
+         )
+
+         return SplitDataset(
+             X_train=cast(pd.DataFrame, X_train),
+             X_test=cast(pd.DataFrame, X_test),
+             y_train=cast(pd.Series, y_train),
+             y_test=cast(pd.Series, y_test),
+         )
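A brief usage sketch of the two dataclasses above (the CSV path, random state, and test-size percentage are the values `data_setup.py` uses; `loan_status` is the label column):

```python
import pandas as pd
from common.data import Dataset

df = pd.read_csv("./data/processed/cr_loan_w2.csv")
dataset = Dataset(df=df, random_state=123235, test_size=40)  # 40 = percent held out

# Select all predictors, then split; SplitDataset keeps X/y train/test together
split = dataset.train_test_split(dataset.x_values)
print(split.X_train.shape, split.X_test.shape)
print(split.X_y_train.head())  # features and label re-joined for convenience
```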
common/util.py ADDED
@@ -0,0 +1,391 @@
+ # DATA MANIPULATION & ANALYSIS
+
+ import pickle
+ import streamlit as st
+
+ # Arrays
+ import numpy as np
+
+ # DataFrames and Series
+ import pandas as pd
+
+ # Returns the indices of the maximum values along an axis
+ from numpy import argmax
+
+ # MODELLING
+
+ # Logistic regression
+ from sklearn.linear_model import LogisticRegression
+
+ from sklearn.model_selection import StratifiedKFold
+
+ # XGBoosted Decision Trees
+ import xgboost as xgb
+
+
+ # REPORTING, EVALUATION, AND INTERPRETATION
+
+ # Classification report
+ from sklearn.metrics import classification_report
+
+ # Receiver Operating Characteristic (ROC) curve
+ from sklearn.metrics import roc_curve
+
+
+ # Evaluate a score by cross-validation
+ from sklearn.model_selection import cross_val_score
+
+
+ # Functions
+
+
+ def drop_columns(df, columns):
+     return df.drop(columns, axis=1)
+
+
+ def remove_less_than_0_columns(df, column):
+     # Drop missing values, then keep only rows where the column is non-negative
+     df = df[df[column].notna()]
+     return df.loc[df[column] >= 0]
+
+
+ def boolean_int_condition_label(df, label_column_name, condition):
+     df[label_column_name] = condition
+     y = df[label_column_name].astype(int)
+     df = drop_columns(df, label_column_name)
+     return y, df
+
+
+ @st.cache(suppress_st_warning=True)
+ def undersample_training_data(
+     df: pd.DataFrame, column_name: str, split_dataset
+ ):
+     count_nondefault, count_default = split_dataset.X_y_train[
+         column_name
+     ].value_counts()
+
+     nondefaults = df[df[column_name] == 0]
+
+     defaults = df[df[column_name] == 1]
+
+     # Sample both classes down to the size of the minority class
+     under_sample = min(count_nondefault, count_default)
+
+     nondefaults_under = nondefaults.sample(under_sample)
+
+     defaults_under = defaults.sample(under_sample)
+
+     X_y_train_under = pd.concat(
+         [
+             nondefaults_under.reset_index(drop=True),
+             defaults_under.reset_index(drop=True),
+         ],
+         axis=0,
+     )
+
+     X_train_under = X_y_train_under.drop([column_name], axis=1)  # remove label
+
+     y_train_under = X_y_train_under[column_name]  # label only
+
+     class_balance_default = X_y_train_under[column_name].value_counts()
+
+     return [
+         X_train_under,
+         y_train_under,
+         X_y_train_under,
+         class_balance_default,
+     ]
+
+
+ def create_coefficient_feature_dictionary_logistic_model(
+     logistic_model, training_data
+ ):
+     return {
+         feat: coef
+         for coef, feat in zip(
+             logistic_model.coef_[0, :], training_data.columns
+         )
+     }
+
+
+ @st.cache(suppress_st_warning=True)
+ def test_variables_logistic(X_train, y_train):
+     # Create and fit the logistic regression model
+     return LogisticRegression(solver="lbfgs").fit(X_train, np.ravel(y_train))
+
+
+ @st.cache(suppress_st_warning=True)
+ def print_coeff_logistic(clf_logistic_model, split_dataset):
+     # Dictionary of features and their coefficients
+     return create_coefficient_feature_dictionary_logistic_model(
+         clf_logistic_model, split_dataset.X_train
+     )
+
+
+ @st.cache(suppress_st_warning=True, hash_funcs={
+     xgb.XGBClassifier: pickle.dumps
+ })
+ def test_variables_gbt(X_train, y_train):
+     # Using hyperparameters learning_rate and max_depth
+     return xgb.XGBClassifier(
+         learning_rate=0.1,
+         max_depth=7,
+         use_label_encoder=False,
+         eval_metric="logloss",
+     ).fit(X_train, np.ravel(y_train), eval_metric="logloss")
+
+
+ # Builds a dataframe with the true status, probability of default,
+ # thresholded default status, and loan amount for each test loan
+ def get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
+     model, X, y, threshold, loan_amount_col_name
+ ):
+     true_status = y.to_frame()
+
+     loan_amount = X[loan_amount_col_name]
+
+     clf_prediction_prob = model.predict_proba(np.ascontiguousarray(X))
+
+     clf_prediction_prob_df = pd.DataFrame(
+         clf_prediction_prob[:, 1], columns=["PROB_DEFAULT"]
+     )
+
+     clf_thresh_predicted_default_status = (
+         clf_prediction_prob_df["PROB_DEFAULT"]
+         .apply(lambda x: 1 if x > threshold else 0)
+         .rename("PREDICT_DEFAULT_STATUS")
+     )
+
+     return pd.concat(
+         [
+             true_status.reset_index(drop=True),
+             clf_prediction_prob_df.reset_index(drop=True),
+             clf_thresh_predicted_default_status.reset_index(drop=True),
+             loan_amount.reset_index(drop=True),
+         ],
+         axis=1,
+     )
+
+
+ # Builds a dataframe with the probability of default for each loan
+ def model_probability_values_df(model, X):
+     return pd.DataFrame(model.predict_proba(X)[:, 1], columns=["PROB_DEFAULT"])
+
+
+ def apply_threshold_to_probability_values(probability_values, threshold):
+     return (
+         probability_values["PROB_DEFAULT"]
+         .apply(lambda x: 1 if x > threshold else 0)
+         .rename("PREDICT_DEFAULT_STATUS")
+     )
+
+
+ @st.cache(suppress_st_warning=True)
+ def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
+     fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
+     # Youden's J statistic (tpr - fpr); argmax gives the index of the
+     # best threshold
+     J = tpr - fpr
+     ix = argmax(J)
+     return thresholds[ix]
+
+
+ def create_cross_validation_df(
+     X, y, eval_metric, seed, trees, n_folds, early_stopping_rounds
+ ):
+     # Test data x and y
+     DTrain = xgb.DMatrix(X, label=y)
+
+     # auc or logloss
+     params = {
+         "eval_metric": eval_metric,
+         "objective": "binary:logistic",  # predict 0 or 1 for loan status
+         "seed": seed,
+     }
+
+     # Create the data frame of cross validations
+     cv_df = xgb.cv(
+         params,
+         DTrain,
+         num_boost_round=trees,
+         nfold=n_folds,
+         early_stopping_rounds=early_stopping_rounds,
+         shuffle=True,
+     )
+
+     return [DTrain, cv_df]
+
+
+ def cross_validation_scores(model, X, y, nfold, score, seed):
+     # Return cross-validation scores for the chosen metric
+     return cross_val_score(
+         model,
+         np.ascontiguousarray(X),
+         np.ravel(np.ascontiguousarray(y)),
+         cv=StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed),
+         scoring=score,
+     )
+
+
+ def default_status_per_threshold(threshold_list, prob_default):
+     threshold_default_status_list = []
+     for threshold in threshold_list:
+         threshold_default_status = prob_default.apply(
+             lambda x: 1 if x > threshold else 0
+         )
+         threshold_default_status_list.append(threshold_default_status)
+     return threshold_default_status_list
+
+
+ def classification_report_per_threshold(
+     threshold_list, threshold_default_status_list, y_test
+ ):
+     target_names = ["Non-Default", "Default"]
+     classification_report_list = []
+     for threshold_default_status in threshold_default_status_list:
+         thresh_classification_report = classification_report(
+             y_test,
+             threshold_default_status,
+             target_names=target_names,
+             output_dict=True,
+             zero_division=0,
+         )
+         classification_report_list.append(thresh_classification_report)
+     # Return a dict mapping each threshold to its classification report
+     return dict(zip(threshold_list, classification_report_list))
+
+
+ def thresh_classification_report_recall_accuracy(
+     thresh_classification_report_dict,
+ ):
+     thresh_def_recalls_list = []
+     thresh_nondef_recalls_list = []
+     thresh_accs_list = []
+     for x in [*thresh_classification_report_dict]:
+         thresh_def_recall = thresh_classification_report_dict[x]["Default"][
+             "recall"
+         ]
+         thresh_def_recalls_list.append(thresh_def_recall)
+         thresh_nondef_recall = thresh_classification_report_dict[x][
+             "Non-Default"
+         ]["recall"]
+         thresh_nondef_recalls_list.append(thresh_nondef_recall)
+         thresh_accs = thresh_classification_report_dict[x]["accuracy"]
+         thresh_accs_list.append(thresh_accs)
+     return [
+         thresh_def_recalls_list,
+         thresh_nondef_recalls_list,
+         thresh_accs_list,
+     ]
+
+
+ def create_accept_rate_list(start, end, samples):
+     return np.linspace(start, end, samples, endpoint=True)
+
+
+ def create_strategyTable_df(
+     start, end, samples, actual_probability_predicted_acc_rate, true, currency
+ ):
+     accept_rates = create_accept_rate_list(start, end, samples)
+     thresholds_strat = []
+     bad_rates_start = []
+     # Average loan amount, used to estimate the value of accepted loans
+     Avg_Loan_Amnt = actual_probability_predicted_acc_rate["loan_amnt"].mean()
+     num_accepted_loans_start = []
+
+     for rate in accept_rates:
+         # Calculate the threshold for the acceptance rate
+         thresh = np.quantile(
+             actual_probability_predicted_acc_rate["PROB_DEFAULT"], rate
+         ).round(3)
+         # Add the threshold value to the list of thresholds
+         thresholds_strat.append(thresh)
+
+         # Reassign the loan_status value using the threshold
+         actual_probability_predicted_acc_rate[
+             "PREDICT_DEFAULT_STATUS"
+         ] = actual_probability_predicted_acc_rate["PROB_DEFAULT"].apply(
+             lambda x: 1 if x > thresh else 0
+         )
+
+         # Create a set of accepted loans using this acceptance rate
+         accepted_loans = actual_probability_predicted_acc_rate[
+             actual_probability_predicted_acc_rate["PREDICT_DEFAULT_STATUS"]
+             == 0
+         ]
+         # Calculate and append the bad rate using the acceptance rate
+         bad_rates_start.append(accepted_loans[true].mean().round(3))
+         # Accepted loans
+         num_accepted_loans_start.append(len(accepted_loans))
+
+     # Calculate estimated value
+     money_accepted_loans = [
+         accepted_loans * Avg_Loan_Amnt
+         for accepted_loans in num_accepted_loans_start
+     ]
+
+     # Bad loans are weighted at twice their value
+     money_bad_accepted_loans = [
+         2 * money_accepted_loan * bad_rate
+         for money_accepted_loan, bad_rate in zip(
+             money_accepted_loans, bad_rates_start
+         )
+     ]
+
+     zip_object = zip(money_accepted_loans, money_bad_accepted_loans)
+     estimated_value = [
+         money_accepted_loan - money_bad_accepted_loan
+         for money_accepted_loan, money_bad_accepted_loan in zip_object
+     ]
+
+     accept_rates = ["{:.2f}".format(elem) for elem in accept_rates]
+
+     thresholds_strat = ["{:.2f}".format(elem) for elem in thresholds_strat]
+
+     bad_rates_start = ["{:.2f}".format(elem) for elem in bad_rates_start]
+
+     estimated_value = ["{:.2f}".format(elem) for elem in estimated_value]
+
+     return (
+         pd.DataFrame(
+             zip(
+                 accept_rates,
+                 thresholds_strat,
+                 bad_rates_start,
+                 num_accepted_loans_start,
+                 estimated_value,
+             ),
+             columns=[
+                 "Acceptance Rate",
+                 "Threshold",
+                 "Bad Rate",
+                 "Num Accepted Loans",
+                 f"Estimated Value ({currency})",
+             ],
+         )
+         .sort_values(by="Acceptance Rate", axis=0, ascending=False)
+         .reset_index(drop=True)
+     )
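The estimated-value arithmetic in `create_strategyTable_df` values each accepted loan at the average loan amount and weights bad loans at twice their value (the factor of 2 is hard-coded above). A worked example with illustrative numbers:

```python
# Illustrative numbers only, not from the dataset
num_accepted = 100
avg_loan_amount = 10_000.0
bad_rate = 0.08

money_accepted = num_accepted * avg_loan_amount  # 1,000,000.00
money_bad = 2 * money_accepted * bad_rate        # 160,000.00
estimated_value = money_accepted - money_bad     # 840,000.00
```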
common/views.py ADDED
@@ -0,0 +1,361 @@
+ from typing import OrderedDict
+ import streamlit as st
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import xgboost as xgb
+ from sklearn.metrics import (
+     roc_curve,
+ )
+ from sklearn.calibration import calibration_curve
+ from xgboost import plot_tree
+ from views.typing import ModelView
+
+
+ def plot_logistic_coeff_barh(coef_dict, x, y):
+     fig = plt.figure(figsize=(x, y))
+     coef_dict_sorted = dict(
+         sorted(coef_dict.items(), key=lambda item: item[1], reverse=False)
+     )
+     plt.barh(*zip(*coef_dict_sorted.items()))
+     return fig
+
+
+ def print_negative_coefficients_logistic_model(coef_dict):
+     # Coefficients equal to or less than 0
+     NegativeCoefficients = dict(
+         filter(lambda x: x[1] <= 0.0, coef_dict.items())
+     )
+
+     NegativeCoefficientsSorted = sorted(
+         NegativeCoefficients.items(), key=lambda x: x[1], reverse=False
+     )
+     text = (
+         "\n\nFeatures the model found to be negatively correlated with probability of default are:"
+         "\n{negative_features}"
+     )
+     st.markdown(text.format(negative_features=NegativeCoefficientsSorted))
+
+
+ def print_positive_coefficients_logistic_model(coef_dict):
+     # Coefficients equal to or greater than 0
+     PositiveCoefficients = dict(
+         filter(lambda x: x[1] >= 0.0, coef_dict.items())
+     )
+
+     PositiveCoefficientsSorted = sorted(
+         PositiveCoefficients.items(), key=lambda x: x[1], reverse=True
+     )
+     text = (
+         "\n\nFeatures the model found to be positively correlated with probability of default are:"
+         "\n{positive_features}"
+     )
+     st.markdown(text.format(positive_features=PositiveCoefficientsSorted))
+
+
+ def plot_importance_gbt(clf_gbt_model, barxsize, barysize):
+     axobject1 = xgb.plot_importance(clf_gbt_model, importance_type="weight")
+     fig1 = axobject1.figure
+     st.write("Feature Importance Plot (Gradient Boosted Tree)")
+     fig1.set_size_inches(barxsize, barysize)
+     return fig1
+
+
+ def download_importance_gbt(fig1, barxsize, barysize):
+     if st.button(
+         "Download Feature Importance Plot as png (Gradient Boosted Tree)"
+     ):
+         dpisize = max(barxsize, barysize)
+         plt.savefig("bar.png", dpi=dpisize * 96, bbox_inches="tight")
+         fig1.set_size_inches(barxsize, barysize)
+
+
+ def plot_tree_gbt(treexsize, treeysize, clf_gbt_model):
+     plot_tree(clf_gbt_model)
+     fig2 = plt.gcf()
+     fig2.set_size_inches(treexsize, treeysize)
+     return fig2
+
+
+ def download_tree_gbt(treexsize, treeysize):
+     if st.button("Download Decision Tree Plot as png (Gradient Boosted Tree)"):
+         dpisize = max(treexsize, treeysize)
+         plt.savefig("tree.png", dpi=dpisize * 96, bbox_inches="tight")
+
+
+ def cross_validation_graph(cv, eval_metric, trees):
+     # Plot the test metric scores for each iteration
+     fig = plt.figure()
+     plt.plot(cv[cv.columns[2]])
+     plt.title(
+         "Test {eval_metric} Score Over {it_numbr} Iterations".format(
+             eval_metric=eval_metric, it_numbr=trees
+         )
+     )
+     plt.xlabel("Iteration Number")
+     plt.ylabel("Test {eval_metric} Score".format(eval_metric=eval_metric))
+     return fig
+
+
+ def recall_accuracy_threshold_tradeoff_fig(
+     widthsize,
+     heightsize,
+     threshold_list,
+     thresh_def_recalls_list,
+     thresh_nondef_recalls_list,
+     thresh_accs_list,
+ ):
+     fig = plt.figure(figsize=(widthsize, heightsize))
+     plt.plot(threshold_list, thresh_def_recalls_list, label="Default Recall")
+     plt.plot(
+         threshold_list, thresh_nondef_recalls_list, label="Non-Default Recall"
+     )
+     plt.plot(threshold_list, thresh_accs_list, label="Model Accuracy")
+     plt.xlabel("Probability Threshold")
+     plt.ylabel("Score")
+     plt.xlim(0, 1)
+     plt.ylim(0, 1)
+     plt.legend()
+     plt.title("Recall and Accuracy Score Tradeoff with Probability Threshold")
+     plt.grid(False)
+     return fig
+
+
+ def roc_auc_compare_n_models(y, model_views: OrderedDict[str, ModelView]):
+     colors = ["blue", "green"]
+     fig = plt.figure()
+     for color_idx, (model_name, model_view) in enumerate(model_views.items()):
+         fpr, tpr, _thresholds = roc_curve(
+             y, model_view.prediction_probability_df
+         )
+         plt.plot(fpr, tpr, color=colors[color_idx], label=f"{model_name}")
+     plt.plot([0, 1], [0, 1], linestyle="--", label="Random Prediction")
+     model_names = list(model_views.keys())
+     if not model_names:
+         model_name_str = "None"
+     elif len(model_names) == 1:
+         model_name_str = model_names[0]
+     else:
+         model_name_str = " and ".join(
+             [", ".join(model_names[:-1]), model_names[-1]]
+         )
+     plt.title(f"ROC Chart for {model_name_str} on the Probability of Default")
+     plt.xlabel("False Positive Rate (FP Rate)")
+     plt.ylabel("True Positive Rate (TP Rate)")
+     plt.legend()
+     plt.grid(False)
+     plt.xlim(0, 1)
+     plt.ylim(0, 1)
+     return fig
+
+
+ def calibration_curve_report_commented_n(
+     y, model_views: OrderedDict[str, ModelView], bins: int
+ ):
+     fig = plt.figure()
+     for model_name, model_view in model_views.items():
+         frac_of_pos, mean_pred_val = calibration_curve(
+             y,
+             model_view.prediction_probability_df,
+             n_bins=bins,
+             normalize=True,
+         )
+         plt.plot(mean_pred_val, frac_of_pos, "s-", label=f"{model_name}")
+
+     # Add the perfectly calibrated guideline to the plot
+     plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
+
+     plt.ylabel("Fraction of positives")
+     plt.xlabel("Average Predicted Probability")
+     plt.title("Calibration Curve")
+     plt.legend()
+     plt.grid(False)
+     plt.xlim(0, 1)
+     plt.ylim(0, 1)
+     return fig
+
+
+ def acceptance_rate_threshold_fig(probability_default, acceptancerate, bins):
+     # Probability distribution
+     probability_stat_distribution = probability_default.describe()
+
+     # Acceptance rate threshold
+     acc_rate_thresh = np.quantile(probability_default, acceptancerate)
+     fig = plt.figure()
+
+     plt.hist(
+         probability_default,
+         color="blue",
+         bins=bins,
+         histtype="bar",
+         ec="white",
+     )
+
+     # Add a reference line to the plot for the threshold
+     plt.axvline(x=acc_rate_thresh, color="red")
+     plt.title("Acceptance Rate Threshold")
+
+     return (
+         fig,
+         probability_stat_distribution,
+         acc_rate_thresh,
+     )
+
+
+ def streamlit_2columns_metrics_pct_df(
+     column1name_label: str,
+     column2name_label: str,
+     df: pd.DataFrame,
+ ):
+     (
+         column1name,
+         column2name,
+     ) = st.columns(2)
+
+     with column1name:
+         st.metric(
+             label=column1name_label,
+             value="{:.0%}".format(df.value_counts().get(1) / df.shape[0]),
+             delta=None,
+             delta_color="normal",
+         )
+
+     with column2name:
+         st.metric(
+             label=column2name_label,
+             value="{:.0%}".format(df.value_counts().get(0) / df.shape[0]),
+             delta=None,
+             delta_color="normal",
+         )
+
+
+ def streamlit_2columns_metrics_df(
+     column1name_label: str,
+     column2name_label: str,
+     df: pd.DataFrame,
+ ):
+     (
+         column1name,
+         column2name,
+     ) = st.columns(2)
+
+     with column1name:
+         st.metric(
+             label=column1name_label,
+             value=df.value_counts().get(1),
+             delta=None,
+             delta_color="normal",
+         )
+
+     with column2name:
+         st.metric(
+             label=column2name_label,
+             value=df.value_counts().get(0),
+             delta=None,
+             delta_color="normal",
+         )
+
+
+ def streamlit_2columns_metrics_df_shape(df: pd.DataFrame):
+     (
+         column1name,
+         column2name,
+     ) = st.columns(2)
+
+     with column1name:
+         st.metric(
+             label="Rows",
+             value=df.shape[0],
+             delta=None,
+             delta_color="normal",
+         )
+
+     with column2name:
+         st.metric(
+             label="Columns",
+             value=df.shape[1],
+             delta=None,
+             delta_color="normal",
+         )
+
+
+ def streamlit_2columns_metrics_pct_series(
+     column1name_label: str,
+     column2name_label: str,
+     series: pd.Series,
+ ):
+     (
+         column1name,
+         column2name,
+     ) = st.columns(2)
+     with column1name:
+         st.metric(
+             label=column1name_label,
+             value="{:.0%}".format(series.get(1) / series.sum()),
+             delta=None,
+             delta_color="normal",
+         )
+
+     with column2name:
+         st.metric(
+             label=column2name_label,
+             value="{:.0%}".format(series.get(0) / series.sum()),
+             delta=None,
+             delta_color="normal",
+         )
+
+
+ def streamlit_2columns_metrics_series(
+     column1name_label: str,
+     column2name_label: str,
+     series: pd.Series,
+ ):
+     (
+         column1name,
+         column2name,
+     ) = st.columns(2)
+     with column1name:
+         st.metric(
+             label=column1name_label,
+             value=series.get(1),
+             delta=None,
+             delta_color="normal",
+         )
+
+     with column2name:
+         st.metric(
+             label=column2name_label,
+             value=series.get(0),
+             delta=None,
+             delta_color="normal",
+         )
+
+
+ def streamlit_chart_setting_height_width(
+     title: str,
+     default_widthvalue: int,
+     default_heightvalue: int,
+     widthkey: str,
+     heightkey: str,
+ ):
+     with st.expander(title):
+         lbarx_col, lbary_col = st.columns(2)
+
+         with lbarx_col:
+             width_size = st.number_input(
+                 label="Width in inches:",
+                 value=default_widthvalue,
+                 key=widthkey,
+             )
+
+         with lbary_col:
+             height_size = st.number_input(
+                 label="Height in inches:",
+                 value=default_heightvalue,
+                 key=heightkey,
+             )
+     return width_size, height_size
data/processed/cr_loan_w2.csv ADDED
The diff for this file is too large to render. See raw diff
 
data_setup.py ADDED
@@ -0,0 +1,180 @@
+ from typing import Tuple, cast
+
+ import pandas as pd
+ import streamlit as st
+
+ from common.data import Dataset, SplitDataset
+ from common.util import (
+     undersample_training_data,
+ )
+ from common.views import (
+     streamlit_2columns_metrics_df_shape,
+     streamlit_2columns_metrics_series,
+     streamlit_2columns_metrics_pct_series,
+     streamlit_2columns_metrics_df,
+     streamlit_2columns_metrics_pct_df,
+ )
+
+
+ # Initialize dataframe session state
+ def initialise_data() -> Tuple[Dataset, SplitDataset]:
+     if "input_data_frame" not in st.session_state:
+         st.session_state.input_data_frame = pd.read_csv(
+             r"./data/processed/cr_loan_w2.csv"
+         )
+     if "dataset" not in st.session_state:
+         df = cast(pd.DataFrame, st.session_state.input_data_frame)
+         dataset = Dataset(
+             df=df,
+             random_state=123235,
+             test_size=40,
+         )
+         st.session_state.dataset = dataset
+     else:
+         dataset = st.session_state.dataset
+
+     st.write(
+         "Assuming the data is already cleaned and relevant features (predictors) have been added."
+     )
+
+     with st.expander("Input Dataframe (X and y)"):
+         st.dataframe(dataset.df)
+         streamlit_2columns_metrics_df_shape(dataset.df)
+
+     st.header("Predictors")
+
+     possible_columns = dataset.x_values_column_names
+
+     selected_columns = st.sidebar.multiselect(
+         label="Select Predictors",
+         options=possible_columns,
+         default=possible_columns,
+     )
+
+     selected_x_values = dataset.x_values_filtered_columns(selected_columns)
+
+     st.sidebar.metric(
+         label="# of Predictors Selected",
+         value=selected_x_values.shape[1],
+         delta=None,
+         delta_color="normal",
+     )
+     with st.expander("Predictors Dataframe (X)"):
+         st.dataframe(selected_x_values)
+         streamlit_2columns_metrics_df_shape(selected_x_values)
+
+     # By default, 40% of the data is held out for testing and the
+     # random state is fixed for reproducibility
+
+     st.header("Split Testing and Training Data")
+
+     test_size_slider_col, seed_col = st.columns(2)
+
+     with test_size_slider_col:
+         # Initialize test size
+         dataset.test_size = st.slider(
+             label="Test Size Percentage of Input Dataframe:",
+             min_value=0,
+             max_value=100,
+             value=dataset.test_size,
+             key="init_test_size",
+             format="%f%%",
+         )
+
+     with seed_col:
+         dataset.random_state = int(
+             st.number_input(label="Random State:", value=dataset.random_state)
+         )
+
+     split_dataset = dataset.train_test_split(selected_x_values)
+
+     # Value counts of the true test-set labels
+     true_status = split_dataset.y_test.to_frame().value_counts()
+
+     st.sidebar.metric(
+         label="Testing Data # of Actual Default (=1)",
+         value=true_status.get(1),
+     )
+
+     st.sidebar.metric(
+         label="Testing Data % of Actual Default",
+         value="{:.0%}".format(true_status.get(1) / true_status.sum()),
+     )
+
+     st.sidebar.metric(
+         label="Testing Data # of Actual Non-Default (=0)",
+         value=true_status.get(0),
+     )
+
+     st.sidebar.metric(
+         label="Testing Data % of Actual Non-Default",
+         value="{:.0%}".format(true_status.get(0) / true_status.sum()),
+     )
+
+     # Concat the testing sets
+     X_y_test = split_dataset.X_y_test
+     X_y_train = split_dataset.X_y_train
+
+     with st.expander("Testing Dataframe (X and y)"):
+         st.dataframe(X_y_test)
+         streamlit_2columns_metrics_df_shape(X_y_test)
+
+     streamlit_2columns_metrics_series(
+         "# Defaults(=1) (Testing Data)",
+         "# Non-Defaults(=0) (Testing Data)",
+         true_status,
+     )
+
+     streamlit_2columns_metrics_pct_series(
+         "% Defaults (Testing Data)",
+         "% Non-Defaults (Testing Data)",
+         true_status,
+     )
+
+     st.header("Training Data")
+
+     with st.expander("Training Dataframe (X and y)"):
+         st.dataframe(X_y_train)
+         streamlit_2columns_metrics_df_shape(X_y_train)
+
+     st.subheader("Class Count")
+
+     streamlit_2columns_metrics_df(
+         "# Defaults (Training Data Class Balance Check)",
+         "# Non-Defaults (Training Data Class Balance Check)",
+         split_dataset.y_train,
+     )
+
+     streamlit_2columns_metrics_pct_df(
+         "% Defaults (Training Data Class Balance Check)",
+         "% Non-Defaults (Training Data Class Balance Check)",
+         split_dataset.y_train,
+     )
+
+     balance_the_classes = st.radio(
+         label="Balance the Classes:", options=("Yes", "No")
+     )
+
+     if balance_the_classes == "Yes":
+         st.subheader("Balanced Classes (by Undersampling)")
+
+         (
+             split_dataset.X_train,
+             split_dataset.y_train,
+             _X_y_train,
+             class_balance_default,
+         ) = undersample_training_data(X_y_train, "loan_status", split_dataset)
+
+         streamlit_2columns_metrics_series(
+             "# Defaults (Training Data with Class Balance)",
+             "# Non-Defaults (Training Data with Class Balance)",
+             class_balance_default,
+         )
+
+         streamlit_2columns_metrics_pct_series(
+             "% of Defaults (Training Data with Class Balance)",
+             "% of Non-Defaults (Training Data with Class Balance)",
+             class_balance_default,
+         )
+
+     return dataset, split_dataset
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,30 @@
+ [tool.poetry]
+ name = "credit_risk_modelling"
+ version = "0.1.0"
+ description = ""
+ authors = ["Your Name <you@example.com>"]
+
+ [tool.poetry.dependencies]
+ python = ">=3.8,<3.11"
+ pandas = "^1.4.0"
+ numpy = "^1.22.1"
+ matplotlib = "^3.5.1"
+ seaborn = "^0.11.2"
+ notebook = "^6.4.7"
+ scikit-learn = "^1.0.2"
+ xgboost = "^1.5.2"
+ streamlit = "^1.4.0"
+ plotly = "^5.5.0"
+ graphviz = "^0.19.1"
+
+ [tool.poetry.dev-dependencies]
+ pytest = "^6.2.5"
+ black = "^21.12b0"
+ flake8 = "^4.0.1"
+
+ [tool.black]
+ line-length = 79
+
+ [build-system]
+ requires = ["poetry-core>=1.0.0"]
+ build-backend = "poetry.core.masonry.api"
views/__init__.py ADDED
File without changes
views/decision_tree.py ADDED
@@ -0,0 +1,70 @@
+ from common.data import SplitDataset
+ import streamlit as st
+ from common.util import (
+     test_variables_gbt,
+ )
+ from common.views import (
+     streamlit_chart_setting_height_width,
+     plot_importance_gbt,
+     plot_tree_gbt,
+     download_importance_gbt,
+     download_tree_gbt,
+ )
+ from views.typing import ModelView
+ from views.threshold import decision_tree_threshold_view
+ from views.evaluation import decision_tree_evaluation_view
+
+
+ def decisiontree_view(split_dataset: SplitDataset, currency: str):
+     st.header("Decision Trees")
+
+     clf_gbt_model = test_variables_gbt(
+         split_dataset.X_train, split_dataset.y_train
+     )
+
+     st.subheader("Decision Tree Feature Importance")
+
+     (barxsize, barysize,) = streamlit_chart_setting_height_width(
+         "Chart Settings", 10, 15, "barxsize", "barysize"
+     )
+
+     fig1 = plot_importance_gbt(clf_gbt_model, barxsize, barysize)
+
+     st.pyplot(fig1)
+
+     download_importance_gbt(fig1, barxsize, barysize)
+
+     st.subheader("Decision Tree Structure")
+
+     (treexsize, treeysize,) = streamlit_chart_setting_height_width(
+         "Chart Settings", 15, 10, "treexsize", "treeysize"
+     )
+
+     fig2 = plot_tree_gbt(treexsize, treeysize, clf_gbt_model)
+
+     st.pyplot(fig2)
+
+     download_tree_gbt(treexsize, treeysize)
+     st.markdown(
+         "Note: The downloaded decision tree plot (png) has a higher resolution than the one displayed here."
+     )
+
+     threshold = decision_tree_threshold_view(clf_gbt_model, split_dataset)
+
+     df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
+         decision_tree_evaluation_view(
+             clf_gbt_model,
+             split_dataset,
+             currency,
+             threshold.probability_threshold_selected,
+             threshold.predicted_default_status,
+         )
+     )
+
+     return ModelView(
+         model=clf_gbt_model,
+         trueStatus_probabilityDefault_threshStatus_loanAmount_df=df_trueStatus_probabilityDefault_threshStatus_loanAmount,
+         probability_threshold_selected=threshold.probability_threshold_selected,
+         predicted_default_status=threshold.predicted_default_status,
+         prediction_probability_df=threshold.prediction_probability_df,
+     )
views/evaluation.py ADDED
@@ -0,0 +1,410 @@
+ from typing import Union
+ import pandas as pd
+ import streamlit as st
+ import numpy as np
+ from sklearn.metrics import (
+     classification_report,
+     confusion_matrix,
+ )
+ from sklearn.linear_model import LogisticRegression
+ from xgboost.sklearn import XGBClassifier
+ from common.data import SplitDataset
+ from common.util import (
+     create_cross_validation_df,
+     cross_validation_scores,
+     get_df_trueStatus_probabilityDefault_threshStatus_loanAmount,
+ )
+ from common.views import (
+     cross_validation_graph,
+ )
+
+
+ def make_evaluation_view(
+     model_name_short: str,
+     model_name_generic: str,
+ ):
+     def view(
+         clf_model: Union[XGBClassifier, LogisticRegression],
+         split_dataset: SplitDataset,
+         currency: str,
+         prob_thresh_selected,
+         predicted_default_status,
+     ):
+         st.header(f"Model Evaluation - {model_name_generic}")
+
+         st.subheader("Cross Validation")
+
+         st.write("Shows how the model is likely to perform as new loans come in.")
+         st.write(
+             "If the evaluation metric improves on each fold as the model "
+             "trains, performance is likely to be stable."
+         )
+
+         st.write("XGBoost cross validation test:")
+
+         stcol_seed, stcol_eval_metric = st.columns(2)
+
+         with stcol_seed:
+             cv_seed = int(
+                 st.number_input(
+                     label="Random State Seed for Cross Validation:",
+                     value=123235,
+                     key=f"cv_seed_{model_name_short}",
+                 )
+             )
+
+         with stcol_eval_metric:
+             eval_metric = st.selectbox(
+                 label="Select evaluation metric",
+                 options=[
+                     "auc",
+                     "aucpr",
+                     "rmse",
+                     "mae",
+                     "logloss",
+                     "error",
+                     "merror",
+                     "mlogloss",
+                 ],
+                 key=f"eval_metric_{model_name_short}",
+             )
+
+         stcol_trees, stcol_eval_nfold, stcol_earlystoppingrounds = st.columns(
+             3
+         )
+
+         with stcol_trees:
+             trees = int(
+                 st.number_input(
+                     label="Number of trees",
+                     value=5,
+                     key=f"trees_{model_name_short}",
+                 )
+             )
+
+         with stcol_eval_nfold:
+             nfolds = int(
+                 st.number_input(
+                     label="Number of folds",
+                     value=5,
+                     key=f"nfolds_{model_name_short}",
+                 )
+             )
+
+         with stcol_earlystoppingrounds:
+             early_stopping_rounds = int(
+                 st.number_input(
+                     label="Early stopping rounds",
+                     value=10,
+                     key=f"early_stopping_rounds_{model_name_short}",
+                 )
+             )
+
+         DTrain, cv_df = create_cross_validation_df(
+             split_dataset.X_test,
+             split_dataset.y_test,
+             eval_metric,
+             cv_seed,
+             trees,
+             nfolds,
+             early_stopping_rounds,
+         )
+
+         st.write(cv_df)
+
+         scoring_options = [
+             "roc_auc",
+             "accuracy",
+             "precision",
+             "recall",
+             "f1",
+             "jaccard",
+         ]
+
+         overfit_test = st.radio(
+             label="Overfit test:",
+             options=("No", "Yes"),
+             key=f"overfit_test_{model_name_short}",
+         )
+
+         if overfit_test == "Yes":
+             st.write("Overfit test:")
+             iterations = int(
+                 st.number_input(
+                     label="Number of folds (iterations)",
+                     value=500,
+                     key=f"iterations_{model_name_short}",
+                 )
+             )
+
+             DTrain, cv_df_it = create_cross_validation_df(
+                 split_dataset.X_test,
+                 split_dataset.y_test,
+                 eval_metric,
+                 cv_seed,
+                 iterations,
+                 nfolds,
+                 iterations,
+             )
+
+             fig_it = cross_validation_graph(cv_df_it, eval_metric, iterations)
+             st.pyplot(fig_it)
+
+         st.write("Sklearn cross validation test:")
+         stcol_scoringmetric, st_nfold = st.columns(2)
+
+         with stcol_scoringmetric:
+             score_metric = st.selectbox(
+                 label="Select score",
+                 options=scoring_options,
+                 key=f"stcol_scoringmetric_{model_name_short}",
+             )
+
+         with st_nfold:
+             nfolds_score = int(
+                 st.number_input(
+                     label="Number of folds",
+                     value=5,
+                     key=f"st_nfold_{model_name_short}",
+                 )
+             )
+
+         cv_scores = cross_validation_scores(
+             clf_model,
+             split_dataset.X_test,
+             split_dataset.y_test,
+             nfolds_score,
+             score_metric,
+             cv_seed,
+         )
+
+         stcol_vals, stcol_mean, st_std = st.columns(3)
+
+         with stcol_vals:
+             st.markdown(f"{score_metric} scores:")
+             st.write(
+                 pd.DataFrame(
+                     cv_scores,
+                     columns=[score_metric],
+                 )
+             )
+
+         with stcol_mean:
+             st.metric(
+                 label=f"Average {score_metric} score",
+                 value="{:.4f}".format(cv_scores.mean()),
+                 delta=None,
+                 delta_color="normal",
+             )
+
+         with st_std:
+             st.metric(
+                 label=f"{score_metric} standard deviation (+/-)",
+                 value="{:.4f}".format(cv_scores.std()),
+                 delta=None,
+                 delta_color="normal",
+             )
+
+         st.subheader("Classification Report")
+
+         target_names = ["Non-Default", "Default"]
+
+         classification_report_dict = classification_report(
+             split_dataset.y_test,
+             predicted_default_status,
+             target_names=target_names,
+             output_dict=True,
+         )
+
+         (
+             stcol_defaultpres,
+             stcol_defaultrecall,
+             stcol_defaultf1score,
+             stcol_f1score,
+         ) = st.columns(4)
+         with stcol_defaultpres:
+             st.metric(
+                 label="Default Precision",
+                 value="{:.0%}".format(
+                     classification_report_dict["Default"]["precision"]
+                 ),
+                 delta=None,
+                 delta_color="normal",
+             )
+
+         with stcol_defaultrecall:
+             st.metric(
+                 label="Default Recall",
+                 value="{:.0%}".format(
+                     classification_report_dict["Default"]["recall"]
+                 ),
+                 delta=None,
+                 delta_color="normal",
+             )
+
+         with stcol_defaultf1score:
+             st.metric(
+                 label="Default F1 Score",
+                 value="{:.2f}".format(
+                     classification_report_dict["Default"]["f1-score"]
+                 ),
+                 delta=None,
+                 delta_color="normal",
+             )
+
+         with stcol_f1score:
+             st.metric(
+                 label="Macro avg F1 Score (Model F1 Score):",
+                 value="{:.2f}".format(
+                     classification_report_dict["macro avg"]["f1-score"]
+                 ),
+                 delta=None,
+                 delta_color="normal",
+             )
+
+         with st.expander("Classification Report Dictionary:"):
+             st.write(classification_report_dict)
+
+         st.markdown(
+             f'Default precision: {"{:.0%}".format(classification_report_dict["Default"]["precision"])} of loans predicted as default were actually default.'
+         )
+
+         st.markdown(
+             f'Default recall: {"{:.0%}".format(classification_report_dict["Default"]["recall"])} of true defaults were predicted correctly.'
+         )
+
+         f1_gap = 1 - classification_report_dict["Default"]["f1-score"]
+         st.markdown(
+             f'Default F1 score: {"{:.2f}".format(classification_report_dict["Default"]["f1-score"])} '
+             f'is {"{:.2f}".format(f1_gap)} away from perfect precision and recall (a score of 1.0).'
+         )
+
+         st.markdown(
+             f'Macro avg F1 score: {"{:.2f}".format(classification_report_dict["macro avg"]["f1-score"])} is the model\'s F1 score.'
+         )
+
+         st.subheader("Confusion Matrix")
+         confusion_matrix_array = confusion_matrix(
+             split_dataset.y_test, predicted_default_status
+         )
+
+         tn, fp, fn, tp = confusion_matrix_array.ravel()
+
+         with st.expander(
+             "Confusion matrix (column name = classification model prediction, row name = true status, values = number of loans)"
+         ):
+             st.write(confusion_matrix_array)
+
+         st.markdown(
+             f'{tp} ({"{:.0%}".format(tp / len(predicted_default_status))}) true positives (defaults correctly predicted as defaults).'
+         )
+
+         st.markdown(
+             f'{fp} ({"{:.0%}".format(fp / len(predicted_default_status))}) false positives (non-defaults incorrectly predicted as defaults).'
+         )
+
+         st.markdown(
+             f'{fn} ({"{:.0%}".format(fn / len(predicted_default_status))}) false negatives (defaults incorrectly predicted as non-defaults).'
+         )
+
+         st.markdown(
+             f'{tn} ({"{:.0%}".format(tn / len(predicted_default_status))}) true negatives (non-defaults correctly predicted as non-defaults).'
+         )
+
+         st.subheader("Bad Rate")
+
+         df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
+             get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
+                 clf_model,
+                 split_dataset.X_test,
+                 split_dataset.y_test,
+                 prob_thresh_selected,
+                 "loan_amnt",
+             )
+         )
+
+         with st.expander(
+             "Loan Status, Probability of Default, & Loan Amount DataFrame"
+         ):
+             st.write(df_trueStatus_probabilityDefault_threshStatus_loanAmount)
+
+         accepted_loans = (
+             df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                 df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                     "PREDICT_DEFAULT_STATUS"
+                 ]
+                 == 0
+             ]
+         )
+
+         bad_rate = (
+             np.sum(accepted_loans["loan_status"])
+             / accepted_loans["loan_status"].count()
+         )
+
+         with st.expander("Loan Amount Summary Statistics"):
+             st.write(
+                 df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                     "loan_amnt"
+                 ].describe()
+             )
+
+         avg_loan = np.mean(
+             df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                 "loan_amnt"
+             ]
+         )
+
+         # Rows = true status, columns = model prediction; cells scaled by the
+         # average loan value
+         crosstab_df = pd.crosstab(
+             df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                 "loan_status"
+             ],
+             df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                 "PREDICT_DEFAULT_STATUS"
+             ],
+         ).apply(lambda x: x * avg_loan, axis=0)
+
+         with st.expander(
+             "Cross tabulation (column name = classification model prediction, row name = true status, values = number of loans * average loan value)"
+         ):
+             st.write(crosstab_df)
+
+         st.write(
+             f'Bad rate: {"{:.2%}".format(bad_rate)} of all the loans the model accepted (classified as non-default) from the test set were actually defaults.'
+         )
+
+         st.write(
+             f'Estimated value of the bad rate is {currency} {"{:,.2f}".format(crosstab_df[0][1])}.'
+         )
+
+         st.write(
+             f'Total estimated value of loans the model accepted (predicted non-default) is {currency} {"{:,.2f}".format(crosstab_df[0][0] + crosstab_df[0][1])}.'
+         )
+
+         st.write(
+             f'Estimated value of loans incorrectly predicted as default is {currency} {"{:,.2f}".format(crosstab_df[1][0])}.'
+         )
+
+         st.write(
+             f'Estimated value of loans correctly predicted as defaults is {currency} {"{:,.2f}".format(crosstab_df[1][1])}.'
+         )
+
+         return df_trueStatus_probabilityDefault_threshStatus_loanAmount
+
+     return view
+
+
+ decision_tree_evaluation_view = make_evaluation_view("gbt", "Decision Tree")
+ logistic_evaluation_view = make_evaluation_view("lg", "Logistic Regression")
views/logistic.py ADDED
@@ -0,0 +1,119 @@
+ from common.data import SplitDataset
+ import streamlit as st
+ import pandas as pd
+ import plotly.express as px
+ from views.threshold import logistic_threshold_view
+ from views.evaluation import logistic_evaluation_view
+ from common.util import (
+     test_variables_logistic,
+     print_coeff_logistic,
+     model_probability_values_df,
+     apply_threshold_to_probability_values,
+ )
+ from common.views import (
+     streamlit_2columns_metrics_df,
+     streamlit_2columns_metrics_pct_df,
+ )
+ from views.typing import ModelView
+
+
+ def logistic_view(split_dataset: SplitDataset, currency: str) -> ModelView:
+     # Fit the logistic regression model on the training data
+
+     st.header("Logistic Regression")
+
+     clf_logistic_model = test_variables_logistic(
+         split_dataset.X_train, split_dataset.y_train
+     )
+
+     st.metric(
+         label="# of Coefficients in Logistic Regression",
+         value=clf_logistic_model.n_features_in_,
+         delta=None,
+         delta_color="normal",
+     )
+
+     coef_dict = print_coeff_logistic(clf_logistic_model, split_dataset)
+
+     st.subheader("Logistic Regression Coefficient Values")
+
+     coef_dict_sorted = dict(
+         sorted(coef_dict.items(), key=lambda item: item[1], reverse=False)
+     )
+
+     data_items = coef_dict_sorted.items()
+     data_list = list(data_items)
+
+     df = pd.DataFrame(data_list, columns=["Coefficient", "Value"])
+
+     fig1 = px.bar(data_frame=df, x="Value", y="Coefficient", orientation="h")
+
+     fig1.update_layout(
+         title="Logistic Regression Coefficients",
+         xaxis_title="Value",
+         yaxis_title="Coefficient",
+     )
+
+     st.plotly_chart(fig1)
+
+     st.subheader("Classification Probability Threshold")
+
+     st.write(
+         """
+         The logistic regression model (fitted on the training data) is applied to the testing data to predict each loan's probability of defaulting.\n
+         Each predicted probability of default is compared to a probability threshold.\n
+         A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
+         """
+     )
+
+     threshold = st.slider(
+         label="Default Probability Threshold:",
+         min_value=0.0,
+         max_value=1.0,
+         value=0.7,
+         key="key_threshold",
+     )
+
+     clf_prediction_prob_df_log = model_probability_values_df(
+         clf_logistic_model,
+         split_dataset.X_test,
+     )
+
+     clf_thresh_predicted_default_status_user = (
+         apply_threshold_to_probability_values(
+             clf_prediction_prob_df_log,
+             threshold,
+         )
+     )
+
+     streamlit_2columns_metrics_df(
+         "# of Predicted Defaults",
+         "# of Predicted Non-Default",
+         clf_thresh_predicted_default_status_user,
+     )
+
+     streamlit_2columns_metrics_pct_df(
+         "% of Loans Predicted to Default",
+         "% of Loans Predicted not to Default",
+         clf_thresh_predicted_default_status_user,
+     )
+
+     threshold = logistic_threshold_view(clf_logistic_model, split_dataset)
+
+     df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
+         logistic_evaluation_view(
+             clf_logistic_model,
+             split_dataset,
+             currency,
+             threshold.probability_threshold_selected,
+             threshold.predicted_default_status,
+         )
+     )
+
+     return ModelView(
+         model=clf_logistic_model,
+         trueStatus_probabilityDefault_threshStatus_loanAmount_df=df_trueStatus_probabilityDefault_threshStatus_loanAmount,
+         probability_threshold_selected=threshold.probability_threshold_selected,
+         predicted_default_status=threshold.predicted_default_status,
+         prediction_probability_df=threshold.prediction_probability_df,
+     )
views/model_comparison.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import OrderedDict
2
+ import streamlit as st
3
+ from sklearn.metrics import roc_auc_score
4
+ from common.data import SplitDataset
5
+ from common.views import (
6
+ roc_auc_compare_n_models,
7
+ streamlit_chart_setting_height_width,
8
+ calibration_curve_report_commented_n,
9
+ )
10
+ from views.typing import ModelView
11
+
12
+
+ def roc_auc_for_model(split_dataset: SplitDataset, model_view: ModelView):
+     roc_auc_model = roc_auc_score(
+         split_dataset.y_test, model_view.predicted_default_status
+     )
+
+     if roc_auc_model > 0.9:
+         roc_auc_lvl = f"Very good ({roc_auc_model:.2f} > 0.9)"
+     elif roc_auc_model > 0.8:
+         roc_auc_lvl = f"Good (0.8 < {roc_auc_model:.2f} <= 0.9)"
+     elif roc_auc_model > 0.7:
+         roc_auc_lvl = f"Fair (0.7 < {roc_auc_model:.2f} <= 0.8)"
+     elif roc_auc_model > 0.6:
+         roc_auc_lvl = f"Poor (0.6 < {roc_auc_model:.2f} <= 0.7)"
+     else:
+         roc_auc_lvl = f"Fail ({roc_auc_model:.2f} <= 0.6)"
+
+     return roc_auc_model, roc_auc_lvl
+
+
+ def model_comparison_view(
+     split_dataset: SplitDataset,
+     model_views: OrderedDict[str, ModelView],
+ ):
+     st.header("Model Comparison")
+
+     for model_name, model_view in model_views.items():
+         roc_auc_model, roc_auc_lvl = roc_auc_for_model(
+             split_dataset, model_view
+         )
+         st.subheader(
+             f"Receiver Operating Characteristic (ROC) Curve - {model_name}"
+         )
+         st.markdown(
+             f'The Area Under the Receiver Operating Characteristic Curve (ROC AUC) from the prediction scores of the "{model_name}" model is {roc_auc_model:.2f}.\n'
+         )
+         st.markdown(
+             f"The score of {roc_auc_model:.2f} is in the {roc_auc_lvl} ROC AUC score category."
+         )
+
+     fig1 = roc_auc_compare_n_models(
+         split_dataset.y_test,
+         model_views,
+     )
+
+     # roc_auc_compare_n_models is assumed to return a Matplotlib Axes;
+     # grab its parent Figure for sizing and rendering.
+     fig1 = fig1.figure
+
+     (xsize_roc, ysize_roc) = streamlit_chart_setting_height_width(
+         "Chart Settings", 7, 7, "xsize_roc", "ysize_roc"
+     )
+
+     fig1.set_size_inches(xsize_roc, ysize_roc)
+
+     st.pyplot(fig1)
+
+     st.subheader("Models Calibration Curve")
+
+     fig2 = calibration_curve_report_commented_n(
+         split_dataset.y_test,
+         model_views,
+         10,
+     )
+     fig2 = fig2.figure
+
+     (xsize_cal, ysize_cal) = streamlit_chart_setting_height_width(
+         "Chart Settings", 7, 7, "xsize_cal", "ysize_cal"
+     )
+
+     fig2.set_size_inches(xsize_cal, ysize_cal)
+
+     # fig2 is already a Figure at this point, so pass it directly.
+     st.pyplot(fig2)
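`calibration_curve_report_commented_n` comes from `common/views.py` and is not shown in this section. For orientation only, a calibration curve of the kind plotted above can be sketched directly with scikit-learn; `y_test` and `y_prob` below are synthetic stand-ins, not the app's data:

```python
# Illustrative calibration-curve sketch; y_test / y_prob are synthetic
# stand-ins for held-out labels and predicted default probabilities.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

rng = np.random.default_rng(0)
y_prob = rng.uniform(size=1_000)   # predicted probabilities
y_test = rng.binomial(1, y_prob)   # labels drawn to match them

# Bin predictions and compare the mean prediction in each bin
# to the observed default rate in that bin.
prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)

fig, ax = plt.subplots()
ax.plot(prob_pred, prob_true, marker="o", label="model")
ax.plot([0, 1], [0, 1], linestyle="--", label="perfectly calibrated")
ax.set_xlabel("Mean predicted probability")
ax.set_ylabel("Observed default rate")
ax.legend()
plt.show()
```

The diagonal is the perfectly calibrated reference: a well-calibrated model's binned mean predictions track the observed default rate.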
views/strategy_table.py ADDED
@@ -0,0 +1,96 @@
+ from typing import OrderedDict
+ import plotly.express as px
+ import numpy as np
+ import streamlit as st
+ from common.util import create_strategyTable_df
+ from views.typing import ModelView
+
+
+ def strategy_table_view(
+     currency: str, model_views: OrderedDict[str, ModelView]
+ ):
+     st.header("Strategy Table")
+
+     for model_name, model_view in model_views.items():
+         st.subheader(model_name)
+         strat_df = create_strategyTable_df(
+             0.05,
+             1,
+             20,
+             model_view.trueStatus_probabilityDefault_threshStatus_loanAmount_df,
+             "loan_status",
+             currency,
+         )
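+         # Assumed reading of create_strategyTable_df (common/util.py, not
+         # shown here): sweep acceptance rates from 0.05 to 1 in 20 steps and
+         # report, per rate, the matching probability threshold, bad rate, and
+         # estimated portfolio value.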
+
+         columns = strat_df.columns
+
+         with st.expander("Strategy Table:"):
+             st.write(strat_df)
+
+         for i in columns:
+             strat_df[i] = strat_df[i].astype(np.float64)
+
+         strat_df_boxPlot_data = strat_df.iloc[:, 0:3]
+
+         plot = px.box(data_frame=strat_df_boxPlot_data)
+
+         st.plotly_chart(plot)
+
+         # Plot the strategy curve
+         fig1 = px.line(
+             strat_df_boxPlot_data,
+             x="Acceptance Rate",
+             y="Bad Rate",
+             title="Acceptance and Bad Rates",
+         )
+
+         st.plotly_chart(fig1)
+
+         fig2 = px.line(
+             strat_df,
+             x="Acceptance Rate",
+             y=f"Estimated Value ({currency})",
+             title=f"Estimated Value ({currency}) by Acceptance Rate",
+         )
+
+         st.plotly_chart(fig2)
+
+         st.write("Row with the greatest estimated value:")
+
+         # The columns were already cast to float above, so a single
+         # np.max suffices.
+         max_estimated_value = np.max(
+             strat_df[f"Estimated Value ({currency})"]
+         )
+
+         st.write(
+             strat_df.loc[
+                 strat_df[f"Estimated Value ({currency})"]
+                 == max_estimated_value
+             ]
+         )
+
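+         # Expected loss per loan = PD * LGD * EAD: the probability of default
+         # times the loss given default times the exposure at default (here the
+         # loan amount). LGD is set to 1, i.e. nothing is assumed recovered on
+         # default.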
+         loss_given_default = 1
+         df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
+             model_view.trueStatus_probabilityDefault_threshStatus_loanAmount_df[
+                 "PROB_DEFAULT"
+             ]
+             * loss_given_default
+             * model_view.trueStatus_probabilityDefault_threshStatus_loanAmount_df[
+                 "loan_amnt"
+             ]
+         )
+
+         tot_exp_loss = round(
+             np.sum(df_trueStatus_probabilityDefault_threshStatus_loanAmount),
+             2,
+         )
+
+         st.metric(
+             label="Total expected loss:",
+             value=f"{currency} {tot_exp_loss:,.2f}",
+             delta=None,
+             delta_color="normal",
+         )
@@ -0,0 +1,272 @@
 
+ from dataclasses import dataclass
+ from typing import Union, cast
+ import numpy as np
+ import streamlit as st
+ import plotly.express as px
+ import pandas as pd
+ from xgboost.sklearn import XGBClassifier
+ from sklearn.linear_model import LogisticRegression
+ from common.data import SplitDataset
+ from common.util import (
+     model_probability_values_df,
+     apply_threshold_to_probability_values,
+     find_best_threshold_J_statistic,
+     default_status_per_threshold,
+     classification_report_per_threshold,
+     thresh_classification_report_recall_accuracy,
+ )
+ from common.views import (
+     streamlit_2columns_metrics_df,
+     streamlit_2columns_metrics_pct_df,
+ )
+
+
+ @dataclass(frozen=True)
+ class Threshold:
+     probability_threshold_selected: float
+     predicted_default_status: pd.Series
+     prediction_probability_df: pd.DataFrame
+
+
+ def make_threshold_view(
+     model_name_short: str,
+     model_name: str,
+ ):
+     def view(
+         clf_gbt_model: Union[XGBClassifier, LogisticRegression],
+         split_dataset: SplitDataset,
+     ) -> Threshold:
+         st.subheader("Classification Probability Threshold - User Defined")
+         st.write(
+             f"""
+             The {model_name} model (fitted on the training data) is applied to the testing data to predict each loan's probability of defaulting.\n
+             Each predicted probability of default is compared to a probability threshold.\n
+             A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
+             """
+         )
+
+         threshold_gbt_default = st.slider(
+             label="Default Probability Threshold:",
+             min_value=0.0,
+             max_value=1.0,
+             value=0.8,
+             key=f"threshold_{model_name_short}_default",
+         )
+
+         clf_prediction_prob_df_gbt = model_probability_values_df(
+             clf_gbt_model,
+             split_dataset.X_test,
+         )
+
+         clf_thresh_predicted_default_status_user_gbt = (
+             apply_threshold_to_probability_values(
+                 clf_prediction_prob_df_gbt,
+                 threshold_gbt_default,
+             )
+         )
+
+         streamlit_2columns_metrics_df(
+             "# of Predicted Defaults",
+             "# of Predicted Non-Default",
+             clf_thresh_predicted_default_status_user_gbt,
+         )
+
+         streamlit_2columns_metrics_pct_df(
+             "% of Loans Predicted to Default",
+             "% of Loans Predicted not to Default",
+             clf_thresh_predicted_default_status_user_gbt,
+         )
+
+         st.subheader("J Statistic Driven Classification Probability Threshold")
+
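+         # Youden's J statistic is J = TPR - FPR (equivalently, sensitivity +
+         # specificity - 1); the J-driven threshold is the point on the ROC
+         # curve that maximises J. find_best_threshold_J_statistic lives in
+         # common/util.py (not shown here) and is assumed to implement this,
+         # roughly:
+         #     fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_test, prob_default)
+         #     best_threshold = thresholds[(tpr - fpr).argmax()]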
+         J_statistic_best_threshold = find_best_threshold_J_statistic(
+             split_dataset.y_test, clf_prediction_prob_df_gbt
+         )
+         st.metric(
+             label="Youden's J statistic calculated best threshold",
+             value=J_statistic_best_threshold,
+         )
+
+         clf_thresh_predicted_default_status_Jstatistic_gbt = (
+             apply_threshold_to_probability_values(
+                 clf_prediction_prob_df_gbt,
+                 J_statistic_best_threshold,
+             )
+         )
+
+         streamlit_2columns_metrics_df(
+             "# of Predicted Defaults",
+             "# of Predicted Non-Default",
+             clf_thresh_predicted_default_status_Jstatistic_gbt,
+         )
+
+         streamlit_2columns_metrics_pct_df(
+             "% of Loans Predicted to Default",
+             "% of Loans Predicted not to Default",
+             clf_thresh_predicted_default_status_Jstatistic_gbt,
+         )
+
+         st.subheader(
+             "Recall and Accuracy Tradeoff with given Probability Threshold"
+         )
+         # Steps
+         # Get list of thresholds
+         # Get default status per threshold
+         # Get classification report per threshold
+         # Get recall, nondef recall, and accuracy per threshold
+
+         threshold_list = np.arange(0, 1, 0.025).round(decimals=3).tolist()
+
+         threshold_default_status_list = default_status_per_threshold(
+             threshold_list, clf_prediction_prob_df_gbt["PROB_DEFAULT"]
+         )
+         thresh_classification_report_dict = (
+             classification_report_per_threshold(
+                 threshold_list,
+                 threshold_default_status_list,
+                 split_dataset.y_test,
+             )
+         )
+
+         (
+             thresh_def_recalls_list,
+             thresh_nondef_recalls_list,
+             thresh_accs_list,
+         ) = thresh_classification_report_recall_accuracy(
+             thresh_classification_report_dict
+         )
+
+         namelist = [
+             "Default Recall",
+             "Non Default Recall",
+             "Accuracy",
+             "Threshold",
+         ]
+
+         df = pd.DataFrame(
+             [
+                 thresh_def_recalls_list,
+                 thresh_nondef_recalls_list,
+                 thresh_accs_list,
+                 threshold_list,
+             ],
+             index=namelist,
+         )
+
+         df = df.T
+
+         fig2 = px.line(
+             data_frame=df,
+             y=["Default Recall", "Non Default Recall", "Accuracy"],
+             x="Threshold",
+         )
+
+         fig2.update_layout(
+             title="Recall and Accuracy score Trade-off with Probability Threshold",
+             xaxis_title="Probability Threshold",
+             yaxis_title="Score",
+         )
+         fig2.update_yaxes(range=[0.0, 1.0])
+
+         st.plotly_chart(fig2)
+
+         st.subheader("Acceptance Rate Driven Probability Threshold")
+         # Steps
+         # Set acceptance rate
+         # Get default status per threshold
+         # Get classification report per threshold
+         # Get recall, nondef recall, and accuracy per threshold
+
+         acceptance_rate = (
+             st.slider(
+                 label="% of loans accepted (acceptance rate):",
+                 min_value=0,
+                 max_value=100,
+                 value=85,
+                 key=f"acceptance_rate_{model_name_short}",
+                 format="%d%%",
+             )
+             / 100
+         )
+
+ acc_rate_thresh_gbt = np.quantile(
193
+ clf_prediction_prob_df_gbt["PROB_DEFAULT"], acceptance_rate
194
+ )
195
+
196
+ st.write(
197
+ f"An acceptance rate of {acceptance_rate} results in probability threshold of {acc_rate_thresh_gbt}"
198
+ )
199
+
200
+ figa = px.histogram(clf_prediction_prob_df_gbt["PROB_DEFAULT"])
201
+
202
+ figa.update_layout(
203
+ title="Acceptance Rate Threshold vs. Loans Accepted",
204
+ xaxis_title="Acceptance Rate Threshold",
205
+ yaxis_title="Loans Accepted",
206
+ )
207
+
208
+ figa.update_traces(marker_line_width=1, marker_line_color="white")
209
+
210
+ figa.add_vline(
211
+ x=acc_rate_thresh_gbt,
212
+ line_width=3,
213
+ line_dash="solid",
214
+ line_color="red",
215
+ )
216
+
217
+ st.plotly_chart(figa)
218
+
219
+ clf_thresh_predicted_default_status_acceptance_gbt = (
220
+ apply_threshold_to_probability_values(
221
+ clf_prediction_prob_df_gbt,
222
+ acc_rate_thresh_gbt,
223
+ )
224
+ )
225
+
226
+ st.write()
+         st.subheader("Selected Probability Threshold")
+
+         options = [
+             "User Defined",
+             "J Statistic Driven",
+             "Acceptance Rate Driven",
+         ]
+         prob_thresh_option = st.radio(
+             label="Selected Probability Threshold",
+             options=options,
+             key=f"{model_name_short}_radio_thresh",
+         )
+
+         if prob_thresh_option == "User Defined":
+             prob_thresh_selected_gbt = threshold_gbt_default
+             predicted_default_status_gbt = (
+                 clf_thresh_predicted_default_status_user_gbt
+             )
+         elif prob_thresh_option == "J Statistic Driven":
+             prob_thresh_selected_gbt = J_statistic_best_threshold
+             predicted_default_status_gbt = (
+                 clf_thresh_predicted_default_status_Jstatistic_gbt
+             )
+         else:
+             prob_thresh_selected_gbt = acc_rate_thresh_gbt
+             predicted_default_status_gbt = (
+                 clf_thresh_predicted_default_status_acceptance_gbt
+             )
+
+         st.write(
+             f"Selected probability threshold is {prob_thresh_selected_gbt}"
+         )
+
+         return Threshold(
+             probability_threshold_selected=cast(
+                 float, prob_thresh_selected_gbt
+             ),
+             predicted_default_status=predicted_default_status_gbt,
+             prediction_probability_df=clf_prediction_prob_df_gbt,
+         )
+
+     return view
+
+
+ # Pre-built views: "gbt" pairs with the XGBoost model, "lg" with logistic
+ # regression; the second argument is the display name used in the UI copy.
+ decision_tree_threshold_view = make_threshold_view("gbt", "decision tree")
+ logistic_threshold_view = make_threshold_view("lg", "logistic")
views/typing.py ADDED
@@ -0,0 +1,15 @@
+ from dataclasses import dataclass
+ from typing import Union
+
+ import pandas as pd
+ from xgboost.sklearn import XGBClassifier
+ from sklearn.linear_model import LogisticRegression
+
+
+ @dataclass(frozen=True)
+ class ModelView:
+     model: Union[XGBClassifier, LogisticRegression]
+     probability_threshold_selected: float
+     predicted_default_status: pd.Series
+     trueStatus_probabilityDefault_threshStatus_loanAmount_df: pd.DataFrame
+     prediction_probability_df: pd.DataFrame