File size: 11,102 Bytes
7d861ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
# DATA MANIPULATION & ANALYSIS

import pickle
import streamlit as st

# Arrays
import numpy as np

# DataFrames and Series
import pandas as pd

# Returns the indices of the maximum values along an axis
from numpy import argmax

# MODELLING

# Logistic regression
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import StratifiedKFold

# XGBoosted Decision Trees
import xgboost as xgb


# REPORTING, EVALUATION, AND INTERPRETATION

# Classification report
from sklearn.metrics import classification_report

# Receiver Operating Characteristic (ROC) curve
from sklearn.metrics import roc_curve


# Evaluate a score by cross-validation
from sklearn.model_selection import cross_val_score


# # Functions


def drop_columns(df, columns):
    """Return a copy of ``df`` without the given column label(s).

    ``columns`` may be a single label or a list of labels; ``df`` itself is
    left unchanged.
    """
    trimmed = df.drop(columns=columns)
    return trimmed


def remove_less_than_0_columns(df, column):
    """Return the rows of ``df`` where at least one selected column is non-zero.

    Despite the function name, the test applied is ``!= 0`` (not ``< 0``):
    a row is kept as soon as any of the selected columns differs from zero.
    Missing values compare as "not equal to zero", so rows holding NaN in a
    selected column are kept as well.

    Args:
        df: DataFrame to filter. It is not modified.
        column: A column label or a list of column labels to inspect.

    Returns:
        The subset of ``df`` rows that satisfy the condition, with the
        original index preserved.
    """
    # A single label would select a Series, which has no axis=1 to reduce
    # over; wrap it so the selection is always a DataFrame.
    selected = column if pd.api.types.is_list_like(column) else [column]
    # ``axis`` must be passed by keyword: recent pandas versions reject the
    # positional form ``.any(1)``.
    has_nonzero = (df[selected] != 0).any(axis=1)
    return df.loc[has_nonzero]


def boolean_int_condition_label(df, label_column_name, condition):
    """Build an integer label from ``condition`` and separate it from ``df``.

    ``condition`` is first written into ``df`` under ``label_column_name``
    (so the frame passed in gains, or has overwritten, that column), then
    cast to ``int`` to form the label.

    Returns:
        A ``(labels, features)`` tuple: the integer label Series and a new
        DataFrame without the label column.
    """
    # NOTE: this assignment modifies the DataFrame object passed by the caller.
    df[label_column_name] = condition
    labels = df[label_column_name].astype(int)
    features = drop_columns(df, label_column_name)
    return labels, features


@st.cache(suppress_st_warning=True)
def undersample_training_data(
    df: pd.DataFrame, column_name: str, split_dataset
):
    """Balance the two label classes of ``df`` by random undersampling.

    The per-class sample size is the smaller of the two label counts found
    in ``split_dataset.X_y_train[column_name]``. Rows of ``df`` labelled 0
    and rows labelled 1 are each randomly sampled down to that size and
    stacked (label-0 rows first).

    Args:
        df: Frame holding features and the label column.
        column_name: Name of the label column (values 0 and 1 are selected).
        split_dataset: Object exposing an ``X_y_train`` frame that contains
            ``column_name``.

    Returns:
        A list ``[X, y, X_y, class_balance]``: features without the label,
        the label Series, the combined undersampled frame, and the label
        value counts of that frame.
    """
    # Unpacking requires exactly two distinct label values. value_counts()
    # orders by frequency rather than by label, so only the minimum is used.
    first_count, second_count = split_dataset.X_y_train[
        column_name
    ].value_counts()
    per_class = min(first_count, second_count)

    label_zero_rows = df[df[column_name] == 0]
    label_one_rows = df[df[column_name] == 1]

    # Sampling order (label 0, then label 1) is kept so the random draws
    # happen in the same sequence.
    sampled_zero = label_zero_rows.sample(per_class)
    sampled_one = label_one_rows.sample(per_class)

    # Each part is re-indexed from 0 before stacking, so index values repeat
    # across the two halves of the result.
    balanced = pd.concat(
        [
            sampled_zero.reset_index(drop=True),
            sampled_one.reset_index(drop=True),
        ],
        axis=0,
    )

    features = balanced.drop([column_name], axis=1)
    labels = balanced[column_name]
    class_balance = balanced[column_name].value_counts()

    return [features, labels, balanced, class_balance]


def create_coeffient_feature_dictionary_logistic_model(
    logistic_model, training_data
):
    """Map each training column name to its coefficient in ``logistic_model``.

    Coefficients are read from the first row of ``logistic_model.coef_`` and
    paired positionally with ``training_data.columns``.
    """
    first_row_coefficients = logistic_model.coef_[0, :]
    feature_names = training_data.columns
    return dict(zip(feature_names, first_row_coefficients))


@st.cache(suppress_st_warning=True)
def test_variables_logistic(X_train, y_train):
    """Fit and return a logistic regression model (``lbfgs`` solver).

    ``y_train`` is flattened to one dimension before fitting.
    """
    flat_labels = np.ravel(y_train)
    classifier = LogisticRegression(solver="lbfgs")
    # fit() returns the estimator itself, so the fitted model is returned.
    return classifier.fit(X_train, flat_labels)


@st.cache(suppress_st_warning=True)
def print_coeff_logistic(clf_logistic_model, split_dataset):
    """Return a feature-name -> coefficient dictionary for the fitted model.

    Feature names are taken from the columns of ``split_dataset.X_train``.
    Despite the name, nothing is printed here.
    """
    coefficients_by_feature = (
        create_coeffient_feature_dictionary_logistic_model(
            logistic_model=clf_logistic_model,
            training_data=split_dataset.X_train,
        )
    )
    return coefficients_by_feature


@st.cache(suppress_st_warning=True, hash_funcs={
    xgb.XGBClassifier: pickle.dumps
})
def test_variables_gbt(X_train, y_train):
    """Fit and return an XGBoost classifier on the training data.

    The classifier uses ``learning_rate=0.1``, ``max_depth=7`` and the
    ``logloss`` evaluation metric. ``y_train`` is flattened to one dimension
    before fitting.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    classifier = xgb.XGBClassifier(
        learning_rate=0.1,
        max_depth=7,
        use_label_encoder=False,
        eval_metric="logloss",
    )
    # The metric is already configured on the estimator above. Passing
    # eval_metric to fit() as well was redundant, and that keyword is
    # deprecated (and rejected by newer XGBoost releases).
    return classifier.fit(X_train, np.ravel(y_train))


# In[398]:


def get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
    model, X, y, threshold, loan_amount_col_name
):
    """Assemble true status, default probability, thresholded status and loan amount.

    The probability is the second column of ``model.predict_proba`` evaluated
    on a contiguous array copy of ``X``. A row is flagged 1 in
    ``PREDICT_DEFAULT_STATUS`` when its probability is strictly greater than
    ``threshold``, otherwise 0.

    Returns:
        A DataFrame with a fresh 0..n-1 index whose columns are, in order:
        the true status (named after ``y``), ``PROB_DEFAULT``,
        ``PREDICT_DEFAULT_STATUS`` and the ``loan_amount_col_name`` column
        of ``X``.
    """

    def _flag_default(probability):
        return 1 if probability > threshold else 0

    actual_status = y.to_frame()
    amounts = X[loan_amount_col_name]

    class_probabilities = model.predict_proba(np.ascontiguousarray(X))
    probability_frame = pd.DataFrame(
        class_probabilities[:, 1], columns=["PROB_DEFAULT"]
    )

    predicted_status = probability_frame["PROB_DEFAULT"].apply(_flag_default)
    predicted_status = predicted_status.rename("PREDICT_DEFAULT_STATUS")

    # Indexes are reset so the pieces line up positionally when joined.
    pieces = (actual_status, probability_frame, predicted_status, amounts)
    return pd.concat(
        [piece.reset_index(drop=True) for piece in pieces], axis=1
    )


def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
    """Return the ROC threshold that maximises Youden's J statistic.

    NOTE: a second function with this same name is defined further down in
    this module and replaces this definition once the module is loaded.
    """
    false_positive_rate, true_positive_rate, candidates = roc_curve(
        y, clf_prediction_prob_df
    )
    # Youden's J statistic is TPR - FPR; pick the threshold where it peaks.
    youden_j = true_positive_rate - false_positive_rate
    best_index = argmax(youden_j)
    return candidates[best_index]


# In[399]:


# Functions that build a dataframe with the probability of default, the
# predicted default status (based on a threshold), and the actual default status


def model_probability_values_df(model, X):
    """Return a one-column ``PROB_DEFAULT`` frame of predicted probabilities.

    The values are the second column of ``model.predict_proba(X)``.
    """
    second_column = model.predict_proba(X)[:, 1]
    return pd.DataFrame(second_column, columns=["PROB_DEFAULT"])


def apply_threshold_to_probability_values(probability_values, threshold):
    """Turn ``PROB_DEFAULT`` probabilities into 0/1 predictions.

    A value strictly greater than ``threshold`` maps to 1, anything else to
    0. The result is a Series named ``PREDICT_DEFAULT_STATUS``.
    """

    def _flag_default(probability):
        return 1 if probability > threshold else 0

    flags = probability_values["PROB_DEFAULT"].apply(_flag_default)
    return flags.rename("PREDICT_DEFAULT_STATUS")


@st.cache(suppress_st_warning=True)
def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
    """Return the ROC threshold with the largest Youden's J statistic.

    The ROC curve is computed from the true labels ``y`` and the scores in
    ``clf_prediction_prob_df``; J is ``tpr - fpr`` at each candidate
    threshold.
    """
    fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
    # Index of the maximum J selects the matching threshold.
    return thresholds[argmax(tpr - fpr)]


# In[401]:


def create_cross_validation_df(
    X, y, eval_metric, seed, trees, n_folds, early_stopping_rounds
):
    """Run XGBoost cross-validation for a binary logistic objective.

    Args:
        X: Feature data wrapped into an ``xgb.DMatrix``.
        y: Labels attached to the ``DMatrix``.
        eval_metric: Evaluation metric passed to XGBoost.
        seed: Seed placed in the booster parameters.
        trees: Number of boosting rounds.
        n_folds: Number of cross-validation folds.
        early_stopping_rounds: Early-stopping patience passed to ``xgb.cv``.

    Returns:
        A list ``[dmatrix, cv_results]`` with the constructed ``DMatrix`` and
        the output of ``xgb.cv``.
    """
    dmatrix = xgb.DMatrix(X, label=y)

    booster_params = dict(
        eval_metric=eval_metric,
        objective="binary:logistic",
        seed=seed,
    )

    # Folds are shuffled before splitting.
    cv_results = xgb.cv(
        booster_params,
        dmatrix,
        num_boost_round=trees,
        nfold=n_folds,
        early_stopping_rounds=early_stopping_rounds,
        shuffle=True,
    )

    return [dmatrix, cv_results]


# In[450]:


def cross_validation_scores(model, X, y, nfold, score, seed):
    """Return the per-fold scores of ``model`` under stratified K-fold CV.

    ``X`` and ``y`` are converted to contiguous arrays (``y`` flattened to
    one dimension). Folds are shuffled with ``random_state=seed`` and scored
    with the ``score`` metric.
    """
    features = np.ascontiguousarray(X)
    targets = np.ravel(np.ascontiguousarray(y))
    folds = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
    return cross_val_score(
        model, features, targets, cv=folds, scoring=score
    )


def default_status_per_threshold(threshold_list, prob_default):
    """Return one 0/1 status Series per threshold.

    For each threshold in ``threshold_list`` (in order), every value of
    ``prob_default`` strictly greater than the threshold maps to 1 and every
    other value to 0.
    """

    def _status_at(cutoff):
        return prob_default.apply(lambda p: 1 if p > cutoff else 0)

    return [_status_at(threshold) for threshold in threshold_list]


def classification_report_per_threshold(
    threshold_list, threshold_default_status_list, y_test
):
    """Build a ``{threshold: classification report dict}`` mapping.

    A report is produced for every entry of
    ``threshold_default_status_list`` against ``y_test`` (classes labelled
    "Non-Default" and "Default", zero-division results reported as 0), then
    paired positionally with ``threshold_list``.
    """
    class_names = ["Non-Default", "Default"]
    reports = [
        classification_report(
            y_test,
            predicted_status,
            target_names=class_names,
            output_dict=True,
            zero_division=0,
        )
        for predicted_status in threshold_default_status_list
    ]
    return dict(zip(threshold_list, reports))


def thresh_classification_report_recall_accuracy(
    thresh_classification_report_dict,
):
    """Extract recalls and accuracy from per-threshold classification reports.

    Returns:
        A list of three lists, each ordered like the keys of the input
        mapping: "Default" recalls, "Non-Default" recalls, and accuracies.
    """
    reports = [
        thresh_classification_report_dict[threshold]
        for threshold in thresh_classification_report_dict
    ]
    default_recalls = [report["Default"]["recall"] for report in reports]
    nondefault_recalls = [
        report["Non-Default"]["recall"] for report in reports
    ]
    accuracies = [report["accuracy"] for report in reports]
    return [default_recalls, nondefault_recalls, accuracies]


def create_accept_rate_list(start, end, samples):
    """Return ``samples`` evenly spaced acceptance rates from ``start`` to ``end``.

    Both endpoints are included.
    """
    return np.linspace(start, end, num=samples, endpoint=True)


def create_strategyTable_df(
    start,
    end,
    samples,
    actual_probability_predicted_acc_rate,
    true,
    currency,
    loan_amount_col_name=None,
):
    """Build a strategy table across a range of acceptance rates.

    For each acceptance rate, the threshold is the matching quantile of
    ``PROB_DEFAULT`` (rounded to 3 decimals). Loans whose probability is not
    above the threshold count as accepted. The bad rate is the mean of the
    ``true`` column over accepted loans (rounded to 3 decimals). The
    estimated value is ``accepted * average amount`` minus twice that figure
    multiplied by the bad rate.

    The input frame is only read; it is not modified.

    Args:
        start: First acceptance rate.
        end: Last acceptance rate (included).
        samples: Number of acceptance rates to evaluate.
        actual_probability_predicted_acc_rate: Frame with a ``PROB_DEFAULT``
            column and the ``true`` column.
        true: Name of the column holding the true default status.
        currency: Label inserted into the estimated-value column name.
        loan_amount_col_name: Optional name of the column holding loan
            amounts, used for the average amount per loan. When omitted the
            historical behaviour is kept: the average is taken over the
            ``true`` column.
            NOTE(review): the historical default looks unintended (it
            averages the default status, not a monetary amount); callers
            should confirm and pass the loan amount column.

    Returns:
        A DataFrame sorted by "Acceptance Rate" (descending) with columns
        "Acceptance Rate", "Threshold", "Bad Rate", "Num Accepted Loans" and
        "Estimated Value (<currency>)". All columns except
        "Num Accepted Loans" hold strings formatted to two decimals.
    """
    scores = actual_probability_predicted_acc_rate
    prob_default = scores["PROB_DEFAULT"]

    amount_column = true if loan_amount_col_name is None else loan_amount_col_name
    avg_loan_amount = scores[amount_column].mean()

    rows = []
    for rate in create_accept_rate_list(start, end, samples):
        # Threshold for this acceptance rate, computed once per rate.
        threshold = np.quantile(prob_default, rate).round(3)

        # Accepted loans are those not predicted to default. The mask is
        # built directly instead of rewriting a column on the caller's frame.
        accepted_true_status = scores.loc[~(prob_default > threshold), true]
        num_accepted = len(accepted_true_status)

        bad_rate = np.sum(accepted_true_status / num_accepted).round(3)

        money_accepted = num_accepted * avg_loan_amount
        money_bad_accepted = 2 * money_accepted * bad_rate
        estimated_value = money_accepted - money_bad_accepted

        rows.append(
            (
                "{:.2f}".format(rate),
                "{:.2f}".format(threshold),
                "{:.2f}".format(bad_rate),
                num_accepted,
                "{:.2f}".format(estimated_value),
            )
        )

    return (
        pd.DataFrame(
            rows,
            columns=[
                "Acceptance Rate",
                "Threshold",
                "Bad Rate",
                "Num Accepted Loans",
                f"Estimated Value ({currency})",
            ],
        )
        .sort_values(by="Acceptance Rate", axis=0, ascending=False)
        .reset_index(drop=True)
    )