Paul Kiage committed on
Commit
7d861ad
β€’
1 Parent(s): 9af3e2b

Hugging Face Deployment Setup (#11)

Browse files

* refactor for hugging face space deployment

* docs: HF space branch

Files changed (42) hide show
  1. .github/workflows/check_file_size.yml +16 -0
  2. .github/workflows/sync_to_hf_hub.yml +20 -0
  3. Dockerfile +20 -0
  4. Procfile +0 -1
  5. README.md +22 -58
  6. src/app.py → app.py +27 -29
  7. {src → common}/__init__.py +0 -0
  8. src/features/util_build_features.py → common/data.py +2 -93
  9. common/util.py +391 -0
  10. common/views.py +361 -0
  11. src/features/build_features.py → data_setup.py +42 -15
  12. requirements.txt +0 -0
  13. setup.py +0 -10
  14. setup.sh +0 -13
  15. src/__main__.py +0 -0
  16. src/models/__init__.py +0 -0
  17. src/models/logistic_model.py +0 -33
  18. src/models/logistic_predict_model.py +0 -4
  19. src/models/logistic_test_model.py +0 -4
  20. src/models/logistic_train_model.py +0 -69
  21. src/models/util_predict_model.py +0 -87
  22. src/models/util_predict_model_threshold.py +0 -310
  23. src/models/xgboost_model.py +0 -33
  24. src/models/xgboost_predict_model.py +0 -4
  25. src/models/xgboost_test_model.py +0 -4
  26. src/models/xgboost_train_model.py +0 -68
  27. src/visualization/__init__.py +0 -0
  28. src/visualization/graphs_decision_tree.py +0 -23
  29. src/visualization/graphs_download.py +0 -17
  30. src/visualization/graphs_logistic.py +0 -12
  31. src/visualization/graphs_settings.py +0 -28
  32. src/visualization/graphs_test.py +0 -78
  33. src/visualization/graphs_threshold.py +0 -80
  34. src/visualization/metrics.py +0 -132
  35. {src/features → views}/__init__.py +0 -0
  36. views/decision_tree.py +70 -0
  37. src/models/util_test.py → views/evaluation.py +11 -169
  38. views/logistic.py +119 -0
  39. src/models/util_model_comparison.py → views/model_comparison.py +9 -14
  40. src/models/util_strategy_table.py → views/strategy_table.py +4 -4
  41. views/threshold.py +272 -0
  42. src/models/util_model_class.py → views/typing.py +1 -1
.github/workflows/check_file_size.yml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Check file size
2
+ on: # or directly `on: [push]` to run the action on every push on any branch
3
+ pull_request:
4
+ branches: [main]
5
+
6
+ # to run this workflow manually from the Actions tab
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ sync-to-hub:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - name: Check large files
14
+ uses: ActionsDesk/lfs-warning@v2.0
15
+ with:
16
+ filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
.github/workflows/sync_to_hf_hub.yml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to Hugging Face hub
2
+ on:
3
+ push:
4
+ branches: [main]
5
+
6
+ # to run this workflow manually from the Actions tab
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ sync-to-hub:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v3
14
+ with:
15
+ fetch-depth: 0
16
+ lfs: true
17
+ - name: Push to hub
18
+ env:
19
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
20
+ run: git push https://pkiage:$HF_TOKEN@huggingface.co/spaces/pkiage/credit_risk_modeling_demo main
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
+ # you will also find guides on how best to write your Dockerfile
3
+
4
+ FROM python:3.9
5
+
6
+ RUN apt update
7
+
8
+ RUN apt install -y graphviz
9
+
10
+ WORKDIR /code
11
+
12
+ COPY ./requirements.txt /code/requirements.txt
13
+
14
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
15
+
16
+ COPY . .
17
+
18
+ CMD ["streamlit", "run", "app.py", "--server.address", "0.0.0.0"]
19
+
20
+ # CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
Procfile DELETED
@@ -1 +0,0 @@
1
- web: sh setup.sh && streamlit run src/app.py
 
 
README.md CHANGED
@@ -1,3 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
1
  # Credit Risk Modelling
2
 
3
  # About
@@ -72,68 +83,29 @@ pip install -r requirements.txt
72
 
73
  https://graphviz.org/download/
74
 
75
- ## Build and install local package
76
-
77
- ```shell
78
- python setup.py build
79
- ```
80
-
81
- ```shell
82
- python setup.py install
83
- ```
84
 
85
  ### Run the streamlit app (app.py) by running the following in terminal (from repository root folder):
86
 
87
  ```shell
88
- streamlit run src/app.py
89
  ```
90
 
91
  ## Deployed setup details
92
 
93
- For faster model building and testing (particularly XGBoost) a local setup or on a more powerful server than free heroku dyno type is recommended. ([tutorials on servers for data science & ML](https://course.fast.ai))
94
-
95
- ⚠️⚠️⚠️
96
-
97
- ***UPDATE: In [Heroku’s Next Chapter](https://blog.heroku.com/next-chapter) free dynos will be removed starting [November 28, 2022](https://help.heroku.com/RSBRUH58/removal-of-heroku-free-product-plans-faq)***
98
-
99
- *[Hosting Streamlit app would require](https://discuss.streamlit.io/t/can-i-host-streamlit-on-now-sh-vercel/3189) a Platform as a service (PaaS) since [Streamlit apps aren't static thus can't run on static web host](https://discuss.streamlit.io/t/hosting-streamlit-on-github-pages/356/2).*
100
-
101
- *Viable alternatives include paid services such as AWS, Azure, GCP, DigitalOcean, Heroku, [Replit](https://replit.com/heroku) paid version (due to Repl Resources used) etc.*
102
-
103
- *Platforms such as Github Pages, Netifly, & Vercel currenty mostly require the app to [output a static website](https://answers.netlify.com/t/how-to-run-streamlit-hello-on-netlify/11899/2) since most of those services will not run Python ([or any server process](https://answers.netlify.com/t/support-guide-can-i-run-a-web-server-http-listener-and-or-database-at-netlify/3078)) at browse time. Netifly for instance is designed for the [Jamstack](https://jamstack.org/) that doesn't depend on a "web server". Vercel on the other hand requires either a [`handler` that inherits from the `BaseHTTPRequestHandler` class or an app that exposes a WSGI or ASGI Application](https://vercel.com/docs/runtimes#advanced-usage/advanced-python-usage) - [Tornado](https://www.tornadoweb.org/en/stable/index.html?highlight=wsgi#threads-and-wsgi) a [dependency of Streamlit](https://openbase.com/python/streamlit/dependencies) is [currently not compatible with WSGI](https://www.reddit.com/r/learnpython/comments/grmjfo/comment/fs4elmx/).*
104
-
105
- Currently hosted on [Streamlit Community Cloud](https://blog.streamlit.io/host-your-streamlit-app-for-free/)
106
-
107
- ⚠️⚠️⚠️
108
-
109
- [Free Heroku dyno type](https://devcenter.heroku.com/articles/dyno-types) was used to deploy the app
110
 
111
- Memory (RAM): 512 MB
112
-
113
- CPU Share: 1x
114
-
115
- Compute: 1x-4x
116
-
117
- Dedicated: no
118
-
119
- Sleeps: yes
120
-
121
- [Enabled Autodeploy from Github](https://devcenter.heroku.com/articles/github-integration) if want to [manually deploy to Heroku](https://devcenter.heroku.com/articles/git#deploy-your-code) the steps are as follows:
122
-
123
- From main branch:
124
- ```shell
125
- heroku login
126
-
127
- git push heroku main
128
- ```
129
-
130
- From branch beside main:
131
 
132
  ```shell
133
- heroku login
134
 
135
- git push heroku branch_name:main
136
  ```
 
 
137
 
138
  # Roadmap
139
 
@@ -222,12 +194,4 @@ code2flow src/models/util_model_comparison.py -o docs/call-graph/util_model_comp
222
 
223
  [A Gentle Introduction to Threshold-Moving for Imbalanced Classification](https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/)
224
 
225
- - Selecting optimal threshold using Youden's J statistic
226
-
227
- [Cookiecutter Data Science](https://drivendata.github.io/cookiecutter-data-science/)
228
-
229
- - Project structure
230
-
231
- [GraphViz Buildpack](https://github.com/weibeld/heroku-buildpack-graphviz)
232
-
233
- - Buildpack used for Heroku deployment
 
1
+ ---
2
+ title: Credit Risk Modeling
3
+ emoji: 📈
4
+ colorFrom: indigo
5
+ colorTo: blue
6
+ sdk: docker
7
+ app_port: 8501
8
+ pinned: false
9
+ license: openrail
10
+ ---
11
+
12
  # Credit Risk Modelling
13
 
14
  # About
 
83
 
84
  https://graphviz.org/download/
85
 
 
 
 
 
 
 
 
 
 
86
 
87
  ### Run the streamlit app (app.py) by running the following in terminal (from repository root folder):
88
 
89
  ```shell
90
+ streamlit run app.py
91
  ```
92
 
93
  ## Deployed setup details
94
 
95
+ **Hugging Face Space Deployment Tips**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
+ Initial Setup
98
+ - When writing the [Spaces configuration](https://huggingface.co/docs/hub/spaces-config-reference) in the README front matter, check the build logs to find the port the app listens on and set the [Docker Space](https://huggingface.co/docs/hub/spaces-sdks-docker) `app_port` to it
99
+ - In the Dockerfile, bind Streamlit to an address, e.g. 0.0.0.0 (`--server.address 0.0.0.0`)
100
+ - [Install Graphviz on Debian](https://installati.one/debian/11/graphviz/) in a Docker Space rather than using a Streamlit Space to solve the ```failed to execute posixpath('dot'), make sure the graphviz executables are on your systems' path``` error, since a Streamlit Space does not provide terminal access
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  ```shell
103
+ git remote add space https://huggingface.co/spaces/pkiage/credit_risk_modeling_demo
104
 
105
+ git push --force space main
106
  ```
107
+ - [When syncing with Hugging Face via Github Actions](https://huggingface.co/docs/hub/spaces-github-actions) the [User Access Token](https://huggingface.co/docs/hub/security-tokens) created on Hugging Face (HF) should have write access
108
+ - Run the Space from the main branch, since running from [other branches currently isn't supported](https://discuss.huggingface.co/t/is-it-possible-to-run-apps-off-of-non-main-branches-in-a-space/18086)
109
 
110
  # Roadmap
111
 
 
194
 
195
  [A Gentle Introduction to Threshold-Moving for Imbalanced Classification](https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/)
196
 
197
+ - Selecting optimal threshold using Youden's J statistic
 
 
 
 
 
 
 
 
src/app.py → app.py RENAMED
@@ -1,24 +1,17 @@
1
- import streamlit as st
2
  from typing import OrderedDict
3
-
4
-
5
- from features.build_features import initialise_data
6
-
7
- from models.xgboost_model import xgboost_class
8
- from models.logistic_model import logistic_class
9
-
10
- from models.util_model_comparison import model_comparison_view
11
-
12
- from models.util_strategy_table import strategy_table_view
13
 
14
 
15
  def main():
16
-
17
- st.write("Source code: https://github.com/pkiage/tool-credit-risk-modelling")
18
  currency_options = ["USD", "KES", "GBP"]
19
 
20
- model_options = ["XGBoost", "Logistic"]
21
-
22
  currency = st.sidebar.selectbox(
23
  label="What currency will you be using?", options=currency_options
24
  )
@@ -31,25 +24,30 @@ def main():
31
 
32
  st.title("Modelling")
33
 
 
 
 
34
  models_selected_list = st.sidebar.multiselect(
35
  label="Select model", options=model_options, default=model_options
36
  )
37
 
38
  models_selected_set = set(models_selected_list)
39
-
40
- model_classes = OrderedDict()
41
-
42
- if "Logistic" in models_selected_set:
43
- logistic_model_class = logistic_class(split_dataset, currency)
44
- model_classes["Logistic"] = logistic_model_class
45
-
46
- if "XGBoost" in models_selected_set:
47
- xgboost_model_class = xgboost_class(split_dataset, currency)
48
- model_classes["XGBoost"] = xgboost_model_class
49
-
50
- model_comparison_view(split_dataset, model_classes)
51
-
52
- strategy_table_view(currency, model_classes)
 
 
53
 
54
 
55
  if __name__ == "__main__":
 
 
1
  from typing import OrderedDict
2
+ import streamlit as st
3
+ from data_setup import initialise_data
4
+ from views.decision_tree import decisiontree_view
5
+ from views.logistic import logistic_view
6
+ from views.model_comparison import model_comparison_view
7
+ from views.strategy_table import strategy_table_view
8
+ import os
9
+ os.environ["PATH"] += os.pathsep + 'C:\Program Files (x86)\Graphviz0.19.1/bin/'
 
 
10
 
11
 
12
  def main():
 
 
13
  currency_options = ["USD", "KES", "GBP"]
14
 
 
 
15
  currency = st.sidebar.selectbox(
16
  label="What currency will you be using?", options=currency_options
17
  )
 
24
 
25
  st.title("Modelling")
26
 
27
+ model_options = ["Logistic Regression", "Decision Trees"]
28
+
29
+ # Returns list
30
  models_selected_list = st.sidebar.multiselect(
31
  label="Select model", options=model_options, default=model_options
32
  )
33
 
34
  models_selected_set = set(models_selected_list)
35
+ model_views = OrderedDict()
36
+
37
+ if "Logistic Regression" in models_selected_set:
38
+ logistic_model_view = logistic_view(split_dataset, currency)
39
+ model_views["Logistic Regression"] = logistic_model_view
40
+
41
+ if "Decision Trees" in models_selected_set:
42
+ decision_tree_model_view = decisiontree_view(split_dataset, currency)
43
+ model_views["Decision Trees"] = decision_tree_model_view
44
+
45
+ if models_selected_list:
46
+ model_comparison_view(
47
+ split_dataset,
48
+ model_views,
49
+ )
50
+ strategy_table_view(currency, model_views)
51
 
52
 
53
  if __name__ == "__main__":
{src → common}/__init__.py RENAMED
File without changes
src/features/util_build_features.py → common/data.py RENAMED
@@ -1,13 +1,10 @@
1
- import streamlit as st
2
-
3
  from typing import List, Union, cast
4
-
5
  from dataclasses import dataclass
6
-
7
  from sklearn.model_selection import train_test_split
8
-
9
  import pandas as pd
10
 
 
 
11
 
12
  @dataclass
13
  class SplitDataset:
@@ -95,91 +92,3 @@ class Dataset:
95
  y_train=cast(pd.Series, y_train),
96
  y_test=cast(pd.Series, y_test),
97
  )
98
-
99
-
100
- def drop_columns(df, columns):
101
- return df.drop(columns, axis=1)
102
-
103
-
104
- def remove_less_than_0_columns(df, column):
105
- df[column].dropna()
106
- return df.loc[(df[column] != 0).any(1)]
107
-
108
-
109
- def boolean_int_condition_label(df, label_column_name, condition):
110
- df[label_column_name] = condition
111
- y = df[label_column_name].astype(int)
112
- df = drop_columns(df, label_column_name)
113
- return y, df
114
-
115
-
116
- @st.cache(suppress_st_warning=True)
117
- def undersample_training_data(
118
- df: pd.DataFrame, column_name: str, split_dataset
119
- ):
120
- count_nondefault, count_default = split_dataset.X_y_train[
121
- column_name
122
- ].value_counts()
123
-
124
- nondefaults = df[df[column_name] == 0] # 0
125
-
126
- defaults = df[df[column_name] == 1]
127
-
128
- under_sample = min(count_nondefault, count_default)
129
-
130
- nondefaults_under = nondefaults.sample(under_sample)
131
-
132
- defaults_under = defaults.sample(under_sample)
133
-
134
- X_y_train_under = pd.concat(
135
- [
136
- nondefaults_under.reset_index(drop=True),
137
- defaults_under.reset_index(drop=True),
138
- ],
139
- axis=0,
140
- )
141
-
142
- X_train_under = X_y_train_under.drop([column_name], axis=1) # remove label
143
-
144
- y_train_under = X_y_train_under[column_name] # label only
145
-
146
- class_balance_default = X_y_train_under[column_name].value_counts()
147
-
148
- return [
149
- X_train_under,
150
- y_train_under,
151
- X_y_train_under,
152
- class_balance_default,
153
- ]
154
-
155
-
156
- def select_predictors(dataset):
157
- st.header("Predictors")
158
-
159
- possible_columns = dataset.x_values_column_names
160
-
161
- selected_columns = st.sidebar.multiselect(
162
- label="Select Predictors",
163
- options=possible_columns,
164
- default=possible_columns,
165
- )
166
- return dataset.x_values_filtered_columns(selected_columns)
167
-
168
-
169
- def import_data():
170
- if "input_data_frame" not in st.session_state:
171
- st.session_state.input_data_frame = pd.read_csv(
172
- r"./data/processed/cr_loan_w2.csv"
173
- )
174
- if "dataset" not in st.session_state:
175
- df = cast(pd.DataFrame, st.session_state.input_data_frame)
176
- dataset = Dataset(
177
- df=df,
178
- random_state=123235,
179
- test_size=40,
180
- )
181
- st.session_state.dataset = dataset
182
- else:
183
- dataset = st.session_state.dataset
184
-
185
- return dataset
 
 
 
1
  from typing import List, Union, cast
 
2
  from dataclasses import dataclass
 
3
  from sklearn.model_selection import train_test_split
 
4
  import pandas as pd
5
 
6
+ from common.util import drop_columns
7
+
8
 
9
  @dataclass
10
  class SplitDataset:
 
92
  y_train=cast(pd.Series, y_train),
93
  y_test=cast(pd.Series, y_test),
94
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
common/util.py ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DATA MANIPULATION & ANALYSIS
2
+
3
+ import pickle
4
+ import streamlit as st
5
+
6
+ # Arrays
7
+ import numpy as np
8
+
9
+ # DataFrames and Series
10
+ import pandas as pd
11
+
12
+ # Returns the indices of the maximum values along an axis
13
+ from numpy import argmax
14
+
15
+ # MODELLING
16
+
17
+ # Logistic regression
18
+ from sklearn.linear_model import LogisticRegression
19
+
20
+ from sklearn.model_selection import StratifiedKFold
21
+
22
+ # XGBoosted Decision Trees
23
+ import xgboost as xgb
24
+
25
+
26
+ # REPORTING, EVALUATION, AND INTERPRETATION
27
+
28
+ # Classification report
29
+ from sklearn.metrics import classification_report
30
+
31
+ # Receiver Operating Characteristic (ROC) curve
32
+ from sklearn.metrics import roc_curve
33
+
34
+
35
+ # Evaluate a score by cross-validation
36
+ from sklearn.model_selection import cross_val_score
37
+
38
+
39
+ # # Functions
40
+
41
+
42
+ def drop_columns(df, columns):
43
+ return df.drop(columns, axis=1)
44
+
45
+
46
+ def remove_less_than_0_columns(df, column):
47
+ df[column].dropna()
48
+ return df.loc[(df[column] != 0).any(1)]
49
+
50
+
51
+ def boolean_int_condition_label(df, label_column_name, condition):
52
+ df[label_column_name] = condition
53
+ y = df[label_column_name].astype(int)
54
+ df = drop_columns(df, label_column_name)
55
+ return y, df
56
+
57
+
58
+ @st.cache(suppress_st_warning=True)
59
+ def undersample_training_data(
60
+ df: pd.DataFrame, column_name: str, split_dataset
61
+ ):
62
+ count_nondefault, count_default = split_dataset.X_y_train[
63
+ column_name
64
+ ].value_counts()
65
+
66
+ nondefaults = df[df[column_name] == 0] # 0
67
+
68
+ defaults = df[df[column_name] == 1]
69
+
70
+ under_sample = min(count_nondefault, count_default)
71
+
72
+ nondefaults_under = nondefaults.sample(under_sample)
73
+
74
+ defaults_under = defaults.sample(under_sample)
75
+
76
+ X_y_train_under = pd.concat(
77
+ [
78
+ nondefaults_under.reset_index(drop=True),
79
+ defaults_under.reset_index(drop=True),
80
+ ],
81
+ axis=0,
82
+ )
83
+
84
+ X_train_under = X_y_train_under.drop([column_name], axis=1) # remove label
85
+
86
+ y_train_under = X_y_train_under[column_name] # label only
87
+
88
+ class_balance_default = X_y_train_under[column_name].value_counts()
89
+
90
+ return [
91
+ X_train_under,
92
+ y_train_under,
93
+ X_y_train_under,
94
+ class_balance_default,
95
+ ]
96
+
97
+
98
+ def create_coeffient_feature_dictionary_logistic_model(
99
+ logistic_model, training_data
100
+ ):
101
+ return {
102
+ feat: coef
103
+ for coef, feat in zip(
104
+ logistic_model.coef_[0, :], training_data.columns
105
+ )
106
+ }
107
+
108
+
109
+ @st.cache(suppress_st_warning=True)
110
+ def test_variables_logistic(X_train, y_train):
111
+ # Create and fit the logistic regression model
112
+ return LogisticRegression(solver="lbfgs").fit(X_train, np.ravel(y_train))
113
+
114
+
115
+ @st.cache(suppress_st_warning=True)
116
+ def print_coeff_logistic(clf_logistic_model, split_dataset):
117
+ # Dictionary of features and their coefficients
118
+ return create_coeffient_feature_dictionary_logistic_model(
119
+ clf_logistic_model, split_dataset.X_train
120
+ )
121
+
122
+
123
+ @st.cache(suppress_st_warning=True, hash_funcs={
124
+ xgb.XGBClassifier: pickle.dumps
125
+ })
126
+ def test_variables_gbt(X_train, y_train):
127
+ # Using hyperparameters learning_rate and max_depth
128
+ return xgb.XGBClassifier(
129
+ learning_rate=0.1,
130
+ max_depth=7,
131
+ use_label_encoder=False,
132
+ eval_metric="logloss",
133
+ ).fit(X_train, np.ravel(y_train), eval_metric="logloss")
134
+
135
+
136
+ # In[398]:
137
+
138
+
139
+ def get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
140
+ model, X, y, threshold, loan_amount_col_name
141
+ ):
142
+ true_status = y.to_frame()
143
+
144
+ loan_amount = X[loan_amount_col_name]
145
+
146
+ clf_prediction_prob = model.predict_proba(np.ascontiguousarray(X))
147
+
148
+ clf_prediction_prob_df = pd.DataFrame(
149
+ clf_prediction_prob[:, 1], columns=["PROB_DEFAULT"]
150
+ )
151
+
152
+ clf_thresh_predicted_default_status = (
153
+ clf_prediction_prob_df["PROB_DEFAULT"]
154
+ .apply(lambda x: 1 if x > threshold else 0)
155
+ .rename("PREDICT_DEFAULT_STATUS")
156
+ )
157
+
158
+ return pd.concat(
159
+ [
160
+ true_status.reset_index(drop=True),
161
+ clf_prediction_prob_df.reset_index(drop=True),
162
+ clf_thresh_predicted_default_status.reset_index(drop=True),
163
+ loan_amount.reset_index(drop=True),
164
+ ],
165
+ axis=1,
166
+ )
167
+
168
+
169
+ def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
170
+ fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
171
+ # get the best threshold
172
+ # Youden’s J statistic tpr-fpr
173
+ # Argmax to get the index in
174
+ # thresholds
175
+ return thresholds[argmax(tpr - fpr)]
176
+
177
+
178
+ # In[399]:
179
+
180
+
181
+ # Function that makes dataframe with probability of default, predicted default status based on threshold
182
+ # and actual default status
183
+
184
+
185
+ def model_probability_values_df(model, X):
186
+ return pd.DataFrame(model.predict_proba(X)[:, 1], columns=["PROB_DEFAULT"])
187
+
188
+
189
+ def apply_threshold_to_probability_values(probability_values, threshold):
190
+ return (
191
+ probability_values["PROB_DEFAULT"]
192
+ .apply(lambda x: 1 if x > threshold else 0)
193
+ .rename("PREDICT_DEFAULT_STATUS")
194
+ )
195
+
196
+
197
+ @st.cache(suppress_st_warning=True)
198
+ def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
199
+ fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
200
+ # get the best threshold
201
+ J = tpr - fpr # Youden’s J statistic
202
+ ix = argmax(J)
203
+ return thresholds[ix]
204
+
205
+
206
+ # In[401]:
207
+
208
+
209
+ def create_cross_validation_df(
210
+ X, y, eval_metric, seed, trees, n_folds, early_stopping_rounds
211
+ ):
212
+ # Test data x and y
213
+ DTrain = xgb.DMatrix(X, label=y)
214
+
215
+ # auc or logloss
216
+ params = {
217
+ "eval_metric": eval_metric,
218
+ "objective": "binary:logistic", # logistic say 0 or 1 for loan status
219
+ "seed": seed,
220
+ }
221
+
222
+ # Create the data frame of cross validations
223
+ cv_df = xgb.cv(
224
+ params,
225
+ DTrain,
226
+ num_boost_round=trees,
227
+ nfold=n_folds,
228
+ early_stopping_rounds=early_stopping_rounds,
229
+ shuffle=True,
230
+ )
231
+
232
+ return [DTrain, cv_df]
233
+
234
+
235
+ # In[450]:
236
+
237
+
238
+ def cross_validation_scores(model, X, y, nfold, score, seed):
239
+ # return cv scores of metric
240
+ return cross_val_score(
241
+ model,
242
+ np.ascontiguousarray(X),
243
+ np.ravel(np.ascontiguousarray(y)),
244
+ cv=StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed),
245
+ scoring=score,
246
+ )
247
+
248
+
249
+ def default_status_per_threshold(threshold_list, prob_default):
250
+ threshold_default_status_list = []
251
+ for threshold in threshold_list:
252
+ threshold_default_status = prob_default.apply(
253
+ lambda x: 1 if x > threshold else 0
254
+ )
255
+ threshold_default_status_list.append(threshold_default_status)
256
+ return threshold_default_status_list
257
+
258
+
259
+ def classification_report_per_threshold(
260
+ threshold_list, threshold_default_status_list, y_test
261
+ ):
262
+ target_names = ["Non-Default", "Default"]
263
+ classification_report_list = []
264
+ for threshold_default_status in threshold_default_status_list:
265
+ thresh_classification_report = classification_report(
266
+ y_test,
267
+ threshold_default_status,
268
+ target_names=target_names,
269
+ output_dict=True,
270
+ zero_division=0,
271
+ )
272
+ classification_report_list.append(thresh_classification_report)
273
+ # Return threshold classification report dict
274
+ return dict(zip(threshold_list, classification_report_list))
275
+
276
+
277
+ def thresh_classification_report_recall_accuracy(
278
+ thresh_classification_report_dict,
279
+ ):
280
+ thresh_def_recalls_list = []
281
+ thresh_nondef_recalls_list = []
282
+ thresh_accs_list = []
283
+ for x in [*thresh_classification_report_dict]:
284
+ thresh_def_recall = thresh_classification_report_dict[x]["Default"][
285
+ "recall"
286
+ ]
287
+ thresh_def_recalls_list.append(thresh_def_recall)
288
+ thresh_nondef_recall = thresh_classification_report_dict[x][
289
+ "Non-Default"
290
+ ]["recall"]
291
+ thresh_nondef_recalls_list.append(thresh_nondef_recall)
292
+ thresh_accs = thresh_classification_report_dict[x]["accuracy"]
293
+ thresh_accs_list.append(thresh_accs)
294
+ return [
295
+ thresh_def_recalls_list,
296
+ thresh_nondef_recalls_list,
297
+ thresh_accs_list,
298
+ ]
299
+
300
+
301
+ def create_accept_rate_list(start, end, samples):
302
+ return np.linspace(start, end, samples, endpoint=True)
303
+
304
+
305
+ def create_strategyTable_df(
306
+ start, end, samples, actual_probability_predicted_acc_rate, true, currency
307
+ ):
308
+ accept_rates = create_accept_rate_list(start, end, samples)
309
+ thresholds_strat = []
310
+ bad_rates_start = []
311
+ Avg_Loan_Amnt = actual_probability_predicted_acc_rate[true].mean()
312
+ num_accepted_loans_start = []
313
+
314
+ for rate in accept_rates:
315
+ # Calculate the threshold for the acceptance rate
316
+ thresh = np.quantile(
317
+ actual_probability_predicted_acc_rate["PROB_DEFAULT"], rate
318
+ ).round(3)
319
+ # Add the threshold value to the list of thresholds
320
+ thresholds_strat.append(
321
+ np.quantile(
322
+ actual_probability_predicted_acc_rate["PROB_DEFAULT"], rate
323
+ ).round(3)
324
+ )
325
+
326
+ # Reassign the loan_status value using the threshold
327
+ actual_probability_predicted_acc_rate[
328
+ "PREDICT_DEFAULT_STATUS"
329
+ ] = actual_probability_predicted_acc_rate["PROB_DEFAULT"].apply(
330
+ lambda x: 1 if x > thresh else 0
331
+ )
332
+
333
+ # Create a set of accepted loans using this acceptance rate
334
+ accepted_loans = actual_probability_predicted_acc_rate[
335
+ actual_probability_predicted_acc_rate["PREDICT_DEFAULT_STATUS"]
336
+ == 0
337
+ ]
338
+ # Calculate and append the bad rate using the acceptance rate
339
+ bad_rates_start.append(
340
+ np.sum((accepted_loans[true]) / len(accepted_loans[true])).round(3)
341
+ )
342
+ # Accepted loans
343
+ num_accepted_loans_start.append(len(accepted_loans))
344
+
345
+ # Calculate estimated value
346
+ money_accepted_loans = [
347
+ accepted_loans * Avg_Loan_Amnt
348
+ for accepted_loans in num_accepted_loans_start
349
+ ]
350
+
351
+ money_bad_accepted_loans = [
352
+ 2 * money_accepted_loan * bad_rate
353
+ for money_accepted_loan, bad_rate in zip(
354
+ money_accepted_loans, bad_rates_start
355
+ )
356
+ ]
357
+
358
+ zip_object = zip(money_accepted_loans, money_bad_accepted_loans)
359
+ estimated_value = [
360
+ money_accepted_loan - money_bad_accepted_loan
361
+ for money_accepted_loan, money_bad_accepted_loan in zip_object
362
+ ]
363
+
364
+ accept_rates = ["{:.2f}".format(elem) for elem in accept_rates]
365
+
366
+ thresholds_strat = ["{:.2f}".format(elem) for elem in thresholds_strat]
367
+
368
+ bad_rates_start = ["{:.2f}".format(elem) for elem in bad_rates_start]
369
+
370
+ estimated_value = ["{:.2f}".format(elem) for elem in estimated_value]
371
+
372
+ return (
373
+ pd.DataFrame(
374
+ zip(
375
+ accept_rates,
376
+ thresholds_strat,
377
+ bad_rates_start,
378
+ num_accepted_loans_start,
379
+ estimated_value,
380
+ ),
381
+ columns=[
382
+ "Acceptance Rate",
383
+ "Threshold",
384
+ "Bad Rate",
385
+ "Num Accepted Loans",
386
+ f"Estimated Value ({currency})",
387
+ ],
388
+ )
389
+ .sort_values(by="Acceptance Rate", axis=0, ascending=False)
390
+ .reset_index(drop=True)
391
+ )
common/views.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import OrderedDict
2
+ import streamlit as st # works on command prompt
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import pandas as pd
6
+ import xgboost as xgb
7
+ from sklearn.metrics import (
8
+ roc_curve,
9
+ )
10
+ from sklearn.calibration import calibration_curve
11
+ from xgboost import plot_tree
12
+ from views.typing import ModelView
13
+
14
+
15
def plot_logistic_coeff_barh(coef_dict, x, y):
    """Draw a horizontal bar chart of logistic-regression coefficients.

    Bars are drawn in ascending order of coefficient value.

    Args:
        coef_dict: Mapping of feature name to coefficient value.
        x: Figure width in inches.
        y: Figure height in inches.

    Returns:
        The matplotlib figure the bars were drawn on. When ``coef_dict`` is
        empty the figure is returned without any bars.
    """
    fig = plt.figure(figsize=(x, y))
    ordered = sorted(coef_dict.items(), key=lambda item: item[1])
    # Guard the empty case: unpacking zip(*[]) would call plt.barh() with no
    # positional arguments and raise TypeError.
    if ordered:
        features, values = zip(*ordered)
        plt.barh(features, values)
    return fig
22
+
23
+
24
def print_negative_coefficients_logistic_model(coef_dict):
    """Write the non-positive logistic-regression coefficients to the page.

    Args:
        coef_dict: Mapping of feature name to coefficient value.

    The selected ``(feature, coefficient)`` pairs are rendered with
    ``st.markdown``, most negative coefficient first. Nothing is returned.
    """
    # Equal to or less than 0, sorted ascending so the most negative is first.
    negative_features = sorted(
        (item for item in coef_dict.items() if item[1] <= 0.0),
        key=lambda item: item[1],
    )
    text = (
        "\n\nFeatures the model found to be negatively correlated with probability of default are:"
        "\n{negative_features}:"
    )
    # Only the summary is rendered; the former extra markdown calls printed the
    # Python type and a raw dict_items view, which were debugging leftovers.
    st.markdown(text.format(negative_features=negative_features))
40
+
41
+
42
def print_positive_coefficients_logistic_model(coef_dict):
    """Write the non-negative logistic-regression coefficients to the page.

    Args:
        coef_dict: Mapping of feature name to coefficient value.

    The ``(feature, coefficient)`` pairs with a value of zero or more are
    rendered with ``st.markdown``, largest coefficient first.
    """
    # Equal to or greater than 0
    positive_pairs = [pair for pair in coef_dict.items() if pair[1] >= 0.0]
    # Descending order: the largest coefficient leads the list.
    positive_pairs.sort(key=lambda pair: pair[1], reverse=True)

    template = (
        "\n\nFeatures the model found to be positively correlated with probability of default are:"
        "\n{positive_features}:"
    )
    message = template.format(positive_features=positive_pairs)
    st.markdown(message)
56
+
57
+
58
def plot_importance_gbt(clf_gbt_model, barxsize, barysize):
    """Plot feature importance for a gradient boosted tree model.

    Args:
        clf_gbt_model: Model handed to ``xgb.plot_importance``.
        barxsize: Figure width in inches.
        barysize: Figure height in inches.

    Returns:
        The figure owning the importance plot, resized to the given size.
    """
    importance_axes = xgb.plot_importance(
        clf_gbt_model, importance_type="weight"
    )
    st.write("Feature Importance Plot (Gradient Boosted Tree)")
    # plot_importance hands back an axes object; the caller wants its figure.
    figure = importance_axes.figure
    figure.set_size_inches(barxsize, barysize)
    return figure
64
+
65
+
66
def download_importance_gbt(fig1, barxsize, barysize):
    """Offer a button that saves the feature-importance figure as ``bar.png``.

    Args:
        fig1: Figure holding the feature-importance plot.
        barxsize: Figure width in inches.
        barysize: Figure height in inches.

    The file is written to the working directory only when the button is
    pressed. Nothing is returned.
    """
    # Apply the requested size before saving so the written file reflects it.
    fig1.set_size_inches(barxsize, barysize)
    if st.button(
        "Download Feature Importance Plot as png (Gradient Boosted Tree)"
    ):
        dpisize = max(barxsize, barysize)
        # Save the figure that was passed in: plt.savefig would write whichever
        # figure pyplot currently considers active, which may be another chart.
        fig1.savefig("bar.png", dpi=dpisize * 96, bbox_inches="tight")
73
+
74
+
75
def plot_tree_gbt(treexsize, treeysize, clf_gbt_model):
    """Plot a decision tree of a gradient boosted model.

    Args:
        treexsize: Figure width in inches.
        treeysize: Figure height in inches.
        clf_gbt_model: Model handed to xgboost's ``plot_tree``.

    Returns:
        The current pyplot figure after drawing, resized to the given size.
    """
    plot_tree(clf_gbt_model)
    # The tree is drawn through pyplot, so pick up the active figure.
    tree_figure = plt.gcf()
    tree_figure.set_size_inches(treexsize, treeysize)
    return tree_figure
80
+
81
+
82
def download_tree_gbt(treexsize, treeysize):
    """Offer a button that saves the current pyplot figure as ``tree.png``.

    Args:
        treexsize: Figure width in inches.
        treeysize: Figure height in inches.

    The larger of the two sizes, multiplied by 96, is used as the output dpi.
    """
    pressed = st.button(
        "Download Decision Tree Plot as png (Gradient Boosted Tree)"
    )
    if not pressed:
        return
    # NOTE(review): saves whichever figure pyplot has active; presumably the
    # tree plot drawn just before this call - confirm with the caller.
    largest_side = max(treexsize, treeysize)
    plt.savefig("tree.png", dpi=largest_side * 96, bbox_inches="tight")
86
+
87
+
88
def cross_validation_graph(cv, eval_metric, trees):
    """Plot the third column of cross-validation results per iteration.

    Args:
        cv: Table of cross-validation results; its third column is plotted.
            Presumably the test-set mean of the metric - TODO confirm.
        eval_metric: Metric name shown in the title and y-axis label.
        trees: Iteration count shown in the title.

    Returns:
        The matplotlib figure holding the line plot.
    """
    figure = plt.figure()

    # Plot the test scores for each iteration
    score_column = cv.columns[2]
    plt.plot(cv[score_column])

    title_text = "Test {eval_metric} Score Over {it_numbr} Iterations".format(
        eval_metric=eval_metric, it_numbr=trees
    )
    y_label = "Test {eval_metric} Score".format(eval_metric=eval_metric)

    plt.title(title_text)
    plt.xlabel("Iteration Number")
    plt.ylabel(y_label)
    return figure
101
+
102
+
103
def recall_accuracy_threshold_tradeoff_fig(
    widthsize,
    heightsize,
    threshold_list,
    thresh_def_recalls_list,
    thresh_nondef_recalls_list,
    thresh_accs_list,
):
    """Plot default recall, non-default recall and accuracy per threshold.

    Args:
        widthsize: Figure width in inches.
        heightsize: Figure height in inches.
        threshold_list: Probability thresholds used as the x values.
        thresh_def_recalls_list: Default-class recall for each threshold.
        thresh_nondef_recalls_list: Non-default-class recall per threshold.
        thresh_accs_list: Model accuracy for each threshold.

    Returns:
        The matplotlib figure with the three curves, both axes fixed to 0-1.
    """
    figure = plt.figure(figsize=(widthsize, heightsize))

    # One curve per score series, drawn in legend order.
    curves = (
        (thresh_def_recalls_list, "Default Recall"),
        (thresh_nondef_recalls_list, "Non-Default Recall"),
        (thresh_accs_list, "Model Accuracy"),
    )
    for scores, curve_label in curves:
        plt.plot(threshold_list, scores, label=curve_label)

    plt.xlabel("Probability Threshold")
    plt.ylabel("Score")
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.legend()
    plt.title("Recall and Accuracy Score Tradeoff with Probability Threshold")
    plt.grid(False)
    return figure
125
+
126
+
127
def roc_auc_compare_n_models(y, model_views: OrderedDict[str, ModelView]):
    """Plot the ROC curves of several models on one chart.

    Args:
        y: True labels passed to ``roc_curve``.
        model_views: Ordered mapping of model name to its view; each view's
            ``prediction_probability_df`` supplies the scores for the curve.

    Returns:
        The matplotlib figure with one curve per model plus the diagonal
        "Random Prediction" reference line.
    """
    # The first two entries keep the original colours; the palette is reused
    # cyclically so a third or later model no longer raises IndexError.
    colors = ["blue", "green", "orange", "purple", "brown", "teal"]
    fig = plt.figure()
    for color_idx, (model_name, model_view) in enumerate(model_views.items()):
        fpr, tpr, _thresholds = roc_curve(
            y, model_view.prediction_probability_df
        )
        plt.plot(
            fpr,
            tpr,
            color=colors[color_idx % len(colors)],
            label=f"{model_name}",
        )
    plt.plot([0, 1], [0, 1], linestyle="--", label="Random Prediction")

    # Build a readable list of names for the title: "A", "A and B",
    # "A, B and C", or "None" when no model was supplied.
    model_names = list(model_views.keys())
    if not model_names:
        model_name_str = "None"
    elif len(model_names) == 1:
        model_name_str = model_names[0]
    else:
        model_name_str = " and ".join(
            [", ".join(model_names[:-1]), model_names[-1]]
        )
    plt.title(f"ROC Chart for {model_name_str} on the Probability of Default")
    plt.xlabel("False Positive Rate (FP Rate)")
    plt.ylabel("True Positive Rate (TP Rate)")
    plt.legend()
    plt.grid(False)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    return fig
153
+
154
+
155
def calibration_curve_report_commented_n(
    y, model_views: OrderedDict[str, ModelView], bins: int
):
    """Plot calibration curves for several models on one chart.

    Args:
        y: True labels passed to ``calibration_curve``.
        model_views: Ordered mapping of model name to its view; each view's
            ``prediction_probability_df`` supplies the predicted values.
        bins: Number of bins used by ``calibration_curve``.

    Returns:
        The matplotlib figure with one curve per model and the diagonal
        "Perfectly calibrated" guideline.
    """
    figure = plt.figure()

    for name, view in model_views.items():
        # NOTE(review): newer scikit-learn releases deprecate the `normalize`
        # argument - confirm against the pinned version before upgrading.
        curve = calibration_curve(
            y,
            view.prediction_probability_df,
            n_bins=bins,
            normalize=True,
        )
        fraction_positive, mean_predicted = curve
        plt.plot(mean_predicted, fraction_positive, "s-", label=f"{name}")

    # Create the calibration curve plot with the guideline
    plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    plt.ylabel("Fraction of positives")
    plt.xlabel("Average Predicted Probability")
    plt.title("Calibration Curve")
    plt.legend()
    plt.grid(False)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    return figure
179
+
180
+
181
def acceptance_rate_threshold_fig(probability_default, acceptancerate, bins):
    """Plot the predicted-probability histogram with the acceptance cut-off.

    Args:
        probability_default: Predicted probabilities of default; must offer
            ``describe()`` (presumably a pandas Series - confirm with caller).
        acceptancerate: Quantile passed to ``np.quantile`` to obtain the
            threshold.
        bins: Histogram bin specification passed to ``plt.hist``.

    Returns:
        A tuple ``(fig, probability_stat_distribution, acc_rate_thresh)``: the
        figure, the result of ``describe()``, and the threshold value.
    """
    # Probability distribution
    probability_stat_distribution = probability_default.describe()

    # Acceptance rate threshold
    acc_rate_thresh = np.quantile(probability_default, acceptancerate)
    fig = plt.figure()

    plt.hist(
        probability_default,
        color="blue",
        bins=bins,
        histtype="bar",
        ec="white",
    )

    # Add a reference line to the plot for the threshold
    plt.axvline(x=acc_rate_thresh, color="red")
    # Title spelling corrected (was "Thershold").
    plt.title("Acceptance Rate Threshold")

    return (
        fig,
        probability_stat_distribution,
        acc_rate_thresh,
    )
206
+
207
+
208
def streamlit_2columns_metrics_pct_df(
    column1name_label: str,
    column2name_label: str,
    df: pd.DataFrame,
):
    """Show the share of 1-values and 0-values as two side-by-side metrics.

    Args:
        column1name_label: Label of the left metric (share of value ``1``).
        column2name_label: Label of the right metric (share of value ``0``).
        df: Data whose ``value_counts()`` is inspected; the share is the
            count divided by ``df.shape[0]``.

    A value that never occurs is shown as ``0%`` instead of raising, and empty
    input shows ``0%`` for both metrics.
    """
    counts = df.value_counts()
    total = df.shape[0]

    def share_of(value) -> str:
        # .get(value, 0): an absent class would otherwise yield None and make
        # the division raise TypeError; a zero total would divide by zero.
        fraction = counts.get(value, 0) / total if total else 0.0
        return "{:.0%}".format(fraction)

    (
        column1name,
        column2name,
    ) = st.columns(2)

    with column1name:
        st.metric(
            label=column1name_label,
            value=share_of(1),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value=share_of(0),
            delta=None,
            delta_color="normal",
        )
233
+
234
+
235
def streamlit_2columns_metrics_df(
    column1name_label: str,
    column2name_label: str,
    df: pd.DataFrame,
):
    """Show the counts of 1-values and 0-values as two side-by-side metrics.

    Args:
        column1name_label: Label of the left metric (count of value ``1``).
        column2name_label: Label of the right metric (count of value ``0``).
        df: Data whose ``value_counts()`` supplies the two counts.
    """
    left_panel, right_panel = st.columns(2)
    counts = df.value_counts()

    # (panel, label, counted value) for the left and right metric.
    layout = (
        (left_panel, column1name_label, 1),
        (right_panel, column2name_label, 0),
    )
    for panel, metric_label, counted_value in layout:
        with panel:
            st.metric(
                label=metric_label,
                value=counts.get(counted_value),
                delta=None,
                delta_color="normal",
            )
260
+
261
+
262
def streamlit_2columns_metrics_df_shape(df: pd.DataFrame):
    """Show a dataframe's row and column counts as two side-by-side metrics.

    Args:
        df: Dataframe whose ``shape`` is displayed.
    """
    rows_panel, columns_panel = st.columns(2)

    # shape[0] is the row count, shape[1] the column count.
    layout = (
        (rows_panel, "Rows", 0),
        (columns_panel, "Columns", 1),
    )
    for panel, metric_label, axis in layout:
        with panel:
            st.metric(
                label=metric_label,
                value=df.shape[axis],
                delta=None,
                delta_color="normal",
            )
283
+
284
+
285
def streamlit_2columns_metrics_pct_series(
    column1name_label: str,
    column2name_label: str,
    series: pd.Series,
):
    """Show the share of the ``1`` and ``0`` entries of a counts series.

    Args:
        column1name_label: Label of the left metric (entry ``1``).
        column2name_label: Label of the right metric (entry ``0``).
        series: Series of counts; each share is the entry divided by
            ``series.sum()``.

    An entry that is missing is shown as ``0%`` instead of raising, and a
    series summing to zero shows ``0%`` for both metrics.
    """
    total = series.sum()

    def share_of(key) -> str:
        # .get(key, 0): a missing entry would otherwise yield None and make the
        # division raise TypeError; a zero total would produce a NaN/inf share.
        fraction = series.get(key, 0) / total if total else 0.0
        return "{:.0%}".format(fraction)

    (
        column1name,
        column2name,
    ) = st.columns(2)
    with column1name:
        st.metric(
            label=column1name_label,
            value=share_of(1),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value=share_of(0),
            delta=None,
            delta_color="normal",
        )
309
+
310
+
311
def streamlit_2columns_metrics_series(
    column1name_label: str,
    column2name_label: str,
    series: pd.Series,
):
    """Show the ``1`` and ``0`` entries of a series as side-by-side metrics.

    Args:
        column1name_label: Label of the left metric (entry ``1``).
        column2name_label: Label of the right metric (entry ``0``).
        series: Series looked up with ``series.get(1)`` and ``series.get(0)``.
    """
    left_panel, right_panel = st.columns(2)

    # (panel, label, looked-up key) for the left and right metric.
    layout = (
        (left_panel, column1name_label, 1),
        (right_panel, column2name_label, 0),
    )
    for panel, metric_label, key in layout:
        with panel:
            st.metric(
                label=metric_label,
                value=series.get(key),
                delta=None,
                delta_color="normal",
            )
335
+
336
+
337
def streamlit_chart_setting_height_width(
    title: str,
    default_widthvalue: int,
    default_heightvalue: int,
    widthkey: str,
    heightkey: str,
):
    """Render an expander with width and height inputs for a chart.

    Args:
        title: Expander title.
        default_widthvalue: Initial value of the width input.
        default_heightvalue: Initial value of the height input.
        widthkey: Streamlit widget key of the width input.
        heightkey: Streamlit widget key of the height input.

    Returns:
        A tuple ``(width_size, height_size)`` with the values currently held
        by the two number inputs.
    """
    with st.expander(title):
        width_panel, height_panel = st.columns(2)

        # Width input on the left, height input on the right.
        with width_panel:
            chosen_width = st.number_input(
                label="Width in inches:",
                value=default_widthvalue,
                key=widthkey,
            )

        with height_panel:
            chosen_height = st.number_input(
                label="Height in inches:",
                value=default_heightvalue,
                key=heightkey,
            )

    return chosen_width, chosen_height
src/features/build_features.py β†’ data_setup.py RENAMED
@@ -1,19 +1,13 @@
1
- from typing import List, Union, cast, Tuple
2
- from dataclasses import dataclass
3
- from sklearn.model_selection import train_test_split
4
- import pandas as pd
5
 
 
6
  import streamlit as st
7
 
8
-
9
- from features.util_build_features import (
10
- Dataset,
11
- SplitDataset,
12
  undersample_training_data,
13
- select_predictors,
14
- import_data)
15
-
16
- from visualization.metrics import (
17
  streamlit_2columns_metrics_df_shape,
18
  streamlit_2columns_metrics_series,
19
  streamlit_2columns_metrics_pct_series,
@@ -22,9 +16,22 @@ from visualization.metrics import (
22
  )
23
 
24
 
 
25
  def initialise_data() -> Tuple[Dataset, SplitDataset]:
26
-
27
- dataset = import_data()
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  st.write(
30
  "Assuming data is already cleaned and relevant features (predictors) added."
@@ -34,12 +41,31 @@ def initialise_data() -> Tuple[Dataset, SplitDataset]:
34
  st.dataframe(dataset.df)
35
  streamlit_2columns_metrics_df_shape(dataset.df)
36
 
37
- selected_x_values = select_predictors(dataset)
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  with st.expander("Predictors Dataframe (X)"):
40
  st.dataframe(selected_x_values)
41
  streamlit_2columns_metrics_df_shape(selected_x_values)
42
 
 
 
 
43
  st.header("Split Testing and Training Data")
44
 
45
  test_size_slider_col, seed_col = st.columns(2)
@@ -62,6 +88,7 @@ def initialise_data() -> Tuple[Dataset, SplitDataset]:
62
 
63
  split_dataset = dataset.train_test_split(selected_x_values)
64
 
 
65
  true_status = split_dataset.y_test.to_frame().value_counts()
66
 
67
  st.sidebar.metric(
 
1
+ from typing import Tuple, cast
 
 
 
2
 
3
+ import pandas as pd
4
  import streamlit as st
5
 
6
+ from common.data import Dataset, SplitDataset
7
+ from common.util import (
 
 
8
  undersample_training_data,
9
+ )
10
+ from common.views import (
 
 
11
  streamlit_2columns_metrics_df_shape,
12
  streamlit_2columns_metrics_series,
13
  streamlit_2columns_metrics_pct_series,
 
16
  )
17
 
18
 
19
+ # Initialize dataframe session state
20
  def initialise_data() -> Tuple[Dataset, SplitDataset]:
21
+ if "input_data_frame" not in st.session_state:
22
+ st.session_state.input_data_frame = pd.read_csv(
23
+ r"./data/processed/cr_loan_w2.csv"
24
+ )
25
+ if "dataset" not in st.session_state:
26
+ df = cast(pd.DataFrame, st.session_state.input_data_frame)
27
+ dataset = Dataset(
28
+ df=df,
29
+ random_state=123235,
30
+ test_size=40,
31
+ )
32
+ st.session_state.dataset = dataset
33
+ else:
34
+ dataset = st.session_state.dataset
35
 
36
  st.write(
37
  "Assuming data is already cleaned and relevant features (predictors) added."
 
41
  st.dataframe(dataset.df)
42
  streamlit_2columns_metrics_df_shape(dataset.df)
43
 
44
+ st.header("Predictors")
45
 
46
+ possible_columns = dataset.x_values_column_names
47
+
48
+ selected_columns = st.sidebar.multiselect(
49
+ label="Select Predictors",
50
+ options=possible_columns,
51
+ default=possible_columns,
52
+ )
53
+
54
+ selected_x_values = dataset.x_values_filtered_columns(selected_columns)
55
+
56
+ st.sidebar.metric(
57
+ label="# of Predictors Selected",
58
+ value=selected_x_values.shape[1],
59
+ delta=None,
60
+ delta_color="normal",
61
+ )
62
  with st.expander("Predictors Dataframe (X)"):
63
  st.dataframe(selected_x_values)
64
  streamlit_2columns_metrics_df_shape(selected_x_values)
65
 
66
+ # 40% of data used for training
67
+ # 14321 as random seed for reproducability
68
+
69
  st.header("Split Testing and Training Data")
70
 
71
  test_size_slider_col, seed_col = st.columns(2)
 
88
 
89
  split_dataset = dataset.train_test_split(selected_x_values)
90
 
91
+ # Series
92
  true_status = split_dataset.y_test.to_frame().value_counts()
93
 
94
  st.sidebar.metric(
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
setup.py DELETED
@@ -1,10 +0,0 @@
1
- from setuptools import find_packages, setup
2
-
3
- setup(
4
- name='src',
5
- packages=find_packages(),
6
- version='0.1.0',
7
- description='Tool for credit risk modelling',
8
- author='Author',
9
- license='MIT',
10
- )
 
 
 
 
 
 
 
 
 
 
 
setup.sh DELETED
@@ -1,13 +0,0 @@
1
- mkdir -p ~/.streamlit/
2
-
3
- cat << EOF > ~/.streamlit/credentials.toml
4
- [general]
5
- email = "paul.r.kiage@gmail.com"
6
- EOF
7
-
8
- cat << EOF > ~/.streamlit/config.toml
9
- [server]
10
- headless = true
11
- enableCORS = true
12
- port = $PORT
13
- EOF
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/__main__.py DELETED
File without changes
src/models/__init__.py DELETED
File without changes
src/models/logistic_model.py DELETED
@@ -1,33 +0,0 @@
1
- from features.build_features import SplitDataset
2
-
3
- from models.logistic_train_model import logistic_train_model
4
- from models.logistic_predict_model import logistic_predict_model
5
- from models.logistic_test_model import logistic_test_model
6
-
7
- from models.util_model_class import ModelClass
8
-
9
-
10
- def logistic_class(split_dataset: SplitDataset, currency: str) -> ModelClass:
11
-
12
- # Train Model
13
- clf_logistic_model = logistic_train_model(split_dataset)
14
-
15
- # Predict using Trained Model
16
- clf_logistic_predictions = logistic_predict_model(
17
- clf_logistic_model, split_dataset)
18
-
19
- # Test and Evaluate Model
20
- df_trueStatus_probabilityDefault_threshStatus_loanAmount_logistic = logistic_test_model(
21
- clf_logistic_model,
22
- split_dataset,
23
- currency,
24
- clf_logistic_predictions.probability_threshold_selected,
25
- clf_logistic_predictions.predicted_default_status)
26
-
27
- return ModelClass(
28
- model=clf_logistic_model,
29
- trueStatus_probabilityDefault_threshStatus_loanAmount_df=df_trueStatus_probabilityDefault_threshStatus_loanAmount_logistic,
30
- probability_threshold_selected=clf_logistic_predictions.probability_threshold_selected,
31
- predicted_default_status=clf_logistic_predictions.predicted_default_status,
32
- prediction_probability_df=clf_logistic_predictions.prediction_probability_df,
33
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/models/logistic_predict_model.py DELETED
@@ -1,4 +0,0 @@
1
- from models.util_predict_model import make_prediction_view
2
-
3
- logistic_predict_model = make_prediction_view(
4
- "Logistic", "Logisitic Model")
 
 
 
 
 
src/models/logistic_test_model.py DELETED
@@ -1,4 +0,0 @@
1
- from models.util_test import make_tests_view
2
-
3
- logistic_test_model = make_tests_view(
4
- "Logistic", "Logistic Model")
 
 
 
 
 
src/models/logistic_train_model.py DELETED
@@ -1,69 +0,0 @@
1
-
2
- import numpy as np
3
- from sklearn.linear_model import LogisticRegression
4
- from features.build_features import SplitDataset
5
- import streamlit as st
6
- import pandas as pd
7
-
8
- from visualization.graphs_logistic import plot_logistic_coeff_barh
9
-
10
-
11
- @st.cache(suppress_st_warning=True)
12
- def create_clf_logistic_model(X_train, y_train):
13
- # Create and fit the logistic regression model
14
- return LogisticRegression(solver="lbfgs").fit(X_train, np.ravel(y_train))
15
-
16
-
17
- @st.cache(suppress_st_warning=True)
18
- def create_coeff_dict_logistic_model(
19
- logistic_model, training_data
20
- ):
21
- return {
22
- feat: coef
23
- for coef, feat in zip(
24
- logistic_model.coef_[0, :], training_data.columns
25
- )
26
- }
27
-
28
-
29
- def coeff_dict_to_sorted_df(coef_dict):
30
- coef_dict_sorted = dict(
31
- sorted(coef_dict.items(), key=lambda item: item[1], reverse=False)
32
- )
33
-
34
- data_items = coef_dict_sorted.items()
35
- data_list = list(data_items)
36
-
37
- return pd.DataFrame(data_list, columns=["Coefficient", "Value"])
38
-
39
-
40
- def interpret_clf_logistic_model(clf_logistic_model, split_dataset):
41
- st.metric(
42
- label="# of Coefficients in Logistic Regression",
43
- value=clf_logistic_model.n_features_in_,
44
- delta=None,
45
- delta_color="normal",
46
- )
47
-
48
- st.subheader("Logistic Regression Coefficient Values")
49
-
50
- coef_dict = create_coeff_dict_logistic_model(
51
- clf_logistic_model, split_dataset.X_y_train)
52
-
53
- df = coeff_dict_to_sorted_df(coef_dict)
54
-
55
- fig = plot_logistic_coeff_barh(df)
56
-
57
- st.plotly_chart(fig)
58
-
59
-
60
- def logistic_train_model(split_dataset: SplitDataset):
61
- st.header("Logistic Regression Model")
62
-
63
- clf_logistic_model = create_clf_logistic_model(
64
- split_dataset.X_train, split_dataset.y_train
65
- )
66
-
67
- interpret_clf_logistic_model(clf_logistic_model, split_dataset)
68
-
69
- return clf_logistic_model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/models/util_predict_model.py DELETED
@@ -1,87 +0,0 @@
1
- from typing import Union, cast
2
- from sklearn.linear_model import LogisticRegression
3
-
4
-
5
- import pandas as pd
6
-
7
- from dataclasses import dataclass
8
-
9
- from xgboost import XGBClassifier
10
- from features.util_build_features import SplitDataset
11
-
12
- from models.util_predict_model_threshold import (
13
- user_defined_probability_threshold,
14
- J_statistic_driven_probability_threshold,
15
- tradeoff_threshold,
16
- acceptance_rate_driven_threshold,
17
- select_probability_threshold,
18
- model_probability_values_df)
19
-
20
- import streamlit as st
21
-
22
-
23
- def probability_threshold_explainer(model_name):
24
- st.write(
25
- f"""
26
- The {model_name} model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n
27
- Probabilities of defaulting of the loans are compared to a probability threshold.\n
28
- A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
29
- """
30
- )
31
-
32
-
33
- @dataclass(frozen=True)
34
- class Threshold:
35
- probability_threshold_selected: float
36
- predicted_default_status: pd.Series
37
- prediction_probability_df: pd.DataFrame
38
-
39
-
40
- def make_prediction_view(
41
- model_name_short: str,
42
- model_name: str,
43
- ):
44
- def view(
45
- clf_xgbt_model: Union[XGBClassifier, LogisticRegression],
46
- split_dataset: SplitDataset,
47
- ) -> Threshold:
48
-
49
- probability_threshold_explainer(model_name)
50
-
51
- clf_prediction_prob_df_gbt = model_probability_values_df(
52
- clf_xgbt_model,
53
- split_dataset.X_test,
54
- )
55
-
56
- (clf_thresh_predicted_default_status_user_gbt,
57
- user_threshold
58
- ) = user_defined_probability_threshold(
59
- model_name_short, clf_xgbt_model, split_dataset)
60
-
61
- (clf_thresh_predicted_default_status_Jstatistic_gbt,
62
- J_statistic_best_threshold) = J_statistic_driven_probability_threshold(
63
- clf_prediction_prob_df_gbt, clf_xgbt_model, split_dataset)
64
-
65
- tradeoff_threshold(clf_prediction_prob_df_gbt, split_dataset)
66
-
67
- (acc_rate_thresh_gbt,
68
- clf_thresh_predicted_default_status_acceptance_gbt) = acceptance_rate_driven_threshold(model_name_short, clf_prediction_prob_df_gbt)
69
-
70
- (prob_thresh_selected_gbt,
71
- predicted_default_status_gbt) = select_probability_threshold(model_name_short,
72
- user_threshold,
73
- clf_thresh_predicted_default_status_user_gbt,
74
- J_statistic_best_threshold,
75
- clf_thresh_predicted_default_status_Jstatistic_gbt,
76
- acc_rate_thresh_gbt,
77
- clf_thresh_predicted_default_status_acceptance_gbt)
78
-
79
- return Threshold(
80
- probability_threshold_selected=cast(
81
- float, prob_thresh_selected_gbt
82
- ),
83
- predicted_default_status=predicted_default_status_gbt,
84
- prediction_probability_df=clf_prediction_prob_df_gbt,
85
- )
86
-
87
- return view
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/models/util_predict_model_threshold.py DELETED
@@ -1,310 +0,0 @@
1
- import streamlit as st
2
-
3
- from sklearn.metrics import classification_report, roc_curve
4
-
5
- import numpy as np
6
-
7
- import plotly.express as px
8
-
9
- import pandas as pd
10
-
11
- from numpy import argmax
12
-
13
- from visualization.metrics import streamlit_2columns_metrics_df, streamlit_2columns_metrics_pct_df
14
-
15
- from visualization.graphs_threshold import acceptance_rate_driven_threshold_graph
16
-
17
-
18
- def model_probability_values_df(model, X):
19
- return pd.DataFrame(model.predict_proba(X)[:, 1], columns=["PROB_DEFAULT"])
20
-
21
-
22
- def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
23
- fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
24
- # get the best threshold
25
- # Youden’s J statistic tpr-fpr
26
- # Argmax to get the index in
27
- # thresholds
28
- return thresholds[argmax(tpr - fpr)]
29
-
30
- # Function that makes dataframe with probability of default, predicted default status based on threshold
31
- # and actual default status
32
-
33
-
34
- def classification_report_per_threshold(
35
- threshold_list, threshold_default_status_list, y_test
36
- ):
37
- target_names = ["Non-Default", "Default"]
38
- classification_report_list = []
39
- for threshold_default_status in threshold_default_status_list:
40
- thresh_classification_report = classification_report(
41
- y_test,
42
- threshold_default_status,
43
- target_names=target_names,
44
- output_dict=True,
45
- zero_division=0,
46
- )
47
- classification_report_list.append(thresh_classification_report)
48
- # Return threshold classification report dict
49
- return dict(zip(threshold_list, classification_report_list))
50
-
51
-
52
- def thresh_classification_report_recall_accuracy(
53
- thresh_classification_report_dict,
54
- ):
55
- thresh_def_recalls_list = []
56
- thresh_nondef_recalls_list = []
57
- thresh_accs_list = []
58
- for x in [*thresh_classification_report_dict]:
59
- thresh_def_recall = thresh_classification_report_dict[x]["Default"][
60
- "recall"
61
- ]
62
- thresh_def_recalls_list.append(thresh_def_recall)
63
- thresh_nondef_recall = thresh_classification_report_dict[x][
64
- "Non-Default"
65
- ]["recall"]
66
- thresh_nondef_recalls_list.append(thresh_nondef_recall)
67
- thresh_accs = thresh_classification_report_dict[x]["accuracy"]
68
- thresh_accs_list.append(thresh_accs)
69
- return [
70
- thresh_def_recalls_list,
71
- thresh_nondef_recalls_list,
72
- thresh_accs_list,
73
- ]
74
-
75
-
76
- def apply_threshold_to_probability_values(probability_values, threshold):
77
- return (
78
- probability_values["PROB_DEFAULT"]
79
- .apply(lambda x: 1 if x > threshold else 0)
80
- .rename("PREDICT_DEFAULT_STATUS")
81
- )
82
-
83
-
84
- @st.cache(suppress_st_warning=True)
85
- def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
86
- fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
87
- # get the best threshold
88
- J = tpr - fpr # Youden’s J statistic
89
- ix = argmax(J)
90
- return thresholds[ix]
91
-
92
-
93
- def default_status_per_threshold(threshold_list, prob_default):
94
- threshold_default_status_list = []
95
- for threshold in threshold_list:
96
- threshold_default_status = prob_default.apply(
97
- lambda x: 1 if x > threshold else 0
98
- )
99
- threshold_default_status_list.append(threshold_default_status)
100
- return threshold_default_status_list
101
-
102
-
103
- def threshold_and_predictions(clf_xgbt_model, split_dataset, threshold):
104
-
105
- clf_prediction_prob_df_gbt = model_probability_values_df(
106
- clf_xgbt_model,
107
- split_dataset.X_test,
108
- )
109
- clf_thresh_predicted_default_status = (
110
- apply_threshold_to_probability_values(
111
- clf_prediction_prob_df_gbt,
112
- threshold,
113
- )
114
- )
115
-
116
- streamlit_2columns_metrics_df(
117
- "# of Predicted Defaults",
118
- "# of Predicted Non-Default",
119
- clf_thresh_predicted_default_status,
120
- )
121
-
122
- streamlit_2columns_metrics_pct_df(
123
- "% of Loans Predicted to Default",
124
- "% of Loans Predicted not to Default",
125
- clf_thresh_predicted_default_status,
126
- )
127
-
128
- return clf_thresh_predicted_default_status
129
-
130
-
131
- def user_defined_probability_threshold(model_name_short, clf_xgbt_model, split_dataset):
132
- st.subheader("Classification Probability Threshold - User Defined")
133
-
134
- user_defined_threshold = st.slider(
135
- label="Default Probability Threshold:",
136
- min_value=0.0,
137
- max_value=1.0,
138
- value=0.8,
139
- key=f"threshold_{model_name_short}_default",
140
- )
141
-
142
- clf_thresh_predicted_default_status = threshold_and_predictions(
143
- clf_xgbt_model, split_dataset, user_defined_threshold)
144
-
145
- return clf_thresh_predicted_default_status, user_defined_threshold
146
-
147
-
148
- def J_statistic_driven_probability_threshold(clf_prediction_prob_df_gbt, clf_xgbt_model, split_dataset):
149
- st.subheader("J Statistic Driven Classification Probability Threshold")
150
-
151
- J_statistic_best_threshold = find_best_threshold_J_statistic(
152
- split_dataset.y_test, clf_prediction_prob_df_gbt
153
- )
154
- st.metric(
155
- label="Youden's J statistic calculated best threshold",
156
- value=J_statistic_best_threshold,
157
- )
158
-
159
- clf_thresh_predicted_default_status = threshold_and_predictions(
160
- clf_xgbt_model, split_dataset, J_statistic_best_threshold)
161
-
162
- return clf_thresh_predicted_default_status, J_statistic_best_threshold
163
-
164
-
165
- def create_tradeoff_graph(df):
166
- fig2 = px.line(
167
- data_frame=df,
168
- y=["Default Recall", "Non Default Recall", "Accuracy"],
169
- x="Threshold",
170
- )
171
-
172
- fig2.update_layout(
173
- title="Recall and Accuracy score Trade-off with Probability Threshold",
174
- xaxis_title="Probability Threshold",
175
- yaxis_title="Score",
176
- )
177
- fig2.update_yaxes(range=[0.0, 1.0])
178
-
179
- st.plotly_chart(fig2)
180
-
181
-
182
- def tradeoff_threshold(clf_prediction_prob_df_gbt, split_dataset):
183
- st.subheader(
184
- "Recall and Accuracy Tradeoff with given Probability Threshold"
185
- )
186
-
187
- threshold_list = np.arange(
188
- 0, 1, 0.025).round(decimals=3).tolist()
189
-
190
- threshold_default_status_list = default_status_per_threshold(
191
- threshold_list, clf_prediction_prob_df_gbt["PROB_DEFAULT"]
192
- )
193
- thresh_classification_report_dict = (
194
- classification_report_per_threshold(
195
- threshold_list,
196
- threshold_default_status_list,
197
- split_dataset.y_test,
198
- )
199
- )
200
-
201
- (
202
- thresh_def_recalls_list,
203
- thresh_nondef_recalls_list,
204
- thresh_accs_list,
205
- ) = thresh_classification_report_recall_accuracy(
206
- thresh_classification_report_dict
207
- )
208
-
209
- namelist = [
210
- "Default Recall",
211
- "Non Default Recall",
212
- "Accuracy",
213
- "Threshold",
214
- ]
215
-
216
- df = pd.DataFrame(
217
- [
218
- thresh_def_recalls_list,
219
- thresh_nondef_recalls_list,
220
- thresh_accs_list,
221
- threshold_list,
222
- ],
223
- index=namelist,
224
- )
225
-
226
- df = df.T
227
-
228
- create_tradeoff_graph(df)
229
-
230
-
231
- def select_probability_threshold(model_name_short,
232
- user_defined_threshold,
233
- clf_thresh_predicted_default_status_user_gbt,
234
- J_statistic_best_threshold,
235
- clf_thresh_predicted_default_status_Jstatistic_gbt,
236
- acc_rate_thresh_gbt,
237
- clf_thresh_predicted_default_status_acceptance_gbt):
238
- st.subheader("Selected Probability Threshold")
239
-
240
- options = [
241
- "User Defined",
242
- "J Statistic Driven",
243
- "Acceptance Rate Driven",
244
- ]
245
- prob_thresh_option = st.radio(
246
- label="Selected Probability Threshold",
247
- options=options,
248
- key=f"{model_name_short}_radio_thresh",
249
- )
250
-
251
- if prob_thresh_option == "User Defined":
252
- prob_thresh_selected_gbt = user_defined_threshold
253
- predicted_default_status_gbt = (
254
- clf_thresh_predicted_default_status_user_gbt
255
- )
256
- elif prob_thresh_option == "J Statistic Driven":
257
- prob_thresh_selected_gbt = J_statistic_best_threshold
258
- predicted_default_status_gbt = (
259
- clf_thresh_predicted_default_status_Jstatistic_gbt
260
- )
261
- else:
262
- prob_thresh_selected_gbt = acc_rate_thresh_gbt
263
- predicted_default_status_gbt = (
264
- clf_thresh_predicted_default_status_acceptance_gbt
265
- )
266
-
267
- st.write(
268
- f"Selected probability threshold is {prob_thresh_selected_gbt}"
269
- )
270
-
271
- return prob_thresh_selected_gbt, predicted_default_status_gbt
272
-
273
-
274
- def acceptance_rate_driven_threshold(model_name_short, clf_prediction_prob_df_gbt):
275
- st.subheader("Acceptance Rate Driven Probability Threshold")
276
- # Steps
277
- # Set acceptance rate
278
- # Get default status per threshold
279
- # Get classification report per threshold
280
- # Get recall, nondef recall, and accuracy per threshold
281
-
282
- acceptance_rate = (
283
- st.slider(
284
- label="% of loans accepted (acceptance rate):",
285
- min_value=0,
286
- max_value=100,
287
- value=85,
288
- key=f"acceptance_rate_{model_name_short}",
289
- format="%f%%",
290
- )
291
- / 100
292
- )
293
-
294
- acc_rate_thresh_gbt = np.quantile(
295
- clf_prediction_prob_df_gbt["PROB_DEFAULT"], acceptance_rate
296
- )
297
-
298
- st.write(
299
- f"An acceptance rate of {acceptance_rate} results in probability threshold of {acc_rate_thresh_gbt}"
300
- )
301
-
302
- acceptance_rate_driven_threshold_graph(
303
- clf_prediction_prob_df_gbt, acc_rate_thresh_gbt)
304
-
305
- clf_thresh_predicted_default_status_acceptance_gbt = apply_threshold_to_probability_values(
306
- clf_prediction_prob_df_gbt,
307
- acc_rate_thresh_gbt,
308
- )
309
-
310
- return acc_rate_thresh_gbt, clf_thresh_predicted_default_status_acceptance_gbt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/models/xgboost_model.py DELETED
@@ -1,33 +0,0 @@
1
- from features.build_features import SplitDataset
2
-
3
- from models.xgboost_train_model import xgboost_train_model
4
- from models.xgboost_predict_model import xgboost_predit_model
5
- from models.xgboost_test_model import xgboost_test_model
6
-
7
- from models.util_model_class import ModelClass
8
-
9
-
10
- def xgboost_class(split_dataset: SplitDataset, currency: str):
11
-
12
- # Train Model
13
- clf_xgbt_model = xgboost_train_model(split_dataset)
14
-
15
- # Predit using Trained Model
16
- clf_xgbt_predictions = xgboost_predit_model(
17
- clf_xgbt_model, split_dataset)
18
-
19
- # Test and Evaluate Model
20
- df_trueStatus_probabilityDefault_threshStatus_loanAmount_xgbt = xgboost_test_model(
21
- clf_xgbt_model,
22
- split_dataset,
23
- currency,
24
- clf_xgbt_predictions.probability_threshold_selected,
25
- clf_xgbt_predictions.predicted_default_status)
26
-
27
- return ModelClass(
28
- model=clf_xgbt_model,
29
- trueStatus_probabilityDefault_threshStatus_loanAmount_df=df_trueStatus_probabilityDefault_threshStatus_loanAmount_xgbt,
30
- probability_threshold_selected=clf_xgbt_predictions.probability_threshold_selected,
31
- predicted_default_status=clf_xgbt_predictions.predicted_default_status,
32
- prediction_probability_df=clf_xgbt_predictions.prediction_probability_df,
33
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/models/xgboost_predict_model.py DELETED
@@ -1,4 +0,0 @@
1
- from models.util_predict_model import make_prediction_view
2
-
3
- xgboost_predit_model = make_prediction_view(
4
- "XGBoost", "Gradient Boosted Tree with XGBoost")
 
 
 
 
 
src/models/xgboost_test_model.py DELETED
@@ -1,4 +0,0 @@
1
- from models.util_test import make_tests_view
2
-
3
- xgboost_test_model = make_tests_view(
4
- "XGBoost", "Gradient Boosted Tree with XGBoost")
 
 
 
 
 
src/models/xgboost_train_model.py DELETED
@@ -1,68 +0,0 @@
1
- import pickle
2
-
3
- import numpy as np
4
- import xgboost as xgb
5
- from features.build_features import SplitDataset
6
- import streamlit as st
7
-
8
- from visualization.graphs_decision_tree import(plot_importance_gbt,
9
- plot_tree_gbt)
10
-
11
- from visualization.graphs_settings import streamlit_chart_setting_height_width
12
-
13
- from visualization.graphs_download import (download_importance_gbt,
14
- download_tree_gbt)
15
-
16
-
17
- @ st.cache(suppress_st_warning=True, hash_funcs={
18
- xgb.XGBClassifier: pickle.dumps
19
- })
20
- def create_clf_xgbt_model(X_train, y_train):
21
- # Using hyperparameters learning_rate and max_depth
22
- return xgb.XGBClassifier(
23
- learning_rate=0.1,
24
- max_depth=7,
25
- use_label_encoder=False,
26
- eval_metric="logloss",
27
- ).fit(X_train, np.ravel(y_train), eval_metric="logloss")
28
-
29
-
30
- def interpret_clf_xgbt_model(clf_xgbt_model):
31
- st.subheader("XGBoost Decision Tree Feature Importance")
32
-
33
- (barxsize, barysize,) = streamlit_chart_setting_height_width(
34
- "Chart Settings", 10, 15, "barxsize", "barysize"
35
- )
36
-
37
- fig1 = plot_importance_gbt(clf_xgbt_model, barxsize, barysize)
38
-
39
- st.pyplot(fig1)
40
-
41
- download_importance_gbt(fig1, barxsize, barysize)
42
-
43
- st.subheader("XGBoost Decision Tree Structure")
44
-
45
- (treexsize, treeysize,) = streamlit_chart_setting_height_width(
46
- "Chart Settings", 5, 5, "treexsize", "treeysize"
47
- )
48
-
49
- fig2 = plot_tree_gbt(treexsize, treeysize, clf_xgbt_model)
50
-
51
- st.pyplot(fig2)
52
-
53
- download_tree_gbt(treexsize, treeysize)
54
- st.markdown(
55
- "Note: The downloaded XGBoost Decision Tree plot chart in png has higher resolution than that displayed here."
56
- )
57
-
58
-
59
- def xgboost_train_model(split_dataset: SplitDataset):
60
- st.header("XGBoost Decision Trees")
61
-
62
- clf_xgbt_model = create_clf_xgbt_model(
63
- split_dataset.X_train, split_dataset.y_train
64
- )
65
-
66
- interpret_clf_xgbt_model(clf_xgbt_model)
67
-
68
- return clf_xgbt_model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/visualization/__init__.py DELETED
File without changes
src/visualization/graphs_decision_tree.py DELETED
@@ -1,23 +0,0 @@
1
-
2
- import xgboost as xgb
3
-
4
- import streamlit as st
5
-
6
- import matplotlib.pyplot as plt
7
-
8
- from xgboost import plot_tree
9
-
10
-
11
- def plot_importance_gbt(clf_xgbt_model, barxsize, barysize):
12
- axobject1 = xgb.plot_importance(clf_xgbt_model, importance_type="weight")
13
- fig1 = axobject1.figure
14
- st.write("Feature Importance Plot (Gradient Boosted Tree)")
15
- fig1.set_size_inches(barxsize, barysize)
16
- return fig1
17
-
18
-
19
- def plot_tree_gbt(treexsize, treeysize, clf_xgbt_model):
20
- plot_tree(clf_xgbt_model)
21
- fig2 = plt.gcf()
22
- fig2.set_size_inches(treexsize, treeysize)
23
- return fig2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/visualization/graphs_download.py DELETED
@@ -1,17 +0,0 @@
1
- import streamlit as st
2
- import matplotlib.pyplot as plt
3
-
4
-
5
- def download_importance_gbt(fig1, barxsize, barysize):
6
- if st.button(
7
- "Download Feature Importance Plot as png (Gradient Boosted Tree)"
8
- ):
9
- dpisize = max(barxsize, barysize)
10
- plt.savefig("bar.png", dpi=dpisize * 96, bbox_inches="tight")
11
- fig1.set_size_inches(barxsize, barysize)
12
-
13
-
14
- def download_tree_gbt(treexsize, treeysize):
15
- if st.button("Download XGBoost Decision Tree Plot as png (Gradient Boosted Tree)"):
16
- dpisize = max(treexsize, treeysize)
17
- plt.savefig("tree.png", dpi=dpisize * 96, bbox_inches="tight")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/visualization/graphs_logistic.py DELETED
@@ -1,12 +0,0 @@
1
- import plotly.express as px
2
-
3
-
4
- def plot_logistic_coeff_barh(df):
5
- fig = px.bar(data_frame=df, x="Value",
6
- y="Coefficient", orientation="h")
7
-
8
- fig.update_layout(
9
- title="Logistic Regression Coefficients",
10
- xaxis_title="Value",
11
- yaxis_title="Coefficient",)
12
- return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
src/visualization/graphs_settings.py DELETED
@@ -1,28 +0,0 @@
1
- import streamlit as st
2
-
3
-
4
- def streamlit_chart_setting_height_width(
5
- title: str,
6
- default_widthvalue: int,
7
- default_heightvalue: int,
8
- widthkey: str,
9
- heightkey: str,
10
- ):
11
- with st.expander(title):
12
-
13
- lbarx_col, lbary_col = st.columns(2)
14
-
15
- with lbarx_col:
16
- width_size = st.number_input(
17
- label="Width in inches:",
18
- value=default_widthvalue,
19
- key=widthkey,
20
- )
21
-
22
- with lbary_col:
23
- height_size = st.number_input(
24
- label="Height in inches:",
25
- value=default_heightvalue,
26
- key=heightkey,
27
- )
28
- return width_size, height_size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/visualization/graphs_test.py DELETED
@@ -1,78 +0,0 @@
1
- from matplotlib import pyplot as plt
2
-
3
- from sklearn.metrics import roc_curve
4
-
5
- from typing import OrderedDict
6
-
7
- from models.util_model_class import ModelClass
8
-
9
- from sklearn.calibration import calibration_curve
10
-
11
-
12
- def cross_validation_graph(cv, eval_metric, trees):
13
-
14
- # Plot the test AUC scores for each iteration
15
- fig = plt.figure()
16
- plt.plot(cv[cv.columns[2]])
17
- plt.title(
18
- "Test {eval_metric} Score Over {it_numbr} Iterations".format(
19
- eval_metric=eval_metric, it_numbr=trees
20
- )
21
- )
22
- plt.xlabel("Iteration Number")
23
- plt.ylabel("Test {eval_metric} Score".format(eval_metric=eval_metric))
24
- return fig
25
-
26
-
27
- def roc_auc_compare_n_models(y, model_views: OrderedDict[str, ModelClass]):
28
- colors = ["blue", "green"]
29
- fig = plt.figure()
30
- for color_idx, (model_name, model_view) in enumerate(model_views.items()):
31
- fpr, tpr, _thresholds = roc_curve(
32
- y, model_view.prediction_probability_df
33
- )
34
- plt.plot(fpr, tpr, color=colors[color_idx], label=f"{model_name}")
35
- plt.plot([0, 1], [0, 1], linestyle="--", label="Random Prediction")
36
- model_names = list(model_views.keys())
37
- if not model_names:
38
- model_name_str = "None"
39
- elif len(model_names) == 1:
40
- model_name_str = model_names[0]
41
- else:
42
- model_name_str = " and ".join(
43
- [", ".join(model_names[:-1]), model_names[-1]]
44
- )
45
- plt.title(f"ROC Chart for {model_name_str} on the Probability of Default")
46
- plt.xlabel("False Positive Rate (FP Rate)")
47
- plt.ylabel("True Positive Rate (TP Rate)")
48
- plt.legend()
49
- plt.grid(False)
50
- plt.xlim(0, 1)
51
- plt.ylim(0, 1)
52
- return fig
53
-
54
-
55
- def calibration_curve_report_commented_n(
56
- y, model_views: OrderedDict[str, ModelClass], bins: int
57
- ):
58
- fig = plt.figure()
59
- for model_name, model_view in model_views.items():
60
- frac_of_pos, mean_pred_val = calibration_curve(
61
- y,
62
- model_view.prediction_probability_df,
63
- n_bins=bins,
64
- normalize=True,
65
- )
66
- plt.plot(mean_pred_val, frac_of_pos, "s-", label=f"{model_name}")
67
-
68
- # Create the calibration curve plot with the guideline
69
- plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
70
-
71
- plt.ylabel("Fraction of positives")
72
- plt.xlabel("Average Predicted Probability")
73
- plt.title("Calibration Curve")
74
- plt.legend()
75
- plt.grid(False)
76
- plt.xlim(0, 1)
77
- plt.ylim(0, 1)
78
- return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/visualization/graphs_threshold.py DELETED
@@ -1,80 +0,0 @@
1
-
2
- import plotly.express as px
3
-
4
- import streamlit as st
5
-
6
- import matplotlib.pyplot as plt
7
-
8
- import numpy as np
9
-
10
-
11
- def acceptance_rate_driven_threshold_graph(clf_prediction_prob_df_gbt, acc_rate_thresh_gbt):
12
- figa = px.histogram(clf_prediction_prob_df_gbt["PROB_DEFAULT"])
13
-
14
- figa.update_layout(
15
- title="Acceptance Rate Threshold vs. Loans Accepted",
16
- xaxis_title="Acceptance Rate Threshold",
17
- yaxis_title="Loans Accepted",
18
- )
19
-
20
- figa.update_traces(marker_line_width=1, marker_line_color="white")
21
-
22
- figa.add_vline(
23
- x=acc_rate_thresh_gbt,
24
- line_width=3,
25
- line_dash="solid",
26
- line_color="red",
27
- )
28
-
29
- st.plotly_chart(figa)
30
-
31
-
32
- def recall_accuracy_threshold_tradeoff_fig(
33
- widthsize,
34
- heightsize,
35
- threshold_list,
36
- thresh_def_recalls_list,
37
- thresh_nondef_recalls_list,
38
- thresh_accs_list,
39
- ):
40
- fig = plt.figure(figsize=(widthsize, heightsize))
41
- plt.plot(threshold_list, thresh_def_recalls_list, label="Default Recall")
42
- plt.plot(
43
- threshold_list, thresh_nondef_recalls_list, label="Non-Default Recall"
44
- )
45
- plt.plot(threshold_list, thresh_accs_list, label="Model Accuracy")
46
- plt.xlabel("Probability Threshold")
47
- plt.ylabel("Score")
48
- plt.xlim(0, 1)
49
- plt.ylim(0, 1)
50
- plt.legend()
51
- plt.title("Recall and Accuracy Score Tradeoff with Probability Threshold")
52
- plt.grid(False)
53
- return fig
54
-
55
-
56
- def acceptance_rate_threshold_fig(probability_default, acceptancerate, bins):
57
- # Probability distribution
58
- probability_stat_distribution = probability_default.describe()
59
-
60
- # Acceptance rate threshold
61
- acc_rate_thresh = np.quantile(probability_default, acceptancerate)
62
- fig = plt.figure()
63
-
64
- plt.hist(
65
- probability_default,
66
- color="blue",
67
- bins=bins,
68
- histtype="bar",
69
- ec="white",
70
- )
71
-
72
- # Add a reference line to the plot for the threshold
73
- plt.axvline(x=acc_rate_thresh, color="red")
74
- plt.title("Acceptance Rate Thershold")
75
-
76
- return (
77
- fig,
78
- probability_stat_distribution,
79
- acc_rate_thresh,
80
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/visualization/metrics.py DELETED
@@ -1,132 +0,0 @@
1
-
2
- import pandas as pd
3
- import streamlit as st
4
-
5
-
6
- def streamlit_2columns_metrics_pct_df(
7
- column1name_label: str,
8
- column2name_label: str,
9
- df: pd.DataFrame,
10
- ):
11
- (
12
- column1name,
13
- column2name,
14
- ) = st.columns(2)
15
-
16
- with column1name:
17
- st.metric(
18
- label=column1name_label,
19
- value="{:.0%}".format(df.value_counts().get(1) / df.shape[0]),
20
- delta=None,
21
- delta_color="normal",
22
- )
23
-
24
- with column2name:
25
- st.metric(
26
- label=column2name_label,
27
- value="{:.0%}".format(df.value_counts().get(0) / df.shape[0]),
28
- delta=None,
29
- delta_color="normal",
30
- )
31
-
32
-
33
- def streamlit_2columns_metrics_df(
34
- column1name_label: str,
35
- column2name_label: str,
36
- df: pd.DataFrame,
37
- ):
38
- (
39
- column1name,
40
- column2name,
41
- ) = st.columns(2)
42
-
43
- with column1name:
44
- st.metric(
45
- label=column1name_label,
46
- value=df.value_counts().get(1),
47
- delta=None,
48
- delta_color="normal",
49
- )
50
-
51
- with column2name:
52
- st.metric(
53
- label=column2name_label,
54
- value=df.value_counts().get(0),
55
- delta=None,
56
- delta_color="normal",
57
- )
58
-
59
-
60
- def streamlit_2columns_metrics_df_shape(df: pd.DataFrame):
61
- (
62
- column1name,
63
- column2name,
64
- ) = st.columns(2)
65
-
66
- with column1name:
67
- st.metric(
68
- label="Rows",
69
- value=df.shape[0],
70
- delta=None,
71
- delta_color="normal",
72
- )
73
-
74
- with column2name:
75
- st.metric(
76
- label="Columns",
77
- value=df.shape[1],
78
- delta=None,
79
- delta_color="normal",
80
- )
81
-
82
-
83
- def streamlit_2columns_metrics_pct_series(
84
- column1name_label: str,
85
- column2name_label: str,
86
- series: pd.Series,
87
- ):
88
- (
89
- column1name,
90
- column2name,
91
- ) = st.columns(2)
92
- with column1name:
93
- st.metric(
94
- label=column1name_label,
95
- value="{:.0%}".format(series.get(1) / series.sum()),
96
- delta=None,
97
- delta_color="normal",
98
- )
99
-
100
- with column2name:
101
- st.metric(
102
- label=column2name_label,
103
- value="{:.0%}".format(series.get(0) / series.sum()),
104
- delta=None,
105
- delta_color="normal",
106
- )
107
-
108
-
109
- def streamlit_2columns_metrics_series(
110
- column1name_label: str,
111
- column2name_label: str,
112
- series: pd.Series,
113
- ):
114
- (
115
- column1name,
116
- column2name,
117
- ) = st.columns(2)
118
- with column1name:
119
- st.metric(
120
- label=column1name_label,
121
- value=series.get(1),
122
- delta=None,
123
- delta_color="normal",
124
- )
125
-
126
- with column2name:
127
- st.metric(
128
- label=column2name_label,
129
- value=series.get(0),
130
- delta=None,
131
- delta_color="normal",
132
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
{src/features → views}/__init__.py RENAMED
File without changes
views/decision_tree.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from common.data import SplitDataset
2
+ import streamlit as st
3
+ from common.util import (
4
+ test_variables_gbt,
5
+ )
6
+ from common.views import (
7
+ streamlit_chart_setting_height_width,
8
+ plot_importance_gbt,
9
+ plot_tree_gbt,
10
+ download_importance_gbt,
11
+ download_tree_gbt,
12
+ )
13
+ from views.typing import ModelView
14
+ from views.threshold import decision_tree_threshold_view
15
+ from views.evaluation import decision_tree_evaluation_view
16
+
17
+
18
+ def decisiontree_view(split_dataset: SplitDataset, currency: str):
19
+ st.header("Decision Trees")
20
+
21
+ clf_gbt_model = test_variables_gbt(
22
+ split_dataset.X_train, split_dataset.y_train
23
+ )
24
+
25
+ st.subheader("Decision Tree Feature Importance")
26
+
27
+ (barxsize, barysize,) = streamlit_chart_setting_height_width(
28
+ "Chart Settings", 10, 15, "barxsize", "barysize"
29
+ )
30
+
31
+ fig1 = plot_importance_gbt(clf_gbt_model, barxsize, barysize)
32
+
33
+ st.pyplot(fig1)
34
+
35
+ download_importance_gbt(fig1, barxsize, barysize)
36
+
37
+ st.subheader("Decision Tree Structure")
38
+
39
+ (treexsize, treeysize,) = streamlit_chart_setting_height_width(
40
+ "Chart Settings", 15, 10, "treexsize", "treeysize"
41
+ )
42
+
43
+ fig2 = plot_tree_gbt(treexsize, treeysize, clf_gbt_model)
44
+
45
+ st.pyplot(fig2)
46
+
47
+ download_tree_gbt(treexsize, treeysize)
48
+ st.markdown(
49
+ "Note: The downloaded decision tree plot chart in png has higher resolution than that displayed here."
50
+ )
51
+
52
+ threshold = decision_tree_threshold_view(clf_gbt_model, split_dataset)
53
+
54
+ df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
55
+ decision_tree_evaluation_view(
56
+ clf_gbt_model,
57
+ split_dataset,
58
+ currency,
59
+ threshold.probability_threshold_selected,
60
+ threshold.predicted_default_status,
61
+ )
62
+ )
63
+
64
+ return ModelView(
65
+ model=clf_gbt_model,
66
+ trueStatus_probabilityDefault_threshStatus_loanAmount_df=df_trueStatus_probabilityDefault_threshStatus_loanAmount,
67
+ probability_threshold_selected=threshold.probability_threshold_selected,
68
+ predicted_default_status=threshold.predicted_default_status,
69
+ prediction_probability_df=threshold.prediction_probability_df,
70
+ )
src/models/util_test.py → views/evaluation.py RENAMED
@@ -1,6 +1,5 @@
1
  from typing import Union
2
  import pandas as pd
3
- from sklearn.model_selection import StratifiedKFold, cross_val_score
4
  import streamlit as st
5
  import numpy as np
6
  from sklearn.metrics import (
@@ -8,25 +7,24 @@ from sklearn.metrics import (
8
  confusion_matrix,
9
  )
10
  from sklearn.linear_model import LogisticRegression
11
- import xgboost as xgb
12
  from xgboost.sklearn import XGBClassifier
13
- from features.util_build_features import SplitDataset
14
- """from models.model_utils import (
15
  create_cross_validation_df,
16
  cross_validation_scores,
17
  get_df_trueStatus_probabilityDefault_threshStatus_loanAmount,
18
- )"""
19
- from visualization.graphs_test import (
20
  cross_validation_graph,
21
  )
22
 
23
 
24
- def make_tests_view(
25
  model_name_short: str,
26
  model_name_generic: str,
27
  ):
28
  def view(
29
- clf_xgbt_model: Union[XGBClassifier, LogisticRegression],
30
  split_dataset: SplitDataset,
31
  currency: str,
32
  prob_thresh_selected,
@@ -42,7 +40,7 @@ def make_tests_view(
42
  train on each fold suggests performance will be stable."
43
  )
44
 
45
- st.write(f'{model_name_short} cross validation test:')
46
 
47
  stcol_seed, stcol_eval_metric = st.columns(2)
48
 
@@ -172,7 +170,7 @@ def make_tests_view(
172
  )
173
 
174
  cv_scores = cross_validation_scores(
175
- clf_xgbt_model,
176
  split_dataset.X_test,
177
  split_dataset.y_test,
178
  nfolds_score,
@@ -327,7 +325,7 @@ def make_tests_view(
327
 
328
  df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
329
  get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
330
- clf_xgbt_model,
331
  split_dataset.X_test,
332
  split_dataset.y_test,
333
  prob_thresh_selected,
@@ -408,161 +406,5 @@ def make_tests_view(
408
  return view
409
 
410
 
411
- def cross_validation_scores(model, X, y, nfold, score, seed):
412
- # return cv scores of metric
413
- return cross_val_score(
414
- model,
415
- np.ascontiguousarray(X),
416
- np.ravel(np.ascontiguousarray(y)),
417
- cv=StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed),
418
- scoring=score,
419
- )
420
-
421
-
422
- def create_cross_validation_df(
423
- X, y, eval_metric, seed, trees, n_folds, early_stopping_rounds
424
- ):
425
- # Test data x and y
426
- DTrain = xgb.DMatrix(X, label=y)
427
-
428
- # auc or logloss
429
- params = {
430
- "eval_metric": eval_metric,
431
- "objective": "binary:logistic", # logistic say 0 or 1 for loan status
432
- "seed": seed,
433
- }
434
-
435
- # Create the data frame of cross validations
436
- cv_df = xgb.cv(
437
- params,
438
- DTrain,
439
- num_boost_round=trees,
440
- nfold=n_folds,
441
- early_stopping_rounds=early_stopping_rounds,
442
- shuffle=True,
443
- )
444
-
445
- return [DTrain, cv_df]
446
-
447
-
448
- def create_accept_rate_list(start, end, samples):
449
- return np.linspace(start, end, samples, endpoint=True)
450
-
451
-
452
- def create_strategyTable_df(
453
- start, end, samples, actual_probability_predicted_acc_rate, true, currency
454
- ):
455
- accept_rates = create_accept_rate_list(start, end, samples)
456
- thresholds_strat = []
457
- bad_rates_start = []
458
- Avg_Loan_Amnt = actual_probability_predicted_acc_rate[true].mean()
459
- num_accepted_loans_start = []
460
-
461
- for rate in accept_rates:
462
- # Calculate the threshold for the acceptance rate
463
- thresh = np.quantile(
464
- actual_probability_predicted_acc_rate["PROB_DEFAULT"], rate
465
- ).round(3)
466
- # Add the threshold value to the list of thresholds
467
- thresholds_strat.append(
468
- np.quantile(
469
- actual_probability_predicted_acc_rate["PROB_DEFAULT"], rate
470
- ).round(3)
471
- )
472
-
473
- # Reassign the loan_status value using the threshold
474
- actual_probability_predicted_acc_rate[
475
- "PREDICT_DEFAULT_STATUS"
476
- ] = actual_probability_predicted_acc_rate["PROB_DEFAULT"].apply(
477
- lambda x: 1 if x > thresh else 0
478
- )
479
-
480
- # Create a set of accepted loans using this acceptance rate
481
- accepted_loans = actual_probability_predicted_acc_rate[
482
- actual_probability_predicted_acc_rate["PREDICT_DEFAULT_STATUS"]
483
- == 0
484
- ]
485
- # Calculate and append the bad rate using the acceptance rate
486
- bad_rates_start.append(
487
- np.sum((accepted_loans[true]) / len(accepted_loans[true])).round(3)
488
- )
489
- # Accepted loans
490
- num_accepted_loans_start.append(len(accepted_loans))
491
-
492
- # Calculate estimated value
493
- money_accepted_loans = [
494
- accepted_loans * Avg_Loan_Amnt
495
- for accepted_loans in num_accepted_loans_start
496
- ]
497
-
498
- money_bad_accepted_loans = [
499
- 2 * money_accepted_loan * bad_rate
500
- for money_accepted_loan, bad_rate in zip(
501
- money_accepted_loans, bad_rates_start
502
- )
503
- ]
504
-
505
- zip_object = zip(money_accepted_loans, money_bad_accepted_loans)
506
- estimated_value = [
507
- money_accepted_loan - money_bad_accepted_loan
508
- for money_accepted_loan, money_bad_accepted_loan in zip_object
509
- ]
510
-
511
- accept_rates = ["{:.2f}".format(elem) for elem in accept_rates]
512
-
513
- thresholds_strat = ["{:.2f}".format(elem) for elem in thresholds_strat]
514
-
515
- bad_rates_start = ["{:.2f}".format(elem) for elem in bad_rates_start]
516
-
517
- estimated_value = ["{:.2f}".format(elem) for elem in estimated_value]
518
-
519
- return (
520
- pd.DataFrame(
521
- zip(
522
- accept_rates,
523
- thresholds_strat,
524
- bad_rates_start,
525
- num_accepted_loans_start,
526
- estimated_value,
527
- ),
528
- columns=[
529
- "Acceptance Rate",
530
- "Threshold",
531
- "Bad Rate",
532
- "Num Accepted Loans",
533
- f"Estimated Value ({currency})",
534
- ],
535
- )
536
- .sort_values(by="Acceptance Rate", axis=0, ascending=False)
537
- .reset_index(drop=True)
538
- )
539
-
540
-
541
- def get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
542
- model, X, y, threshold, loan_amount_col_name
543
- ):
544
- true_status = y.to_frame()
545
-
546
- loan_amount = X[loan_amount_col_name]
547
-
548
- clf_prediction_prob = model.predict_proba(np.ascontiguousarray(X))
549
-
550
- clf_prediction_prob_df = pd.DataFrame(
551
- clf_prediction_prob[:, 1], columns=["PROB_DEFAULT"]
552
- )
553
-
554
- clf_thresh_predicted_default_status = (
555
- clf_prediction_prob_df["PROB_DEFAULT"]
556
- .apply(lambda x: 1 if x > threshold else 0)
557
- .rename("PREDICT_DEFAULT_STATUS")
558
- )
559
-
560
- return pd.concat(
561
- [
562
- true_status.reset_index(drop=True),
563
- clf_prediction_prob_df.reset_index(drop=True),
564
- clf_thresh_predicted_default_status.reset_index(drop=True),
565
- loan_amount.reset_index(drop=True),
566
- ],
567
- axis=1,
568
- )
 
1
  from typing import Union
2
  import pandas as pd
 
3
  import streamlit as st
4
  import numpy as np
5
  from sklearn.metrics import (
 
7
  confusion_matrix,
8
  )
9
  from sklearn.linear_model import LogisticRegression
 
10
  from xgboost.sklearn import XGBClassifier
11
+ from common.data import SplitDataset
12
+ from common.util import (
13
  create_cross_validation_df,
14
  cross_validation_scores,
15
  get_df_trueStatus_probabilityDefault_threshStatus_loanAmount,
16
+ )
17
+ from common.views import (
18
  cross_validation_graph,
19
  )
20
 
21
 
22
+ def make_evaluation_view(
23
  model_name_short: str,
24
  model_name_generic: str,
25
  ):
26
  def view(
27
+ clf_gbt_model: Union[XGBClassifier, LogisticRegression],
28
  split_dataset: SplitDataset,
29
  currency: str,
30
  prob_thresh_selected,
 
40
  train on each fold suggests performance will be stable."
41
  )
42
 
43
+ st.write(f"XGBoost cross validation test:")
44
 
45
  stcol_seed, stcol_eval_metric = st.columns(2)
46
 
 
170
  )
171
 
172
  cv_scores = cross_validation_scores(
173
+ clf_gbt_model,
174
  split_dataset.X_test,
175
  split_dataset.y_test,
176
  nfolds_score,
 
325
 
326
  df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
327
  get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
328
+ clf_gbt_model,
329
  split_dataset.X_test,
330
  split_dataset.y_test,
331
  prob_thresh_selected,
 
406
  return view
407
 
408
 
409
+ decision_tree_evaluation_view = make_evaluation_view("gbt", "Decision Tree")
410
+ logistic_evaluation_view = make_evaluation_view("lg", "Logistic Regression")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
views/logistic.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from common.data import SplitDataset
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import plotly.express as px
5
+ from views.threshold import logistic_threshold_view
6
+ from views.evaluation import logistic_evaluation_view
7
+ from common.util import (
8
+ test_variables_logistic,
9
+ print_coeff_logistic,
10
+ model_probability_values_df,
11
+ apply_threshold_to_probability_values,
12
+ )
13
+ from common.views import (
14
+ streamlit_2columns_metrics_df,
15
+ streamlit_2columns_metrics_pct_df,
16
+ )
17
+ from views.typing import ModelView
18
+
19
+
20
+ def logistic_view(split_dataset: SplitDataset, currency: str) -> ModelView:
21
+ # ### Test and create variables logically
22
+
23
+ st.header("Logistic Regression")
24
+
25
+ clf_logistic_model = test_variables_logistic(
26
+ split_dataset.X_train, split_dataset.y_train
27
+ )
28
+
29
+ st.metric(
30
+ label="# of Coefficients in Logistic Regression",
31
+ value=clf_logistic_model.n_features_in_,
32
+ delta=None,
33
+ delta_color="normal",
34
+ )
35
+
36
+ coef_dict = print_coeff_logistic(clf_logistic_model, split_dataset)
37
+
38
+ st.subheader("Logistic Regression Coefficient Values")
39
+
40
+ coef_dict_sorted = dict(
41
+ sorted(coef_dict.items(), key=lambda item: item[1], reverse=False)
42
+ )
43
+
44
+ data_items = coef_dict_sorted.items()
45
+ data_list = list(data_items)
46
+
47
+ df = pd.DataFrame(data_list, columns=["Coefficient", "Value"])
48
+
49
+ fig1 = px.bar(data_frame=df, x="Value", y="Coefficient", orientation="h")
50
+
51
+ fig1.update_layout(
52
+ title="Logistic Regression Coefficients",
53
+ xaxis_title="Value",
54
+ yaxis_title="Coefficient",
55
+ )
56
+
57
+ st.plotly_chart(fig1)
58
+
59
+ st.subheader("Classification Probability Threshold")
60
+
61
+ st.write(
62
+ """
63
+ The logistic regression model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n
64
+ Probabilities of defaulting of the loans are compared to a probability threshold.\n
65
+ A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
66
+ """
67
+ )
68
+
69
+ threshold = st.slider(
70
+ label="Default Probability Threshold:",
71
+ min_value=0.0,
72
+ max_value=1.0,
73
+ value=0.7,
74
+ key="key_threshold",
75
+ )
76
+
77
+ clf_prediction_prob_df_log = model_probability_values_df(
78
+ clf_logistic_model,
79
+ split_dataset.X_test,
80
+ )
81
+
82
+ clf_thresh_predicted_default_status_user = (
83
+ apply_threshold_to_probability_values(
84
+ clf_prediction_prob_df_log,
85
+ threshold,
86
+ )
87
+ )
88
+
89
+ streamlit_2columns_metrics_df(
90
+ "# of Predicted Defaults",
91
+ "# of Predicted Non-Default",
92
+ clf_thresh_predicted_default_status_user,
93
+ )
94
+
95
+ streamlit_2columns_metrics_pct_df(
96
+ "% of Loans Predicted to Default",
97
+ "% of Loans Predicted not to Default",
98
+ clf_thresh_predicted_default_status_user,
99
+ )
100
+
101
+ threshold = logistic_threshold_view(clf_logistic_model, split_dataset)
102
+
103
+ df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
104
+ logistic_evaluation_view(
105
+ clf_logistic_model,
106
+ split_dataset,
107
+ currency,
108
+ threshold.probability_threshold_selected,
109
+ threshold.predicted_default_status,
110
+ )
111
+ )
112
+
113
+ return ModelView(
114
+ model=clf_logistic_model,
115
+ trueStatus_probabilityDefault_threshStatus_loanAmount_df=df_trueStatus_probabilityDefault_threshStatus_loanAmount,
116
+ probability_threshold_selected=threshold.probability_threshold_selected,
117
+ predicted_default_status=threshold.predicted_default_status,
118
+ prediction_probability_df=threshold.prediction_probability_df,
119
+ )
src/models/util_model_comparison.py → views/model_comparison.py RENAMED
@@ -1,21 +1,16 @@
1
  from typing import OrderedDict
2
  import streamlit as st
3
  from sklearn.metrics import roc_auc_score
4
- from features.util_build_features import SplitDataset
5
- from visualization.graphs_settings import (
6
- streamlit_chart_setting_height_width
7
- )
8
-
9
- from visualization.graphs_test import (
10
  roc_auc_compare_n_models,
11
- calibration_curve_report_commented_n
 
12
  )
 
13
 
14
 
15
- from models.util_model_class import ModelClass
16
-
17
-
18
- def roc_auc_for_model(split_dataset: SplitDataset, model_view: ModelClass):
19
  roc_auc_model = roc_auc_score(
20
  split_dataset.y_test, model_view.predicted_default_status
21
  )
@@ -36,7 +31,7 @@ def roc_auc_for_model(split_dataset: SplitDataset, model_view: ModelClass):
36
 
37
  def model_comparison_view(
38
  split_dataset: SplitDataset,
39
- model_views: OrderedDict[str, ModelClass],
40
  ):
41
  st.header("Model Comparison")
42
 
@@ -48,7 +43,7 @@ def model_comparison_view(
48
  f"Receiver Operating Characteristic (ROC) Curve - {model_name}"
49
  )
50
  st.markdown(
51
- f'Area Under the Receiver Operating Characteristic Curve from prediction scores from {model_name} model is {roc_auc_model}.\n'
52
  )
53
  st.markdown(
54
  f'The score of {"{:.2f}".format(roc_auc_model)} is in the {roc_auc_lvl} ROC AUC score category.'
@@ -83,4 +78,4 @@ def model_comparison_view(
83
 
84
  fig2.set_size_inches(xsize_cal, ysize_cal)
85
 
86
- st.pyplot(fig2)
 
1
  from typing import OrderedDict
2
  import streamlit as st
3
  from sklearn.metrics import roc_auc_score
4
+ from common.data import SplitDataset
5
+ from common.views import (
 
 
 
 
6
  roc_auc_compare_n_models,
7
+ streamlit_chart_setting_height_width,
8
+ calibration_curve_report_commented_n,
9
  )
10
+ from views.typing import ModelView
11
 
12
 
13
+ def roc_auc_for_model(split_dataset: SplitDataset, model_view: ModelView):
 
 
 
14
  roc_auc_model = roc_auc_score(
15
  split_dataset.y_test, model_view.predicted_default_status
16
  )
 
31
 
32
  def model_comparison_view(
33
  split_dataset: SplitDataset,
34
+ model_views: OrderedDict[str, ModelView],
35
  ):
36
  st.header("Model Comparison")
37
 
 
43
  f"Receiver Operating Characteristic (ROC) Curve - {model_name}"
44
  )
45
  st.markdown(
46
+ f'Area Under the Receiver Operating Characteristic Curve from prediction scores from "{model_name}" model is {roc_auc_model}.\n'
47
  )
48
  st.markdown(
49
  f'The score of {"{:.2f}".format(roc_auc_model)} is in the {roc_auc_lvl} ROC AUC score category.'
 
78
 
79
  fig2.set_size_inches(xsize_cal, ysize_cal)
80
 
81
+ st.pyplot(fig2.figure)
src/models/util_strategy_table.py → views/strategy_table.py RENAMED
@@ -2,12 +2,12 @@ from typing import OrderedDict
2
  import plotly.express as px
3
  import numpy as np
4
  import streamlit as st
5
- from models.util_test import create_strategyTable_df
6
- from models.util_model_class import ModelClass
7
 
8
 
9
  def strategy_table_view(
10
- currency: str, model_views: OrderedDict[str, ModelClass]
11
  ):
12
  st.header("Strategy Table")
13
 
@@ -89,7 +89,7 @@ def strategy_table_view(
89
  )
90
 
91
  st.metric(
92
- label='Total expected loss:',
93
  value=f"{currency} {tot_exp_loss:,.2f}",
94
  delta=None,
95
  delta_color="normal",
 
2
  import plotly.express as px
3
  import numpy as np
4
  import streamlit as st
5
+ from common.util import create_strategyTable_df
6
+ from views.typing import ModelView
7
 
8
 
9
  def strategy_table_view(
10
+ currency: str, model_views: OrderedDict[str, ModelView]
11
  ):
12
  st.header("Strategy Table")
13
 
 
89
  )
90
 
91
  st.metric(
92
+ label=f"Total expected loss:",
93
  value=f"{currency} {tot_exp_loss:,.2f}",
94
  delta=None,
95
  delta_color="normal",
views/threshold.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Union, cast
3
+ import numpy as np
4
+ import streamlit as st
5
+ import plotly.express as px
6
+ import pandas as pd
7
+ from xgboost.sklearn import XGBClassifier
8
+ from sklearn.linear_model import LogisticRegression
9
+ from common.data import SplitDataset
10
+ from common.util import (
11
+ model_probability_values_df,
12
+ apply_threshold_to_probability_values,
13
+ find_best_threshold_J_statistic,
14
+ default_status_per_threshold,
15
+ classification_report_per_threshold,
16
+ thresh_classification_report_recall_accuracy,
17
+ )
18
+ from common.views import (
19
+ streamlit_2columns_metrics_df,
20
+ streamlit_2columns_metrics_pct_df,
21
+ )
22
+
23
+
24
@dataclass(frozen=True)
class Threshold:
    """Immutable result of a threshold-selection view.

    Instances are produced by the ``view`` callables that
    ``make_threshold_view`` builds, and bundle the threshold the user
    settled on together with the predictions derived from it.
    """

    # Probability cut-off picked through the "Selected Probability
    # Threshold" radio button.
    probability_threshold_selected: float
    # Default status obtained by applying the selected threshold to the
    # predicted probabilities.
    predicted_default_status: pd.Series
    # Predicted probabilities for the test set; the view reads its
    # "PROB_DEFAULT" column.
    prediction_probability_df: pd.DataFrame
29
+
30
+
31
def _show_predicted_default_split(predicted_default_status):
    """Render the count and percentage metrics for one set of predictions."""
    streamlit_2columns_metrics_df(
        "# of Predicted Defaults",
        "# of Predicted Non-Default",
        predicted_default_status,
    )

    streamlit_2columns_metrics_pct_df(
        "% of Loans Predicted to Default",
        "% of Loans Predicted not to Default",
        predicted_default_status,
    )


def make_threshold_view(
    model_name_short: str,
    model_name: str,
):
    """Build a Streamlit view for choosing a default-probability threshold.

    Args:
        model_name_short: Short identifier interpolated into the Streamlit
            widget keys so that several threshold views can coexist on one
            page without key clashes.
        model_name: Human-readable model name interpolated into the
            explanatory text shown at the top of the view.

    Returns:
        A ``view(clf_gbt_model, split_dataset)`` callable. When invoked it
        renders the threshold sections (user defined, J statistic driven,
        recall/accuracy trade-off, acceptance rate driven) and returns a
        ``Threshold`` describing the option selected by the user.
    """

    def view(
        clf_gbt_model: Union[XGBClassifier, LogisticRegression],
        split_dataset: SplitDataset,
    ) -> Threshold:
        # NOTE: despite its name, ``clf_gbt_model`` may be either supported
        # classifier; the name is kept for backward compatibility.

        # --- User defined threshold -------------------------------------
        st.subheader("Classification Probability Threshold - User Defined")
        st.write(
            f"""
            The {model_name} model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n
            Probabilities of defaulting of the loans are compared to a probability threshold.\n
            A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
            """
        )

        user_threshold = st.slider(
            label="Default Probability Threshold:",
            min_value=0.0,
            max_value=1.0,
            value=0.8,
            key=f"threshold_{model_name_short}_default",
        )

        # Probabilities are computed once on the test set and reused by
        # every section below.
        prediction_probability_df = model_probability_values_df(
            clf_gbt_model,
            split_dataset.X_test,
        )

        user_default_status = apply_threshold_to_probability_values(
            prediction_probability_df,
            user_threshold,
        )

        _show_predicted_default_split(user_default_status)

        # --- J statistic driven threshold -------------------------------
        st.subheader("J Statistic Driven Classification Probability Threshold")

        j_statistic_threshold = find_best_threshold_J_statistic(
            split_dataset.y_test, prediction_probability_df
        )
        st.metric(
            label="Youden's J statistic calculated best threshold",
            value=j_statistic_threshold,
        )

        j_statistic_default_status = apply_threshold_to_probability_values(
            prediction_probability_df,
            j_statistic_threshold,
        )

        _show_predicted_default_split(j_statistic_default_status)

        # --- Recall / accuracy trade-off --------------------------------
        st.subheader(
            "Recall and Accuracy Tradeoff with given Probability Threshold"
        )
        # Steps
        # Get list of thresholds
        # Get default status per threshold
        # Get classification report per threshold
        # Get recall, nondef recall, and accuracy per threshold

        # Candidate thresholds: 0.0, 0.025, ..., 0.975 (1.0 is excluded).
        threshold_list = np.arange(0, 1, 0.025).round(decimals=3).tolist()

        threshold_default_status_list = default_status_per_threshold(
            threshold_list, prediction_probability_df["PROB_DEFAULT"]
        )
        classification_report_by_threshold = (
            classification_report_per_threshold(
                threshold_list,
                threshold_default_status_list,
                split_dataset.y_test,
            )
        )

        (
            default_recalls,
            non_default_recalls,
            accuracies,
        ) = thresh_classification_report_recall_accuracy(
            classification_report_by_threshold
        )

        score_columns = ["Default Recall", "Non Default Recall", "Accuracy"]

        # One row per metric, then transposed so each metric is a column.
        tradeoff_df = pd.DataFrame(
            [
                default_recalls,
                non_default_recalls,
                accuracies,
                threshold_list,
            ],
            index=score_columns + ["Threshold"],
        ).T

        tradeoff_fig = px.line(
            data_frame=tradeoff_df,
            y=score_columns,
            x="Threshold",
        )

        tradeoff_fig.update_layout(
            title="Recall and Accuracy score Trade-off with Probability Threshold",
            xaxis_title="Probability Threshold",
            yaxis_title="Score",
        )
        tradeoff_fig.update_yaxes(range=[0.0, 1.0])

        st.plotly_chart(tradeoff_fig)

        # --- Acceptance rate driven threshold ---------------------------
        st.subheader("Acceptance Rate Driven Probability Threshold")
        # Steps
        # Set acceptance rate
        # Get the probability threshold as the acceptance-rate quantile
        # Plot the probability histogram with the threshold marked
        # Get default status for that threshold

        # The slider works in whole percents; convert to a fraction.
        acceptance_rate = (
            st.slider(
                label="% of loans accepted (acceptance rate):",
                min_value=0,
                max_value=100,
                value=85,
                key=f"acceptance_rate_{model_name_short}",
                format="%f%%",
            )
            / 100
        )

        acceptance_threshold = np.quantile(
            prediction_probability_df["PROB_DEFAULT"], acceptance_rate
        )

        st.write(
            f"An acceptance rate of {acceptance_rate} results in probability threshold of {acceptance_threshold}"
        )

        acceptance_fig = px.histogram(
            prediction_probability_df["PROB_DEFAULT"]
        )

        acceptance_fig.update_layout(
            title="Acceptance Rate Threshold vs. Loans Accepted",
            xaxis_title="Acceptance Rate Threshold",
            yaxis_title="Loans Accepted",
        )

        acceptance_fig.update_traces(
            marker_line_width=1, marker_line_color="white"
        )

        # Mark the derived threshold on the histogram.
        acceptance_fig.add_vline(
            x=acceptance_threshold,
            line_width=3,
            line_dash="solid",
            line_color="red",
        )

        st.plotly_chart(acceptance_fig)

        acceptance_default_status = apply_threshold_to_probability_values(
            prediction_probability_df,
            acceptance_threshold,
        )

        # --- Final selection --------------------------------------------
        st.subheader("Selected Probability Threshold")

        options = [
            "User Defined",
            "J Statistic Driven",
            "Acceptance Rate Driven",
        ]
        selected_option = st.radio(
            label="Selected Probability Threshold",
            options=options,
            key=f"{model_name_short}_radio_thresh",
        )

        if selected_option == "User Defined":
            selected_threshold = user_threshold
            selected_default_status = user_default_status
        elif selected_option == "J Statistic Driven":
            selected_threshold = j_statistic_threshold
            selected_default_status = j_statistic_default_status
        else:
            # Any other value falls back to the acceptance-rate threshold.
            selected_threshold = acceptance_threshold
            selected_default_status = acceptance_default_status

        st.write(f"Selected probability threshold is {selected_threshold}")

        return Threshold(
            # ``cast`` only informs the type checker; the value is passed
            # through unchanged.
            probability_threshold_selected=cast(float, selected_threshold),
            predicted_default_status=selected_default_status,
            prediction_probability_df=prediction_probability_df,
        )

    return view
269
+
270
+
271
# Ready-made threshold views, one per supported model. The short name is
# used in Streamlit widget keys, the long name in the explanatory text.
decision_tree_threshold_view = make_threshold_view(
    model_name_short="gbt",
    model_name="decision tree",
)
logistic_threshold_view = make_threshold_view(
    model_name_short="lg",
    model_name="logistic",
)
src/models/util_model_class.py → views/typing.py RENAMED
@@ -7,7 +7,7 @@ from sklearn.linear_model import LogisticRegression
7
 
8
 
9
  @dataclass(frozen=True)
10
- class ModelClass:
11
  model: Union[XGBClassifier, LogisticRegression]
12
  probability_threshold_selected: float
13
  predicted_default_status: pd.Series
 
7
 
8
 
9
  @dataclass(frozen=True)
10
+ class ModelView:
11
  model: Union[XGBClassifier, LogisticRegression]
12
  probability_threshold_selected: float
13
  predicted_default_status: pd.Series