# -*- coding: utf-8 -*-
"""loan.py"""

# Import necessary libraries
from IPython.display import display
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import ipywidgets as widgets
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import gradio as gr
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
from datasets import load_dataset  # Import the Hugging Face dataset library

# Suppress specific FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load dataset directly from Hugging Face
dataset = load_dataset("AnguloM/loan_data")
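
# Optional sketch (not in the original script): print the DatasetDict to confirm
# which splits the repo exposes; a 'train' split is assumed below.
print(dataset)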

# Access the train split and convert it to a pandas DataFrame
df_train = pd.DataFrame(dataset['train'])

# Hold out 20% of the rows as a test set (train_test_split is already imported
# above; df_test is not used again in this script)
df_train, df_test = train_test_split(df_train, test_size=0.2, random_state=42)


# Create a summary DataFrame with data types and non-null counts
info_df = pd.DataFrame({
    "Column": df_train.columns,
    "Data Type": df_train.dtypes,
    "Non-Null Count": df_train.notnull().sum(),
    "Total Count": len(df_train)
})

# Calculate the percentage of non-null values in each column
info_df['Non-Null Percentage'] = (info_df['Non-Null Count'] / info_df['Total Count'] * 100).round(2).astype(str) + '%'

# Style the table
info_df_styled = info_df.style.set_properties(**{'text-align': 'left'}).set_table_styles(
    [{'selector': 'th', 'props': [('background-color', '#d9edf7'), ('color', '#31708f'), ('font-weight', 'bold')]}]
)

# Apply background gradient only to numerical columns
info_df_styled = info_df_styled.background_gradient(subset=['Non-Null Count', 'Total Count'], cmap="Oranges")

# Create a widget to display the styled table
table_widget = widgets.Output()
with table_widget:
    display(info_df_styled)

# Widget for the missing-value summary
message_widget = widgets.Output()
with message_widget:
    print(f"\033[1;31mMissing values per column:\033[0m\n{df_train.isnull().sum()}")

# Display both widgets (table and missing-value summary) side by side
display(widgets.HBox([table_widget, message_widget]))

# Treat the loan repayment flag as categorical (used as the x-axis of the count plot below)
df_train['not.fully.paid'] = df_train['not.fully.paid'].astype('category')

# Select only numeric columns for correlation matrix calculation
df_numeric = df_train.select_dtypes(include=[float, int])

# Create a 1x2 grid of subplots for the correlation matrix and target distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Plot Correlation Matrix
sns.heatmap(df_numeric.corr(), annot=True, cmap='coolwarm', ax=axes[0], fmt='.2f')
axes[0].set_title('Correlation Matrix')

# Plot Distribution of Loan Repayment Status (Target Variable)
sns.countplot(x='not.fully.paid', data=df_train, ax=axes[1])
axes[1].set_title('Distribution of Loan Repayment Status')

# Show the plots
plt.tight_layout()  # Adjusts the layout to avoid overlapping
plt.show()

# Prepare data for training: scale numeric columns and one-hot encode categorical ones
data = df_train.copy()

# Separate features (X) and target (y)
X = data.drop('credit.policy', axis=1)  # Drop the target column
y = data['credit.policy']  # Target variable

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline (scaling numeric features and encoding categorical features)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['int.rate', 'installment', 'log.annual.inc', 'dti', 'fico',
                                   'days.with.cr.line', 'revol.bal', 'revol.util', 'inq.last.6mths',
                                   'delinq.2yrs', 'pub.rec']),
        ('cat', OneHotEncoder(), ['purpose'])  # Ensure 'purpose' is included in categorical transformations
    ]
)
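
# Optional sketch (not part of the original flow): fit the preprocessor alone to
# inspect how many feature columns come out after scaling and one-hot encoding.
X_preview = preprocessor.fit_transform(X_train)
print(f"Preprocessed training matrix shape: {X_preview.shape}")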

# Create an imbalanced-learn pipeline that includes SMOTE and Logistic Regression
imb_model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),  # First, preprocess the data (scale numeric, encode categorical)
    ('smote', SMOTE(random_state=42, sampling_strategy=0.5)),  # Oversample the minority class to 50% of the majority class size
    ('classifier', LogisticRegression(max_iter=1000000))  # Logistic Regression classifier
])

# Train the model with the full pipeline (preprocessing + SMOTE + model training)
imb_model_pipeline.fit(X_train, y_train)
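
# A minimal sketch, not in the original flow: persist the freshly trained
# pipeline with joblib (imported above). The local filename here is an
# assumption; the pretrained artifact loaded later is downloaded separately
# from the Hugging Face Hub.
joblib.dump(imb_model_pipeline, "loan_approval_pipeline.pkl")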

# Make predictions on the test data
y_pred = imb_model_pipeline.predict(X_test)
y_pred_proba = imb_model_pipeline.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class

# Adjust the decision threshold to improve recall of the positive class
threshold = 0.3
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
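
# A hedged alternative to the hard-coded 0.3 above (a sketch, not the original
# model-selection procedure): sweep candidate thresholds with sklearn's
# precision_recall_curve and report the one that maximizes F1 on the test set.
from sklearn.metrics import precision_recall_curve
precisions, recalls, pr_thresholds = precision_recall_curve(y_test, y_pred_proba)
f1_scores = 2 * precisions[:-1] * recalls[:-1] / (precisions[:-1] + recalls[:-1] + 1e-12)
print(f"Threshold maximizing F1 (sketch): {pr_thresholds[f1_scores.argmax()]:.3f}")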

# Evaluate the model using classification report
classification_rep = classification_report(y_test, y_pred_adjusted, output_dict=True)

# Convert the classification report to a DataFrame for display as a table with styles
classification_df = pd.DataFrame(classification_rep).transpose()
classification_df_styled = classification_df.style.set_properties(**{'text-align': 'center'}).set_table_styles(
    [{'selector': 'th', 'props': [('background-color', '#d9edf7'), ('color', '#31708f'), ('font-weight', 'bold')]}]
)

# Display the classification report as a styled table in a widget
table_widget = widgets.Output()
with table_widget:
    display(classification_df_styled)

# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y_test, y_pred_proba)

# Widget for the AUC-ROC
auc_widget = widgets.Output()
with auc_widget:
    print("\033[1;31mAUC-ROC:\033[0m", f"{auc_roc:.4f}")

# Display both widgets (table and AUC-ROC message) side by side
display(widgets.VBox([table_widget, auc_widget]))

# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred_adjusted)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
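
# Optional sketch (not in the original script): plot the ROC curve behind the
# AUC score computed above, using sklearn's roc_curve.
from sklearn.metrics import roc_curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label=f"AUC = {auc_roc:.4f}")
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()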

from huggingface_hub import hf_hub_download  # joblib is already imported above

# Download the pretrained pipeline artifact from the Hugging Face Hub
model_path = hf_hub_download(repo_id="AnguloM/LoanSmart_Predict_Loan_Approval_with_Confidence", filename="loan_approval_pipeline.pkl")

# Load the serialized scikit-learn pipeline
pipeline = joblib.load(model_path)

# Prediction function
def predict_approval(int_rate, installment, log_annual_inc, dti, fico,
                     days_with_cr_line, revol_bal, revol_util, inq_last_6mths,
                     delinq_2yrs, pub_rec, purpose):
    # Prepare the input as a DataFrame
    input_data = pd.DataFrame([[int_rate, installment, log_annual_inc, dti, fico,
                               days_with_cr_line, revol_bal, revol_util,
                               inq_last_6mths, delinq_2yrs, pub_rec, purpose]],
                             columns=['int.rate', 'installment', 'log.annual.inc',
                                      'dti', 'fico', 'days.with.cr.line', 'revol.bal',
                                      'revol.util', 'inq.last.6mths', 'delinq.2yrs',
                                      'pub.rec', 'purpose'])
    # Make loan approval prediction
    result = pipeline.predict(input_data)[0]
    return "Loan Approved" if result == 1 else "Loan Not Approved"
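
# Quick sanity check (a sketch; the feature values are illustrative, not taken
# from the dataset): call the prediction function once before wiring up Gradio.
print(predict_approval(0.12, 300.0, 10.5, 15.0, 700,
                       4500.0, 10000.0, 45.0, 1, 0, 0, "debt_consolidation"))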


# Create input components for the Gradio interface
inputs = [
    gr.Slider(0.0, 25.0, step=0.1, label="Interest Rate (%)"),
    gr.Slider(0.0, 1000.0, step=10.0, label="Installment Amount"),
    gr.Slider(0.0, 15.0, step=0.1, label="Log of Annual Income"),
    gr.Slider(0.0, 50.0, step=0.1, label="Debt-to-Income Ratio"),
    gr.Slider(300, 850, step=1, label="FICO Credit Score"),
    gr.Slider(0.0, 50000.0, step=100.0, label="Days with Credit Line"),
    gr.Slider(0.0, 100000.0, step=500.0, label="Revolving Balance"),
    gr.Slider(0.0, 150.0, step=0.1, label="Revolving Utilization (%)"),
    gr.Slider(0, 10, step=1, label="Recent Inquiries (Last 6 Months)"),
    gr.Slider(0, 10, step=1, label="Delinquencies in Last 2 Years"),
    gr.Slider(0, 5, step=1, label="Public Records"),
    gr.Dropdown(["credit_card", "debt_consolidation", "educational",
                 "home_improvement", "major_purchase", "small_business",
                 "other"], label="Loan Purpose")
]

# Create the Gradio interface for loan approval prediction
gr.Interface(fn=predict_approval, inputs=inputs, outputs="text").launch(share=True)