|
|
|
"""loan.py""" |
|
|
|
|
|
import numpy as np |
|
import pandas as pd |
|
import seaborn as sns |
|
import matplotlib.pyplot as plt |
|
import warnings |
|
from sklearn.preprocessing import OneHotEncoder, StandardScaler |
|
from sklearn.compose import ColumnTransformer |
|
from sklearn.pipeline import Pipeline |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.linear_model import LogisticRegression |
|
from imblearn.over_sampling import SMOTE |
|
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score |
|
import gradio as gr |
|
from imblearn.pipeline import Pipeline as ImbPipeline |
|
import joblib |
|
from datasets import load_dataset |
|
|
|
|
|
# Keep FutureWarning messages from the libraries out of the output.
warnings.simplefilter("ignore", FutureWarning)

# Load the "AnguloM/loan_data" dataset through `datasets.load_dataset`.
dataset = load_dataset("AnguloM/loan_data")

# Convert the "train" split to a DataFrame and hold out 20% of its rows
# (fixed seed so the split is reproducible).
df_train, df_test = train_test_split(
    pd.DataFrame(dataset["train"]),
    test_size=0.2,
    random_state=42,
)
|
|
|
|
|
|
|
# Per-column overview of the training frame: dtype, non-null count and the
# total number of rows (the scalar is broadcast to every row).
info_df = pd.DataFrame({
    "Column": df_train.columns,
    "Data Type": df_train.dtypes,
    "Non-Null Count": df_train.notnull().sum(),
    "Total Count": len(df_train),
})

# Share of non-null entries per column, stored as a percentage string.
info_df["Non-Null Percentage"] = (
    (info_df["Non-Null Count"] / info_df["Total Count"] * 100).round(2).astype(str)
    + "%"
)

# Rendered as plain text so the script runs without a notebook front end:
# neither `widgets` nor `display` is imported anywhere in this file.
# The index repeats the "Column" values, so it is left out of the printout.
print(info_df.to_string(index=False))
print(f"\033[1;31mMissing values detected in any columns:\033[0m\n{df_train.isnull().sum()}")
|
|
|
|
|
# Treat the repayment flag as categorical. `df_train` comes out of
# train_test_split, so build a new frame with `assign` rather than writing
# into it in place (avoids pandas' chained-assignment warning on derived
# frames).
df_train = df_train.assign(
    **{"not.fully.paid": df_train["not.fully.paid"].astype("category")}
)

# Only numeric columns enter the correlation matrix; the categorical flag
# converted above is therefore left out. "number" selects every numeric
# dtype regardless of platform integer width.
df_numeric = df_train.select_dtypes(include="number")

# A single figure with two panels. No separate plt.figure() call is made
# beforehand, because that would leave an extra empty figure to be shown.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: pairwise correlations of the numeric columns.
sns.heatmap(df_numeric.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=axes[0])
axes[0].set_title("Correlation Matrix")

# Right panel: row counts per value of `not.fully.paid`.
sns.countplot(x="not.fully.paid", data=df_train, ax=axes[1])
axes[1].set_title("Distribution of Loan Repayment Status")

plt.tight_layout()
plt.show()
|
|
|
|
|
|
|
# Work on a copy so feature preparation never touches df_train.
data = df_train.copy()

# Target is `credit.policy`; every other column is a candidate feature.
X = data.drop("credit.policy", axis=1)
y = data["credit.policy"]

# Second split, this time of the training frame, into fit / validation folds.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

numeric_features = [
    "int.rate", "installment", "log.annual.inc", "dti", "fico",
    "days.with.cr.line", "revol.bal", "revol.util", "inq.last.6mths",
    "delinq.2yrs", "pub.rec",
]
categorical_features = ["purpose"]

# Columns not listed in a transformer (e.g. `not.fully.paid`) are discarded:
# ColumnTransformer defaults to remainder="drop".
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        # handle_unknown="ignore": a `purpose` value that never occurred in
        # the fitting fold is encoded as all zeros instead of raising a
        # ValueError at predict time.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# imblearn's Pipeline applies the SMOTE step only while fitting.
imb_model_pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
    # sampling_strategy=0.5: oversample the minority class until it reaches
    # half the size of the majority class.
    ("smote", SMOTE(random_state=42, sampling_strategy=0.5)),
    ("classifier", LogisticRegression(max_iter=1000000)),
])

imb_model_pipeline.fit(X_train, y_train)
|
|
|
|
|
# Probability assigned to the second class (`classes_[1]`) for each
# validation row.
y_pred_proba = imb_model_pipeline.predict_proba(X_test)[:, 1]

# Apply a custom decision threshold to the probabilities instead of relying
# on the classifier's own `predict`.
threshold = 0.3
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)

# Per-class precision / recall / F1 as a DataFrame (one row per class and
# per aggregate).
classification_rep = classification_report(y_test, y_pred_adjusted, output_dict=True)
classification_df = pd.DataFrame(classification_rep).transpose()

# AUC is computed from the raw probabilities, so it is independent of the
# threshold chosen above.
auc_roc = roc_auc_score(y_test, y_pred_proba)

# Rendered as plain text so the script runs without a notebook front end:
# neither `widgets` nor `display` is imported anywhere in this file.
print(classification_df.to_string())
print("\033[1;31mAUC-ROC:\033[0m", f"{auc_roc:.4f}")

# Confusion matrix of the thresholded predictions.
cm = confusion_matrix(y_test, y_pred_adjusted)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
|
|
|
from huggingface_hub import hf_hub_download

# Resolve a local path to the serialized pipeline file hosted in the
# given Hub repository.
model_path = hf_hub_download(
    repo_id="AnguloM/LoanSmart_Predict_Loan_Approval_with_Confidence",
    filename="loan_approval_pipeline.pkl",
)

# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code. Only load artifacts from a repository you trust.
pipeline = joblib.load(model_path)
|
|
|
def predict_approval(int_rate, installment, log_annual_inc, dti, fico,
                     days_with_cr_line, revol_bal, revol_util, inq_last_6mths,
                     delinq_2yrs, pub_rec, purpose):
    """Run the module-level ``pipeline`` on a single set of feature values.

    The arguments are placed into a one-row DataFrame whose column names
    match the dotted feature names used when training (``int.rate``,
    ``log.annual.inc``, ...), and that frame is passed to
    ``pipeline.predict``.

    Returns:
        The first (and only) element of the prediction array.
    """
    # Insertion order of this mapping fixes the column order of the frame.
    feature_values = {
        "int.rate": int_rate,
        "installment": installment,
        "log.annual.inc": log_annual_inc,
        "dti": dti,
        "fico": fico,
        "days.with.cr.line": days_with_cr_line,
        "revol.bal": revol_bal,
        "revol.util": revol_util,
        "inq.last.6mths": inq_last_6mths,
        "delinq.2yrs": delinq_2yrs,
        "pub.rec": pub_rec,
        "purpose": purpose,
    }
    input_data = pd.DataFrame([feature_values])

    return pipeline.predict(input_data)[0]
|
|
|
|
|
|
|
# (minimum, maximum, step, label) for each numeric input, listed in the same
# order as the parameters of predict_approval.
# NOTE(review): the interest-rate slider is expressed in percent; confirm the
# loaded pipeline expects `int.rate` on that scale rather than as a fraction.
slider_specs = [
    (0.0, 25.0, 0.1, "Interest Rate (%)"),
    (0.0, 1000.0, 10.0, "Installment Amount"),
    (0.0, 15.0, 0.1, "Log of Annual Income"),
    (0.0, 50.0, 0.1, "Debt-to-Income Ratio"),
    (300, 850, 1, "FICO Credit Score"),
    (0.0, 50000.0, 100.0, "Days with Credit Line"),
    (0.0, 100000.0, 500.0, "Revolving Balance"),
    (0.0, 150.0, 0.1, "Revolving Utilization (%)"),
    (0, 10, 1, "Recent Inquiries (Last 6 Months)"),
    (0, 10, 1, "Delinquencies in Last 2 Years"),
    (0, 5, 1, "Public Records"),
]

# NOTE(review): these choices must match the `purpose` categories the loaded
# pipeline was fitted on. Verify the spelling of each value (in particular
# "other") against the training data.
purpose_choices = [
    "credit_card", "debt_consolidation", "educational",
    "home_improvement", "major_purchase", "small_business",
    "other",
]

inputs = [
    gr.Slider(low, high, step=step, label=label)
    for low, high, step, label in slider_specs
]
inputs.append(gr.Dropdown(purpose_choices, label="Loan Purpose"))

# Build the UI around predict_approval and start it with a public share link.
demo = gr.Interface(fn=predict_approval, inputs=inputs, outputs="text")
demo.launch(share=True)
|
|