# -*- coding: utf-8 -*-
# Import necessary libraries
from IPython.display import display
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import ipywidgets as widgets
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import gradio as gr
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
from datasets import load_dataset # Import the Hugging Face dataset library
# Silence FutureWarning noise raised by the libraries used below.
warnings.simplefilter('ignore', category=FutureWarning)

# Fetch the loan dataset from the Hugging Face Hub.
dataset = load_dataset("AnguloM/loan_data")

# Materialize the 'train' split as a pandas DataFrame, then hold out 20% of
# its rows as df_test. The fixed seed keeps the partition reproducible.
full_frame = pd.DataFrame(dataset['train'])
df_train, df_test = train_test_split(full_frame, test_size=0.2, random_state=42)
# Build a per-column summary of df_train: dtype, non-null count, total rows.
info_df = pd.DataFrame({
    "Column": df_train.columns,
    "Data Type": df_train.dtypes,
    "Non-Null Count": df_train.notnull().sum(),
    "Total Count": len(df_train),
})

# Share of non-null values per column, formatted as a percentage string.
info_df['Non-Null Percentage'] = (
    (info_df['Non-Null Count'] / info_df['Total Count'] * 100).round(2).astype(str) + '%'
)

# Style the table: left-aligned cells and a tinted, bold header row.
info_df_styled = info_df.style.set_properties(**{'text-align': 'left'}).set_table_styles(
    [{'selector': 'th',
      'props': [('background-color', '#d9edf7'), ('color', '#31708f'), ('font-weight', 'bold')]}]
)

# Apply the background gradient only to the numerical count columns.
info_df_styled = info_df_styled.background_gradient(
    subset=['Non-Null Count', 'Total Count'], cmap="Oranges"
)

# Render the styled table inside its own output widget.
table_widget = widgets.Output()
with table_widget:
    display(info_df_styled)

# Render the per-column missing-value counts inside a second output widget.
message_widget = widgets.Output()
with message_widget:
    print(f"\033[1;31mMissing values detected in any columns:\033[0m\n{df_train.isnull().sum()}")

# Show both widgets (table and missing-values message) side by side.
# display() is called explicitly so the layout is rendered even when this
# statement is not the last expression of a notebook cell.
display(widgets.HBox([table_widget, message_widget]))
# Treat the repayment flag as categorical. Because this happens before the
# numeric selection below, the column is left out of the correlation matrix.
df_train['not.fully.paid'] = df_train['not.fully.paid'].astype('category')

# Select only numeric columns for the correlation matrix calculation.
df_numeric = df_train.select_dtypes(include=[float, int])

# One figure with a 1x2 grid: correlation matrix on the left, target
# distribution on the right. (No separate plt.figure() call: it would only
# open an additional, empty figure.)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: correlation matrix of the numeric features.
sns.heatmap(df_numeric.corr(), annot=True, cmap='coolwarm', ax=axes[0], fmt='.2f')
axes[0].set_title('Correlation Matrix')

# Right panel: distribution of the loan repayment status.
sns.countplot(x='not.fully.paid', data=df_train, ax=axes[1])
axes[1].set_title('Distribution of Loan Repayment Status')

# Adjust the layout to avoid overlapping labels, then show the figure.
plt.tight_layout()
plt.show()
# Train a credit-policy classifier: scale numeric features, one-hot encode
# the categorical one, oversample with SMOTE, then fit logistic regression.

# Work on a copy so the exploratory frame is left untouched.
data = df_train.copy()

# Separate features (X) and target (y); 'credit.policy' is the target.
X = data.drop('credit.policy', axis=1)
y = data['credit.policy']

# Split the data into training (80%) and testing (20%) sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): 'log.annual.inc' and 'days.with.cr.line' are inferred from the
# predict_approval() parameter names (log_annual_inc, days_with_cr_line);
# confirm they match the dataset's column names.
numeric_features = [
    'int.rate', 'installment', 'log.annual.inc', 'dti', 'fico',
    'days.with.cr.line', 'revol.bal', 'revol.util', 'inq.last.6mths',
    'delinq.2yrs', 'pub.rec',
]
categorical_features = ['purpose']

# Preprocessing: standardize numeric features and one-hot encode 'purpose'.
# Columns not listed here are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features),
    ]
)

# imbalanced-learn pipeline so that SMOTE resampling is part of fitting.
imb_model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),  # scale numeric, encode categorical
    ('smote', SMOTE(random_state=42, sampling_strategy=0.5)),  # oversample the minority class
    ('classifier', LogisticRegression(max_iter=1000000)),  # generous iteration cap
])

# Train the full pipeline (preprocessing + SMOTE + classifier).
imb_model_pipeline.fit(X_train, y_train)
# Evaluate the fitted pipeline on the held-out test split.

# Default-threshold predictions and the probability of the second class
# (column 1 of predict_proba).
y_pred = imb_model_pipeline.predict(X_test)
y_pred_proba = imb_model_pipeline.predict_proba(X_test)[:, 1]

# Use a decision threshold of 0.3 instead of the default 0.5, so more samples
# are labelled as the positive class (higher recall for that class).
threshold = 0.3
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)

# Classification report for the threshold-adjusted predictions, as a dict.
classification_rep = classification_report(y_test, y_pred_adjusted, output_dict=True)

# Convert the report to a DataFrame (one row per class/average) and style it:
# centered cells and a tinted, bold header row.
classification_df = pd.DataFrame(classification_rep).transpose()
classification_df_styled = classification_df.style.set_properties(
    **{'text-align': 'center'}
).set_table_styles(
    [{'selector': 'th',
      'props': [('background-color', '#d9edf7'), ('color', '#31708f'), ('font-weight', 'bold')]}]
)

# Render the styled classification report inside an output widget.
table_widget = widgets.Output()
with table_widget:
    display(classification_df_styled)

# AUC-ROC is computed from the raw probabilities, so it does not depend on
# the threshold chosen above.
auc_roc = roc_auc_score(y_test, y_pred_proba)

# Render the AUC-ROC value inside its own output widget.
auc_widget = widgets.Output()
with auc_widget:
    print("\033[1;31mAUC-ROC:\033[0m", f"{auc_roc:.4f}")

# Display both widgets stacked vertically (report table above the AUC-ROC).
display(widgets.VBox([table_widget, auc_widget]))

# Confusion matrix for the threshold-adjusted predictions. A fresh figure is
# opened so the heatmap is not drawn onto whatever axes happen to be current.
cm = confusion_matrix(y_test, y_pred_adjusted)
plt.figure()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.show()
from huggingface_hub import hf_hub_download
import joblib

# Download the published, pre-trained pipeline file from the Hugging Face Hub
# and deserialize it. hf_hub_download returns the local path of the file.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load artifacts from a repository you trust.
model_path = hf_hub_download(
    repo_id="AnguloM/LoanSmart_Predict_Loan_Approval_with_Confidence",
    filename="loan_approval_pipeline.pkl",
)
pipeline = joblib.load(model_path)
# Prediction function
def predict_approval(int_rate, installment, log_annual_inc, dti, fico,
                     days_with_cr_line, revol_bal, revol_util, inq_last_6mths,
                     delinq_2yrs, pub_rec, purpose):
    """Return a loan approval verdict for a single applicant.

    The arguments are packed into a one-row DataFrame and passed to the
    module-level ``pipeline`` for prediction.

    Args:
        int_rate: Value for the 'int.rate' feature.
        installment: Value for the 'installment' feature.
        log_annual_inc: Value for the 'log.annual.inc' feature.
        dti: Value for the 'dti' feature.
        fico: Value for the 'fico' feature.
        days_with_cr_line: Value for the 'days.with.cr.line' feature.
        revol_bal: Value for the 'revol.bal' feature.
        revol_util: Value for the 'revol.util' feature.
        inq_last_6mths: Value for the 'inq.last.6mths' feature.
        delinq_2yrs: Value for the 'delinq.2yrs' feature.
        pub_rec: Value for the 'pub.rec' feature.
        purpose: Value for the categorical 'purpose' feature.

    Returns:
        "Loan Approved" if the pipeline predicts 1, otherwise
        "Loan Not Approved".
    """
    # Map every argument to its feature column by name, so a value cannot end
    # up under the wrong (or a blank, duplicated) column label.
    # NOTE(review): 'log.annual.inc' and 'days.with.cr.line' are inferred from
    # the parameter names; confirm they match the columns the pipeline expects.
    feature_values = {
        'int.rate': int_rate,
        'installment': installment,
        'log.annual.inc': log_annual_inc,
        'dti': dti,
        'fico': fico,
        'days.with.cr.line': days_with_cr_line,
        'revol.bal': revol_bal,
        'revol.util': revol_util,
        'inq.last.6mths': inq_last_6mths,
        'delinq.2yrs': delinq_2yrs,
        'pub.rec': pub_rec,
        'purpose': purpose,
    }
    input_data = pd.DataFrame([feature_values])

    # Make the loan approval prediction for the single row.
    result = pipeline.predict(input_data)[0]
    return "Loan Approved" if result == 1 else "Loan Not Approved"
# Input components for the Gradio interface, listed in the same order as the
# parameters of predict_approval().
# NOTE(review): the slider ranges and units (e.g. interest rate labelled as a
# percentage) and the dropdown choices should be checked against the value
# ranges and 'purpose' categories the loaded pipeline was trained on.
inputs = [
    gr.Slider(0.0, 25.0, step=0.1, label="Interest Rate (%)"),
    gr.Slider(0.0, 1000.0, step=10.0, label="Installment Amount"),
    gr.Slider(0.0, 15.0, step=0.1, label="Log of Annual Income"),
    gr.Slider(0.0, 50.0, step=0.1, label="Debt-to-Income Ratio"),
    gr.Slider(300, 850, step=1, label="FICO Credit Score"),
    gr.Slider(0.0, 50000.0, step=100.0, label="Days with Credit Line"),
    gr.Slider(0.0, 100000.0, step=500.0, label="Revolving Balance"),
    gr.Slider(0.0, 150.0, step=0.1, label="Revolving Utilization (%)"),
    gr.Slider(0, 10, step=1, label="Recent Inquiries (Last 6 Months)"),
    gr.Slider(0, 10, step=1, label="Delinquencies in Last 2 Years"),
    gr.Slider(0, 5, step=1, label="Public Records"),
    gr.Dropdown(
        ["credit_card", "debt_consolidation", "educational",
         "home_improvement", "major_purchase", "small_business",
         "other"],
        label="Loan Purpose",
    ),
]

# Build the Gradio interface for loan approval prediction and start it.
# share=True asks Gradio to also create a publicly shareable link.
gr.Interface(fn=predict_approval, inputs=inputs, outputs="text").launch(share=True)