# -*- coding: utf-8 -*-
"""loan.py"""

# Import necessary libraries
from IPython.display import display
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import ipywidgets as widgets
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import gradio as gr
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
from datasets import load_dataset  # Hugging Face datasets library
from huggingface_hub import hf_hub_download

# Suppress specific FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the dataset directly from Hugging Face
dataset = load_dataset("AnguloM/loan_data")

# Access the train split and convert it to a pandas DataFrame
df_train = pd.DataFrame(dataset['train'])

# Hold out 20% of the data as a test set
df_train, df_test = train_test_split(df_train, test_size=0.2, random_state=42)

# Create a summary DataFrame with data types and non-null counts
info_df = pd.DataFrame({
    "Column": df_train.columns,
    "Data Type": df_train.dtypes,
    "Non-Null Count": df_train.notnull().sum(),
    "Total Count": len(df_train)
})

# Calculate the percentage of non-null values in each column
info_df['Non-Null Percentage'] = (info_df['Non-Null Count'] / info_df['Total Count'] * 100).round(2).astype(str) + '%'

# Style the table
info_df_styled = info_df.style.set_properties(**{'text-align': 'left'}).set_table_styles(
    [{'selector': 'th',
      'props': [('background-color', '#d9edf7'), ('color', '#31708f'), ('font-weight', 'bold')]}]
)

# Apply a background gradient only to the numerical columns
info_df_styled = info_df_styled.background_gradient(subset=['Non-Null Count', 'Total Count'], cmap="Oranges")

# Widget that displays the styled table
table_widget = widgets.Output()
with table_widget:
    display(info_df_styled)

# Widget that reports missing values per column
message_widget = widgets.Output()
with message_widget:
    print(f"\033[1;31mMissing values per column:\033[0m\n{df_train.isnull().sum()}")

# Display the table and the missing-values report side by side
display(widgets.HBox([table_widget, message_widget]))

# Convert relevant columns to categorical
df_train['not.fully.paid'] = df_train['not.fully.paid'].astype('category')

# Select only numeric columns for the correlation matrix
df_numeric = df_train.select_dtypes(include=[float, int])

# Create a 1x2 grid of subplots for the correlation matrix and the target distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Plot the correlation matrix
sns.heatmap(df_numeric.corr(), annot=True, cmap='coolwarm', ax=axes[0], fmt='.2f')
axes[0].set_title('Correlation Matrix')

# Plot the distribution of loan repayment status (target variable)
sns.countplot(x='not.fully.paid', data=df_train, ax=axes[1])
axes[1].set_title('Distribution of Loan Repayment Status')

# Show the plots
plt.tight_layout()  # Adjust the layout to avoid overlapping
plt.show()

# One-hot encoding for categorical columns and scaling for numeric columns
# Prepare data for training
data = df_train.copy()

# Separate features (X) and target (y)
X = data.drop('credit.policy', axis=1)  # Drop the target column
y = data['credit.policy']               # Target variable

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline (scale numeric features, encode categorical features)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['int.rate', 'installment', 'log.annual.inc', 'dti', 'fico',
                                   'days.with.cr.line', 'revol.bal', 'revol.util',
                                   'inq.last.6mths', 'delinq.2yrs', 'pub.rec']),
        ('cat', OneHotEncoder(), ['purpose'])  # 'purpose' is the categorical feature
    ]
)

# Imbalanced-learn pipeline: preprocessing, SMOTE oversampling, logistic regression
imb_model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),                             # Scale numeric, encode categorical
    ('smote', SMOTE(random_state=42, sampling_strategy=0.5)),   # Oversample the minority class
    ('classifier', LogisticRegression(max_iter=1000000))        # Logistic regression classifier
])

# Train the model with the full pipeline (preprocessing + SMOTE + model training)
imb_model_pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = imb_model_pipeline.predict(X_test)
y_pred_proba = imb_model_pipeline.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Lower the decision threshold to improve recall of the positive class
threshold = 0.3
y_pred_adjusted = (y_pred_proba >= threshold).astype(int)

# Evaluate the model with a classification report
classification_rep = classification_report(y_test, y_pred_adjusted, output_dict=True)

# Convert the classification report to a styled DataFrame for display
classification_df = pd.DataFrame(classification_rep).transpose()
classification_df_styled = classification_df.style.set_properties(**{'text-align': 'center'}).set_table_styles(
    [{'selector': 'th',
      'props': [('background-color', '#d9edf7'), ('color', '#31708f'), ('font-weight', 'bold')]}]
)

# Display the classification report as a styled table in a widget
table_widget = widgets.Output()
with table_widget:
    display(classification_df_styled)

# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y_test, y_pred_proba)

# Widget for the AUC-ROC score
auc_widget = widgets.Output()
with auc_widget:
    print("\033[1;31mAUC-ROC:\033[0m", f"{auc_roc:.4f}")

# Display the classification report and the AUC-ROC score together
display(widgets.VBox([table_widget, auc_widget]))

# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred_adjusted)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Load the pre-trained pipeline from the Hugging Face Hub
model_path = hf_hub_download(repo_id="AnguloM/LoanSmart_Predict_Loan_Approval_with_Confidence",
                             filename="loan_approval_pipeline.pkl")
pipeline = joblib.load(model_path)

# Prediction function
def predict_approval(int_rate, installment, log_annual_inc, dti, fico,
                     days_with_cr_line, revol_bal, revol_util, inq_last_6mths,
                     delinq_2yrs, pub_rec, purpose):
    # Prepare the input as a single-row DataFrame
    input_data = pd.DataFrame([[int_rate, installment, log_annual_inc, dti, fico,
                                days_with_cr_line, revol_bal, revol_util, inq_last_6mths,
                                delinq_2yrs, pub_rec, purpose]],
                              columns=['int.rate', 'installment', 'log.annual.inc', 'dti', 'fico',
                                       'days.with.cr.line', 'revol.bal', 'revol.util',
                                       'inq.last.6mths', 'delinq.2yrs', 'pub.rec', 'purpose'])
    # Make the loan approval prediction
    result = pipeline.predict(input_data)[0]
    return "Loan Approved" if result == 1 else "Loan Not Approved"

# Input components for the Gradio interface
inputs = [
    gr.Slider(0.0, 25.0, step=0.1, label="Interest Rate (%)"),
    gr.Slider(0.0, 1000.0, step=10.0, label="Installment Amount"),
    gr.Slider(0.0, 15.0, step=0.1, label="Log of Annual Income"),
    gr.Slider(0.0, 50.0, step=0.1, label="Debt-to-Income Ratio"),
    gr.Slider(300, 850, step=1, label="FICO Credit Score"),
    gr.Slider(0.0, 50000.0, step=100.0, label="Days with Credit Line"),
    gr.Slider(0.0, 100000.0, step=500.0, label="Revolving Balance"),
    gr.Slider(0.0, 150.0, step=0.1, label="Revolving Utilization (%)"),
    gr.Slider(0, 10, step=1, label="Recent Inquiries (Last 6 Months)"),
    gr.Slider(0, 10, step=1, label="Delinquencies in Last 2 Years"),
    gr.Slider(0, 5, step=1, label="Public Records"),
    gr.Dropdown(["credit_card", "debt_consolidation", "educational", "home_improvement",
                 "major_purchase", "small_business", "other"], label="Loan Purpose")
]

# Create and launch the Gradio interface for loan approval prediction
gr.Interface(fn=predict_approval, inputs=inputs, outputs="text").launch(share=True)