Upload loan.py

3dfb3f9 verified 3 months ago

8.6 kB

	# -*- coding: utf-8 -*-
	"""loan.py"""

	# Import necessary libraries
	import numpy as np
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt
	import warnings
	from sklearn.preprocessing import OneHotEncoder, StandardScaler
	from sklearn.compose import ColumnTransformer
	from sklearn.pipeline import Pipeline
	from sklearn.model_selection import train_test_split
	from sklearn.linear_model import LogisticRegression
	from imblearn.over_sampling import SMOTE
	from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
	import gradio as gr
	from imblearn.pipeline import Pipeline as ImbPipeline
	import joblib
	from datasets import load_dataset # Import the Hugging Face dataset library

	# Silence FutureWarning messages so they do not clutter the script output.
	warnings.simplefilter(action='ignore', category=FutureWarning)

	# Load the dataset directly from the Hugging Face Hub (needs network access).
	dataset = load_dataset("AnguloM/loan_data")

	# Only the 'train' split is read here. Dataset.to_pandas() converts it to a
	# DataFrame in one call instead of iterating the dataset row by row.
	df_train = dataset['train'].to_pandas()

	# Hold out 20% of the rows as df_test; the fixed random_state keeps the
	# split reproducible. train_test_split is already imported at the top of
	# the file, so it is not re-imported here.
	df_train, df_test = train_test_split(df_train, test_size=0.2, random_state=42)


	# Per-column summary of the training frame: dtype, non-null count and the
	# total number of rows (the same scalar repeated on every line).
	info_df = pd.DataFrame({
	    "Column": df_train.columns,
	    "Data Type": df_train.dtypes,
	    "Non-Null Count": df_train.notnull().sum(),
	    "Total Count": len(df_train)
	})

	# Share of non-null values per column, rendered as a string such as '99.5%'.
	info_df['Non-Null Percentage'] = (
	    info_df['Non-Null Count'] / info_df['Total Count'] * 100
	).round(2).astype(str) + '%'

	# Styled variant of the table: left-aligned cells, coloured header and an
	# orange gradient on the two count columns. A Styler only renders in
	# HTML-capable front ends; it is kept for anyone running this interactively.
	info_df_styled = (
	    info_df.style
	    .set_properties(**{'text-align': 'left'})
	    .set_table_styles(
	        [{'selector': 'th', 'props': [('background-color', '#d9edf7'), ('color', '#31708f'), ('font-weight', 'bold')]}]
	    )
	    .background_gradient(subset=['Non-Null Count', 'Total Count'], cmap="Oranges")
	)

	# The previous version rendered through `widgets.Output()`, `widgets.HBox()`
	# and `display()`, but neither `widgets` nor `display` is imported or defined
	# in this file, so the script stopped here with a NameError. Plain printing
	# works in every environment using only what is already in scope.
	print(info_df.to_string(index=False))

	# Missing-value report (the ANSI escape codes make the heading bold red).
	print(f"\033[1;31mMissing values detected in any columns:\033[0m\n{df_train.isnull().sum()}")

	# Store the repayment flag as a pandas categorical column.
	df_train['not.fully.paid'] = df_train['not.fully.paid'].astype('category')

	# The correlation matrix uses float/int columns only; the categorical flag
	# converted above and any other non-numeric columns are left out.
	df_numeric = df_train.select_dtypes(include=[float, int])

	# One figure with two side-by-side axes. An extra `plt.figure(figsize=(12, 6))`
	# call used to precede this line; it opened a second, empty figure that
	# plt.show() displayed as a blank window, so it has been removed.
	fig, axes = plt.subplots(1, 2, figsize=(14, 6))

	# Left panel: annotated correlation heatmap of the numeric columns.
	sns.heatmap(df_numeric.corr(), annot=True, cmap='coolwarm', ax=axes[0], fmt='.2f')
	axes[0].set_title('Correlation Matrix')

	# Right panel: row counts per value of 'not.fully.paid'.
	sns.countplot(x='not.fully.paid', data=df_train, ax=axes[1])
	axes[1].set_title('Distribution of Loan Repayment Status')

	# tight_layout keeps titles and tick labels from overlapping.
	plt.tight_layout()
	plt.show()

	# Model training: scale numeric features, one-hot encode 'purpose',
	# oversample with SMOTE and fit a logistic regression.

	# Work on a copy so the frame used for the plots above is not modified.
	data = df_train.copy()

	# 'credit.policy' is the prediction target; the remaining columns are
	# candidate features.
	X = data.drop('credit.policy', axis=1)
	y = data['credit.policy']

	# Split into training (80%) and evaluation (20%) parts, reproducibly.
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

	# Columns consumed by the model. Any column of X that is not listed here
	# (e.g. 'not.fully.paid') is not passed to a transformer and is dropped by
	# ColumnTransformer's default remainder handling.
	numeric_features = ['int.rate', 'installment', 'log.annual.inc', 'dti', 'fico',
	                    'days.with.cr.line', 'revol.bal', 'revol.util', 'inq.last.6mths',
	                    'delinq.2yrs', 'pub.rec']
	categorical_features = ['purpose']

	# handle_unknown='ignore' encodes a category that was not seen during fit
	# as all zeros. With the default ('error'), a 'purpose' value missing from
	# the training split would make predict() raise instead.
	preprocessor = ColumnTransformer(
	    transformers=[
	        ('num', StandardScaler(), numeric_features),
	        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
	    ]
	)

	# imbalanced-learn pipeline: preprocessing, SMOTE oversampling
	# (sampling_strategy=0.5), then the classifier. max_iter is set very high
	# so the solver is not cut off before it converges.
	imb_model_pipeline = ImbPipeline(steps=[
	    ('preprocessor', preprocessor),
	    ('smote', SMOTE(random_state=42, sampling_strategy=0.5)),
	    ('classifier', LogisticRegression(max_iter=1000000)),
	])

	# Fit the whole chain on the training part only.
	imb_model_pipeline.fit(X_train, y_train)

	# Predict on the held-out part: hard labels plus the probability column at
	# index 1 (presumably label 1, given the 0/1 thresholding below).
	y_pred = imb_model_pipeline.predict(X_test)
	y_pred_proba = imb_model_pipeline.predict_proba(X_test)[:, 1]

	# Use a cut-off below 0.5 so more rows are assigned to class 1, trading
	# precision for recall on that class.
	threshold = 0.3
	y_pred_adjusted = (y_pred_proba >= threshold).astype(int)

	# Classification report for the thresholded predictions, as a dict.
	classification_rep = classification_report(y_test, y_pred_adjusted, output_dict=True)

	# Tabular form of the report (one row per class / average), plus a styled
	# variant for HTML-capable front ends.
	classification_df = pd.DataFrame(classification_rep).transpose()
	classification_df_styled = classification_df.style.set_properties(**{'text-align': 'center'}).set_table_styles(
	    [{'selector': 'th', 'props': [('background-color', '#d9edf7'), ('color', '#31708f'), ('font-weight', 'bold')]}]
	)

	# AUC-ROC is computed from the raw probabilities, not the thresholded labels.
	auc_roc = roc_auc_score(y_test, y_pred_proba)

	# The previous version showed these results through `widgets.Output()`,
	# `widgets.VBox()` and `display()`; neither `widgets` nor `display` is
	# imported or defined in this file, which raised a NameError. Print the
	# report and the score instead.
	print(classification_df.to_string())
	print("\033[1;31mAUC-ROC:\033[0m", f"{auc_roc:.4f}")

	# Confusion matrix of the thresholded predictions, drawn as a heatmap.
	cm = confusion_matrix(y_test, y_pred_adjusted)
	sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
	plt.title("Confusion Matrix")
	plt.xlabel("Predicted")
	plt.ylabel("Actual")
	plt.show()

	from huggingface_hub import hf_hub_download

	# Where the serialized pipeline used by the prediction UI is stored on the
	# Hugging Face Hub.
	MODEL_REPO_ID = "AnguloM/LoanSmart_Predict_Loan_Approval_with_Confidence"
	MODEL_FILENAME = "loan_approval_pipeline.pkl"

	# Fetch the artifact and keep the path returned by hf_hub_download.
	model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)

	# Deserialize the pipeline with joblib (imported at the top of the file).
	# predict_approval below uses this object, not the imb_model_pipeline that
	# was trained earlier in the script.
	# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
	# code; this is only safe while the Hub repository is trusted.
	pipeline = joblib.load(model_path)
	# Prediction function
	def predict_approval(int_rate, installment, log_annual_inc, dti, fico,
	                     days_with_cr_line, revol_bal, revol_util, inq_last_6mths,
	                     delinq_2yrs, pub_rec, purpose):
	    """Run the loaded pipeline on a single applicant and return its prediction.

	    The twelve arguments are placed in a one-row DataFrame whose column
	    names are the dotted feature names shown below, in this fixed order,
	    and the frame is passed to the module-level ``pipeline``.

	    NOTE(review): the UI slider labels ``int_rate`` as a percentage
	    (0-25); confirm the pipeline was trained on the same scale.

	    Returns:
	        The first element of ``pipeline.predict(...)`` for that row.
	    """
	    # Insertion order of the dict fixes the column order of the frame.
	    applicant = {
	        'int.rate': int_rate,
	        'installment': installment,
	        'log.annual.inc': log_annual_inc,
	        'dti': dti,
	        'fico': fico,
	        'days.with.cr.line': days_with_cr_line,
	        'revol.bal': revol_bal,
	        'revol.util': revol_util,
	        'inq.last.6mths': inq_last_6mths,
	        'delinq.2yrs': delinq_2yrs,
	        'pub.rec': pub_rec,
	        'purpose': purpose,
	    }
	    features = pd.DataFrame([applicant])

	    # The pipeline returns one prediction per row; there is exactly one row.
	    prediction = pipeline.predict(features)
	    return prediction[0]


	# Slider definitions as (minimum, maximum, step, label), listed in the same
	# order as the numeric parameters of predict_approval.
	slider_specs = [
	    (0.0, 25.0, 0.1, "Interest Rate (%)"),
	    (0.0, 1000.0, 10.0, "Installment Amount"),
	    (0.0, 15.0, 0.1, "Log of Annual Income"),
	    (0.0, 50.0, 0.1, "Debt-to-Income Ratio"),
	    (300, 850, 1, "FICO Credit Score"),
	    (0.0, 50000.0, 100.0, "Days with Credit Line"),
	    (0.0, 100000.0, 500.0, "Revolving Balance"),
	    (0.0, 150.0, 0.1, "Revolving Utilization (%)"),
	    (0, 10, 1, "Recent Inquiries (Last 6 Months)"),
	    (0, 10, 1, "Delinquencies in Last 2 Years"),
	    (0, 5, 1, "Public Records"),
	]

	# Input components for the Gradio interface: eleven sliders followed by
	# the dropdown for the loan purpose.
	inputs = [
	    gr.Slider(low, high, step=step, label=label)
	    for low, high, step, label in slider_specs
	]
	# NOTE(review): confirm these option strings match the 'purpose' categories
	# the loaded pipeline was trained on.
	inputs.append(
	    gr.Dropdown(["credit_card", "debt_consolidation", "educational",
	                 "home_improvement", "major_purchase", "small_business",
	                 "other"], label="Loan Purpose")
	)

	# Build the interface around predict_approval and start it; share=True asks
	# Gradio for a public share link.
	demo = gr.Interface(fn=predict_approval, inputs=inputs, outputs="text")
	demo.launch(share=True)