{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt \n", "import seaborn as sns\n", "\n", "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "path_data = r\"C:\\Users\\LaurèneDAVID\\Documents\\Teaching\\Educational_apps\\app-hec-AI-DS\\data\\classification\\credit_score.csv\"\n", "credit_data = pd.read_csv(path_data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Credit score classification \n", "\n", "https://www.kaggle.com/datasets/parisrohan/credit-score-classification" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data cleaning" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "type_loan_clean = []\n", "for text in list(credit_data[\"Type_of_Loan\"].unique()):\n", " text = text.replace(\",\",\" \")\n", " text = text.replace(\"loan\",\"\")\n", " text = text.replace(\"not specified\",\"\")\n", " text = text.replace(\"No Data\",\"\")\n", " text = text.replace(\" \",\" \")\n", " text = text.strip()\n", " \n", " type_loan_clean.append(text)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "all_type_loan = \" \".join(type_loan_clean)\n", "types_loan = list(set([loan for loan in all_type_loan.split(\" \") if loan !=\"\"]))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "for loan in types_loan:\n", " credit_data[f\"{loan.capitalize()}Loan\"] = credit_data[\"Type_of_Loan\"].apply(lambda x: 1 if loan in x else 0)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Clean data\n", "credit_data.drop(columns=[\"SSN\",\"Name\",\"Month\",\"ID\",\"Type_of_Loan\",\"Payment_Behaviour\"], inplace=True)\n", "credit_data = credit_data.loc[~credit_data[\"Occupation\"].isin([\"Media_Manager\",\"Manager\"])]\n", "credit_data[\"Age\"] = credit_data[\"Age\"].astype(int)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Group customer data for numerical variables (mean)\n", "num_columns = list(credit_data.select_dtypes(include=[float]).columns)\n", "credit_data_num = credit_data[[\"Customer_ID\"] + num_columns].groupby(\"Customer_ID\").median().reset_index().round()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Group customer data for categorical variables (mode)\n", "cat_columns = list(credit_data.select_dtypes(include=[int,object]).columns)\n", "credit_data_cat = credit_data[cat_columns].groupby(\"Customer_ID\").apply(lambda group: group.mode().iloc[0] if not group.mode().empty else pd.Series(index=group.columns))\n", "credit_data_cat = credit_data_cat.drop(columns=[\"Customer_ID\"]).reset_index()" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [], "source": [ "# Merge numerical and categorical merged datasets\n", "credit_data_clean = credit_data_cat.merge(credit_data_num, on=\"Customer_ID\", how=\"inner\") # merge cleaned data for numerical and categorical variables\n", "#credit_data_clean = credit_data_clean.sample(5000, random_state=0) # select 2000 samples randomly\n", "credit_data_clean.reset_index(drop=True, inplace=True)\n", "credit_data_clean.drop(columns=[\"Customer_ID\"], inplace=True)" ] }, { "cell_type": "code", "execution_count": 118, "metadata": {}, "outputs": [], "source": [ "credit_data_clean = credit_data_clean.loc[credit_data_clean[\"Age\"]>18]" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Age | \n", "Occupation | \n", "Credit_Mix | \n", "Payment_of_Min_Amount | \n", "Credit_Score | \n", "HomeLoan | \n", "Credit-builderLoan | \n", "MortgageLoan | \n", "ConsolidationLoan | \n", "EquityLoan | \n", "... | \n", "Delay_from_due_date | \n", "Num_of_Delayed_Payment | \n", "Changed_Credit_Limit | \n", "Num_Credit_Inquiries | \n", "Outstanding_Debt | \n", "Credit_Utilization_Ratio | \n", "Credit_History_Age | \n", "Total_EMI_per_month | \n", "Amount_invested_monthly | \n", "Monthly_Balance | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "38.0 | \n", "Journalist | \n", "Standard | \n", "Yes | \n", "Poor | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "48.0 | \n", "12.0 | \n", "11.0 | \n", "8.0 | \n", "1942.0 | \n", "33.0 | \n", "186.0 | \n", "27.0 | \n", "45.0 | \n", "317.0 | \n", "
1 | \n", "37.0 | \n", "Developer | \n", "Standard | \n", "Yes | \n", "Standard | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "26.0 | \n", "11.0 | \n", "14.0 | \n", "10.0 | \n", "1139.0 | \n", "28.0 | \n", "296.0 | \n", "0.0 | \n", "51.0 | \n", "362.0 | \n", "
2 | \n", "22.0 | \n", "Accountant | \n", "Standard | \n", "Yes | \n", "Standard | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "19.0 | \n", "18.0 | \n", "17.0 | \n", "7.0 | \n", "982.0 | \n", "36.0 | \n", "334.0 | \n", "188.0 | \n", "101.0 | \n", "567.0 | \n", "
3 | \n", "44.0 | \n", "Writer | \n", "Standard | \n", "Yes | \n", "Standard | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "20.0 | \n", "14.0 | \n", "15.0 | \n", "6.0 | \n", "1372.0 | \n", "35.0 | \n", "182.0 | \n", "258.0 | \n", "102.0 | \n", "469.0 | \n", "
4 | \n", "30.0 | \n", "Mechanic | \n", "Good | \n", "No | \n", "Standard | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "... | \n", "9.0 | \n", "2.0 | \n", "3.0 | \n", "1.0 | \n", "1071.0 | \n", "33.0 | \n", "314.0 | \n", "62.0 | \n", "53.0 | \n", "400.0 | \n", "
5 rows × 31 columns
\n", "\n", " | Annual_Income | \n", "Num_Bank_Accounts | \n", "Num_Credit_Card | \n", "Interest_Rate | \n", "Num_of_Loan | \n", "Delay_from_due_date | \n", "Num_of_Delayed_Payment | \n", "Changed_Credit_Limit | \n", "Outstanding_Debt | \n", "Credit_History_Age | \n", "
---|---|---|---|---|---|---|---|---|---|---|
count | \n", "9943.00 | \n", "9943.00 | \n", "9943.00 | \n", "9943.00 | \n", "9943.00 | \n", "9943.00 | \n", "9943.00 | \n", "9943.00 | \n", "9943.00 | \n", "9943.00 | \n", "
mean | \n", "51265.81 | \n", "5.28 | \n", "5.48 | \n", "14.14 | \n", "3.43 | \n", "20.51 | \n", "13.06 | \n", "10.18 | \n", "1379.43 | \n", "225.71 | \n", "
std | \n", "38778.67 | \n", "2.60 | \n", "2.04 | \n", "8.69 | \n", "2.42 | \n", "14.53 | \n", "6.21 | \n", "6.45 | \n", "1136.02 | \n", "99.17 | \n", "
min | \n", "7006.00 | \n", "0.00 | \n", "0.00 | \n", "1.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "0.00 | \n", "4.00 | \n", "
25% | \n", "19594.50 | \n", "3.00 | \n", "4.00 | \n", "7.00 | \n", "2.00 | \n", "10.00 | \n", "9.00 | \n", "5.00 | \n", "543.00 | \n", "152.00 | \n", "
50% | \n", "37580.00 | \n", "5.00 | \n", "5.00 | \n", "12.00 | \n", "3.00 | \n", "17.00 | \n", "13.00 | \n", "9.00 | \n", "1123.00 | \n", "224.00 | \n", "
75% | \n", "72899.50 | \n", "7.00 | \n", "7.00 | \n", "20.00 | \n", "5.00 | \n", "27.00 | \n", "18.00 | \n", "14.00 | \n", "1823.00 | \n", "306.00 | \n", "
max | \n", "179987.00 | \n", "10.00 | \n", "11.00 | \n", "34.00 | \n", "9.00 | \n", "62.00 | \n", "25.00 | \n", "30.00 | \n", "4998.00 | \n", "400.00 | \n", "
\n", " | Annual_Income | \n", "Num_Bank_Accounts | \n", "Num_Credit_Card | \n", "Interest_Rate | \n", "Num_of_Loan | \n", "Delay_from_due_date | \n", "Num_of_Delayed_Payment | \n", "Changed_Credit_Limit | \n", "Outstanding_Debt | \n", "Credit_History_Age | \n", "... | \n", "Occupation_Musician | \n", "Occupation_Scientist | \n", "Occupation_Teacher | \n", "Occupation_Writer | \n", "Credit_Mix_Bad | \n", "Credit_Mix_Good | \n", "Credit_Mix_Standard | \n", "Payment_of_Min_Amount_NM | \n", "Payment_of_Min_Amount_No | \n", "Payment_of_Min_Amount_Yes | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0.472788 | \n", "-1.25 | \n", "-0.333333 | \n", "-0.846154 | \n", "-0.333333 | \n", "-0.388889 | \n", "-0.111111 | \n", "-0.777778 | \n", "-0.640219 | \n", "1.116883 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "
1 | \n", "-0.074266 | \n", "-0.25 | \n", "-0.333333 | \n", "-0.307692 | \n", "-1.000000 | \n", "-0.388889 | \n", "-1.000000 | \n", "-0.444444 | \n", "-0.639437 | \n", "-0.051948 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "
2 | \n", "1.693351 | \n", "-0.50 | \n", "-0.666667 | \n", "-0.384615 | \n", "-1.000000 | \n", "0.277778 | \n", "0.111111 | \n", "1.222222 | \n", "-0.727770 | \n", "-0.129870 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
3 | \n", "-0.039409 | \n", "0.00 | \n", "0.000000 | \n", "-0.692308 | \n", "-0.666667 | \n", "-0.388889 | \n", "-1.333333 | \n", "0.000000 | \n", "-0.314247 | \n", "0.389610 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "
4 | \n", "0.402867 | \n", "1.00 | \n", "1.333333 | \n", "1.538462 | \n", "2.000000 | \n", "0.944444 | \n", "0.444444 | \n", "0.777778 | \n", "1.212429 | \n", "-0.961039 | \n", "... | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
5 rows × 30 columns
\n", "LGBMClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LGBMClassifier()