|
import pandas as pd |
|
import numpy as np |
|
def data_imp():
    """Return the feature descriptions and the per-dataset default values.

    All four dictionaries are rebuilt on every call, so a caller may mutate
    the returned objects without affecting later calls.

    Returns:
        A 4-tuple ``(feature_descriptions, insurance_defaults,
        banking_defaults, retail_defaults)``:

        * ``feature_descriptions`` maps each insurance column name to a
          human-readable description.
        * ``insurance_defaults`` maps insurance column names to a default
          value (it has no entry for ``CustID`` or ``EducDeg``).
        * ``banking_defaults`` maps banking column names to a default value.
        * ``retail_defaults`` maps retail column names to a default value.
    """
    feature_descriptions = {
        "CustID": "Unique identifier for each customer.",
        "FirstPolYear": "Year when the customer first bought an insurance policy.",
        "BirthYear": "Birth year of the customer, used to calculate age.",
        "EducDeg": "Highest educational degree obtained by the customer.",
        "MonthSal": "Monthly salary of the customer. (Numerical, float64)",
        "GeoLivArea": "Geographical area where the customer lives.",
        "Children": "Number of children the customer has.",
        "CustMonVal": "Total monetary value of the customer to the company.",
        "ClaimsRate": "Rate at which the customer files insurance claims.",
        "PremMotor": "Premium amount for motor insurance.",
        "PremHousehold": "Premium amount for household insurance.",
        "PremHealth": "Premium amount for health insurance.",
        "PremLife": "Premium amount for life insurance.",
        "PremWork": "Premium amount for work insurance.",
    }

    insurance_defaults = {
        "FirstPolYear": 1999,
        "BirthYear": 1980,
        "MonthSal": 1000,
        "GeoLivArea": 0,
        "Children": 0,
        "CustMonVal": 100,
        "ClaimsRate": 2.33,
        "PremMotor": 200,
        "PremHousehold": 200,
        "PremHealth": 200,
        "PremLife": 200,
        "PremWork": 200,
    }

    banking_defaults = {
        "BALANCE": 2000,
        "BALANCE_FREQUENCY": 0.5,
        "PURCHASES": 500,
        "ONEOFF_PURCHASES": 0,
        "INSTALLMENTS_PURCHASES": 0,
        "CASH_ADVANCE": 200,
        "PURCHASES_FREQUENCY": 0.1,
        "ONEOFF_PURCHASES_FREQUENCY": 0.1,
        "PURCHASES_INSTALLMENTS_FREQUENCY": 0.5,
        # NOTE(review): every other *_FREQUENCY default here lies in [0, 1];
        # 5 looks like it may have been meant for a count -- confirm.
        "CASH_ADVANCE_FREQUENCY": 5,
        "CASH_ADVANCE_TRX": 5,
        "PURCHASES_TRX": 5,
        "CREDIT_LIMIT": 10000,
        "PAYMENTS": 500,
        "MINIMUM_PAYMENTS": 130,
        "PRC_FULL_PAYMENT": 0.22,
        "TENURE": 10,
    }

    retail_defaults = {
        "Fresh": 6000,
        "Milk": 9000,
        "Grocery": 9000,
        "Frozen": 4000,
        "Detergents_Paper": 4000,
        "Delicassen": 2000,
    }

    return feature_descriptions, insurance_defaults, banking_defaults, retail_defaults
|
|
|
def preprocess_data(data):
    """Drop the CustID, Channel and Region columns, then remove outlier rows.

    Args:
        data: A pandas DataFrame. Any of the columns ``CustID``, ``Channel``
            and ``Region`` that are present are removed; absent ones are
            silently skipped.

    Returns:
        The result of passing the remaining columns to ``remove_outliers``
        with its default threshold. The input frame is not modified.
    """
    # errors="ignore" makes the drop a no-op for columns that do not exist,
    # which replaces a separate membership check per column.
    trimmed = data.drop(columns=["CustID", "Channel", "Region"], errors="ignore")
    return remove_outliers(trimmed)
|
|
|
def remove_outliers(df, threshold=3):
    """Return the rows of ``df`` whose numeric values all lie within
    ``threshold`` standard deviations of their column mean.

    Only float and int columns are scored; other columns are carried through
    unchanged and never cause a row to be dropped.

    Args:
        df: A pandas DataFrame.
        threshold: Maximum absolute z-score (exclusive) a value may have for
            its row to be kept. Defaults to 3.

    Returns:
        A DataFrame containing the rows of ``df`` that pass the check, with
        their original index labels and all original columns.

    Notes:
        * Columns whose standard deviation is zero (constant) or undefined
          (e.g. a single-row frame) are skipped: their z-scores would be NaN,
          and since ``NaN < threshold`` is False every row would be rejected.
        * A row with a missing value in a scored column is still dropped,
          because its z-score is NaN and fails the comparison.
    """
    numeric = df.select_dtypes(include=[float, int])
    spread = numeric.std()

    # Keep only columns with a usable spread. ``NaN > 0`` is False, so
    # undefined standard deviations are excluded together with zero ones.
    scorable = (spread > 0).to_numpy()
    numeric = numeric.loc[:, scorable]
    spread = spread[scorable]

    z_scores = ((numeric - numeric.mean()) / spread).abs()

    # With no scorable columns ``all`` over an empty row is True, so every
    # row is kept.
    keep = (z_scores < threshold).all(axis=1)
    return df[keep]
|
|