import pandas as pd
import numpy as np

# Identifier column spellings used across the datasets described in
# ``data_imp``: the insurance data uses "CustID" and the banking data uses
# "CUST_ID"; "Cust_ID" is kept because earlier versions of this module
# dropped it.
_ID_COLUMNS = ("CustID", "Cust_ID", "CUST_ID")


def data_imp() -> tuple:
    """Return feature descriptions and default feature values per dataset.

    Fresh dictionaries are built on every call, so callers may mutate the
    results without affecting later calls.

    Returns:
        A 6-tuple, in this order:
        ``(insurance_feature_descriptions, banking_feature_descriptions,
        retail_feature_descriptions, insurance_defaults, banking_defaults,
        retail_defaults)``.

        The description dicts map a column name to a human-readable
        sentence. The default dicts map a column name to a default value;
        they omit the identifier columns as well as ``EducDeg`` (insurance)
        and ``Channel`` / ``Region`` (retail).
    """
    insurance_feature_descriptions = {
        "CustID": "Unique identifier for each customer.",
        "FirstPolYear": "Year when the customer first bought an insurance policy.",
        "BirthYear": "Birth year of the customer, used to calculate age.",
        "EducDeg": "Highest educational degree obtained by the customer.",
        "MonthSal": "Monthly salary of the customer. (Numerical, float64)",
        "GeoLivArea": "Geographical area where the customer lives.",
        "Children": "Number of children the customer has.",
        "CustMonVal": "Total monetary value of the customer to the company.",
        "ClaimsRate": "Rate at which the customer files insurance claims.",
        "PremMotor": "Premium amount for motor insurance.",
        "PremHousehold": "Premium amount for household insurance.",
        "PremHealth": "Premium amount for health insurance.",
        "PremLife": "Premium amount for life insurance.",
        "PremWork": "Premium amount for work insurance.",
    }

    retail_feature_descriptions = {
        "Channel": "Indicates the sales channel through which the customer made purchases.",
        "Region": "The geographical region where the customer is located.",
        "Fresh": "Annual spending (in monetary units) on fresh products.",
        "Milk": "Annual spending (in monetary units) on milk products.",
        "Grocery": "Annual spending (in monetary units) on grocery items.",
        "Frozen": "Annual spending (in monetary units) on frozen products.",
        "Detergents_Paper": "Annual spending (in monetary units) on detergents and paper products.",
        "Delicassen": "Annual spending (in monetary units) on delicatessen products.",
    }

    banking_feature_descriptions = {
        "CUST_ID": "Unique identifier for each customer.",
        "BALANCE": "The average balance left in the customer's account.",
        "BALANCE_FREQUENCY": "Frequency with which the balance is updated.",
        "PURCHASES": "The total amount of purchases made by the customer.",
        "ONEOFF_PURCHASES": "The total amount of one-time purchases made by the customer.",
        "INSTALLMENTS_PURCHASES": "The total amount of purchases made in installments.",
        "CASH_ADVANCE": "The total amount of cash advances taken by the customer.",
        "PURCHASES_FREQUENCY": "The frequency of purchases made by the customer.",
        "ONEOFF_PURCHASES_FREQUENCY": "The frequency of one-time purchases made by the customer.",
        "PURCHASES_INSTALLMENTS_FREQUENCY": "The frequency of purchases made in installments.",
        "CASH_ADVANCE_FREQUENCY": "The frequency of cash advances taken by the customer.",
        "CASH_ADVANCE_TRX": "The number of cash advance transactions made by the customer.",
        "PURCHASES_TRX": "The number of purchase transactions made by the customer.",
        "CREDIT_LIMIT": "The credit limit assigned to the customer's account.",
        "PAYMENTS": "The total amount of payments made by the customer.",
        "MINIMUM_PAYMENTS": "The minimum amount of payments made by the customer.",
        "PRC_FULL_PAYMENT": "The percentage of full payments made by the customer.",
        "TENURE": "The tenure of the customer in months.",
    }

    # Default values for insurance dataset features
    insurance_defaults = {
        "FirstPolYear": 1999,
        "BirthYear": 1980,
        "MonthSal": 1000,
        "GeoLivArea": 0,  # Options: 0, 1, 2, 3
        "Children": 0,  # Options: 0, 1, 2
        "CustMonVal": 100,
        "ClaimsRate": 2.33,
        "PremMotor": 200,
        "PremHousehold": 200,
        "PremHealth": 200,
        "PremLife": 200,
        "PremWork": 200,
    }

    # Default values for banking dataset features
    banking_defaults = {
        "BALANCE": 2000,
        "BALANCE_FREQUENCY": 0.5,
        "PURCHASES": 500,
        "ONEOFF_PURCHASES": 0,
        "INSTALLMENTS_PURCHASES": 0,
        "CASH_ADVANCE": 200,
        "PURCHASES_FREQUENCY": 0.1,
        "ONEOFF_PURCHASES_FREQUENCY": 0.1,
        "PURCHASES_INSTALLMENTS_FREQUENCY": 0.5,
        "CASH_ADVANCE_FREQUENCY": 5,
        "CASH_ADVANCE_TRX": 5,
        "PURCHASES_TRX": 5,
        "CREDIT_LIMIT": 10000,
        "PAYMENTS": 500,
        "MINIMUM_PAYMENTS": 130,
        "PRC_FULL_PAYMENT": 0.22,
        "TENURE": 10,
    }

    # Default values for retail dataset features
    retail_defaults = {
        "Fresh": 6000,
        "Milk": 9000,
        "Grocery": 9000,
        "Frozen": 4000,
        "Detergents_Paper": 4000,
        "Delicassen": 2000,
    }

    return (
        insurance_feature_descriptions,
        banking_feature_descriptions,
        retail_feature_descriptions,
        insurance_defaults,
        banking_defaults,
        retail_defaults,
    )


def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Drop customer-identifier columns, then remove outlier rows.

    Args:
        data: Raw dataset. It is not modified; a new frame is returned.

    Returns:
        The frame without any of the ``CustID`` / ``Cust_ID`` / ``CUST_ID``
        columns that were present, filtered by ``remove_outliers`` with its
        default threshold.
    """
    # The banking dataset spells its identifier "CUST_ID"; previously only
    # "CustID" and "Cust_ID" were checked, so that column was never removed.
    id_columns = [name for name in _ID_COLUMNS if name in data.columns]
    data = data.drop(columns=id_columns)
    return remove_outliers(data)


def remove_outliers(df: pd.DataFrame, threshold=3) -> pd.DataFrame:
    """Keep rows whose numeric values all lie within ``threshold`` z-scores.

    Z-scores are computed per column from the column mean and the sample
    standard deviation (pandas' default for ``DataFrame.std``). Only columns
    selected by ``select_dtypes(include=[float, int])`` are considered;
    all other columns are carried through untouched.

    Args:
        df: Input frame. It is not modified.
        threshold: A row is kept only if ``|z| < threshold`` holds for each
            of its numeric values.

    Returns:
        The subset of ``df`` rows that pass the filter, all columns retained.
        Rows with a missing value in any numeric column are dropped.
    """
    numeric = df.select_dtypes(include=[float, int])
    spread = numeric.std()

    # A column whose standard deviation is zero (constant) or undefined
    # (e.g. a single-row frame) cannot contain outliers. Dividing by such a
    # spread would produce NaN z-scores and discard every row, so those
    # columns are left out of the comparison.
    has_spread = (spread > 0).to_numpy()
    centered = numeric - numeric.mean()
    z_scores = np.abs(centered.loc[:, has_spread] / spread[has_spread])

    within_threshold = (z_scores < threshold).all(axis=1)
    # Rows with a missing numeric value have always been dropped (NaN never
    # compares below the threshold); keep that explicit now that some
    # columns may be skipped above.
    complete = numeric.notna().all(axis=1)
    return df[within_threshold & complete]