Spaces:

bacancydataprophets
/

Customer-Segmentation

Sleeping

App Files Files Community

Customer-Segmentation / data_preparation.py

simran0608

Update data_preparation.py

0a4a484 verified 4 months ago

raw

history blame

3.23 kB

	import pandas as pd
	import numpy as np
	def data_imp():
	    """Build the feature descriptions and per-dataset default values.

	    Returns:
	        A 4-tuple ``(feature_descriptions, insurance_defaults,
	        banking_defaults, retail_defaults)``. Each element is a newly
	        created dict keyed by feature name, so callers may mutate the
	        result without affecting later calls.
	    """
	    # Text description per feature name; the keys overlap with the
	    # insurance defaults below (plus "CustID" and "EducDeg").
	    descriptions = dict([
	        ("CustID", "Unique identifier for each customer."),
	        ("FirstPolYear", "Year when the customer first bought an insurance policy."),
	        ("BirthYear", "Birth year of the customer, used to calculate age."),
	        ("EducDeg", "Highest educational degree obtained by the customer."),
	        ("MonthSal", "Monthly salary of the customer. (Numerical, float64)"),
	        ("GeoLivArea", "Geographical area where the customer lives."),
	        ("Children", "Number of children the customer has."),
	        ("CustMonVal", "Total monetary value of the customer to the company."),
	        ("ClaimsRate", "Rate at which the customer files insurance claims."),
	        ("PremMotor", "Premium amount for motor insurance."),
	        ("PremHousehold", "Premium amount for household insurance."),
	        ("PremHealth", "Premium amount for health insurance."),
	        ("PremLife", "Premium amount for life insurance."),
	        ("PremWork", "Premium amount for work insurance."),
	    ])

	    # Default values for insurance dataset features.
	    insurance = dict(
	        FirstPolYear=1999,
	        BirthYear=1980,
	        MonthSal=1000,
	        GeoLivArea=0,  # Options: 0, 1, 2, 3
	        Children=0,  # Options: 0, 1, 2
	        CustMonVal=100,
	        ClaimsRate=2.33,
	        PremMotor=200,
	        PremHousehold=200,
	        PremHealth=200,
	        PremLife=200,
	        PremWork=200,
	    )

	    # Default values for banking dataset features.
	    banking = dict(
	        BALANCE=2000,
	        BALANCE_FREQUENCY=0.5,
	        PURCHASES=500,
	        ONEOFF_PURCHASES=0,
	        INSTALLMENTS_PURCHASES=0,
	        CASH_ADVANCE=200,
	        PURCHASES_FREQUENCY=0.1,
	        ONEOFF_PURCHASES_FREQUENCY=0.1,
	        PURCHASES_INSTALLMENTS_FREQUENCY=0.5,
	        CASH_ADVANCE_FREQUENCY=5,
	        CASH_ADVANCE_TRX=5,
	        PURCHASES_TRX=5,
	        CREDIT_LIMIT=10000,
	        PAYMENTS=500,
	        MINIMUM_PAYMENTS=130,
	        PRC_FULL_PAYMENT=0.22,
	        TENURE=10,
	    )

	    # Default values for retail dataset features.
	    retail = dict(
	        Fresh=6000,
	        Milk=9000,
	        Grocery=9000,
	        Frozen=4000,
	        Detergents_Paper=4000,
	        Delicassen=2000,
	    )

	    return descriptions, insurance, banking, retail

	def preprocess_data(data):
	    """Drop the CustID, Channel and Region columns, then remove outlier rows.

	    Args:
	        data: DataFrame to clean. It is not modified; ``drop`` returns a
	            new frame each time a column is removed.

	    Returns:
	        The result of ``remove_outliers`` applied to the frame without the
	        listed columns (columns that are absent are simply skipped).
	    """
	    for unwanted in ("CustID", "Channel", "Region"):
	        if unwanted in data.columns:
	            data = data.drop(columns=[unwanted])

	    return remove_outliers(data)

	def remove_outliers(df: pd.DataFrame, threshold: float = 3) -> pd.DataFrame:
	    """Drop rows that hold a numeric value too far from its column mean.

	    For every float/int column a z-score ``|x - mean| / std`` is computed
	    (sample standard deviation, pandas' default). A row is kept only when
	    all of its z-scores are strictly below ``threshold``.

	    Args:
	        df: Input frame. Non-numeric columns are ignored by the test but
	            are carried through to the result unchanged.
	        threshold: Exclusive upper bound on the absolute z-score.

	    Returns:
	        The rows of ``df`` that pass the test, with all original columns.
	        Rows with a missing value in any numeric column are dropped.
	    """
	    numeric = df.select_dtypes(include=[float, int])
	    spread = numeric.std()

	    # A constant column (std == 0) or one with fewer than two values
	    # (std is NaN) produces NaN z-scores; NaN < threshold is False, which
	    # previously discarded every row. Such columns cannot contain an
	    # outlier, so they are left out of the z-score test.
	    scorable = spread[spread > 0].index
	    scored = numeric.loc[:, scorable]
	    z_scores = ((scored - scored.mean()) / spread[scorable]).abs()
	    within_range = (z_scores < threshold).all(axis=1)

	    # The original comparison also removed rows with missing numeric
	    # values (their z-score is NaN). Keep that behaviour for every numeric
	    # column, including the ones skipped above.
	    complete = numeric.notna().all(axis=1)

	    return df[within_range & complete]