simran0608 committed on
Commit
0a4a484
1 Parent(s): 1107b8f

Update data_preparation.py

Browse files
Files changed (1) hide show
  1. data_preparation.py +83 -20
data_preparation.py CHANGED
@@ -1,20 +1,83 @@
1
- import pandas as pd
2
- import numpy as np
3
-
4
- def preprocess_data(data):
5
- if 'CustID' in data.columns:
6
- data = data.drop(columns=['CustID'])
7
- if 'Channel' in data.columns:
8
- data = data.drop(columns=['Channel'])
9
- if 'Region' in data.columns:
10
- data = data.drop(columns=['Region'])
11
-
12
-
13
- data = remove_outliers(data)
14
- return data
15
-
16
- def remove_outliers(df, threshold=3):
17
- df_numeric = df.select_dtypes(include=[float, int])
18
- z_scores = np.abs((df_numeric - df_numeric.mean()) / df_numeric.std())
19
- df_clean = df[(z_scores < threshold).all(axis=1)]
20
- return df_clean
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
def data_imp():
    """Return the feature descriptions and per-dataset default feature values.

    Returns:
        A 4-tuple ``(feature_descriptions, insurance_defaults,
        banking_defaults, retail_defaults)``. Each element is a freshly
        built ``dict`` keyed by feature name, so callers may mutate the
        result without affecting later calls.
    """
    # Human-readable text for the insurance dataset's columns.
    descriptions = dict([
        ("CustID", "Unique identifier for each customer."),
        ("FirstPolYear", "Year when the customer first bought an insurance policy."),
        ("BirthYear", "Birth year of the customer, used to calculate age."),
        ("EducDeg", "Highest educational degree obtained by the customer."),
        ("MonthSal", "Monthly salary of the customer. (Numerical, float64)"),
        ("GeoLivArea", "Geographical area where the customer lives."),
        ("Children", "Number of children the customer has."),
        ("CustMonVal", "Total monetary value of the customer to the company."),
        ("ClaimsRate", "Rate at which the customer files insurance claims."),
        ("PremMotor", "Premium amount for motor insurance."),
        ("PremHousehold", "Premium amount for household insurance."),
        ("PremHealth", "Premium amount for health insurance."),
        ("PremLife", "Premium amount for life insurance."),
        ("PremWork", "Premium amount for work insurance."),
    ])

    # Default values for insurance dataset features. There is no entry for
    # "CustID" or "EducDeg", although both are described above.
    insurance = dict(
        FirstPolYear=1999,
        BirthYear=1980,
        MonthSal=1000,
        GeoLivArea=0,  # Options: 0, 1, 2, 3
        Children=0,  # Options: 0, 1, 2
        CustMonVal=100,
        ClaimsRate=2.33,
        PremMotor=200,
        PremHousehold=200,
        PremHealth=200,
        PremLife=200,
        PremWork=200,
    )

    # Default values for banking dataset features.
    banking = dict(
        BALANCE=2000,
        BALANCE_FREQUENCY=0.5,
        PURCHASES=500,
        ONEOFF_PURCHASES=0,
        INSTALLMENTS_PURCHASES=0,
        CASH_ADVANCE=200,
        PURCHASES_FREQUENCY=0.1,
        ONEOFF_PURCHASES_FREQUENCY=0.1,
        PURCHASES_INSTALLMENTS_FREQUENCY=0.5,
        # NOTE(review): the other *_FREQUENCY defaults are fractions; 5 may
        # be unintended -- confirm against the dataset before changing it.
        CASH_ADVANCE_FREQUENCY=5,
        CASH_ADVANCE_TRX=5,
        PURCHASES_TRX=5,
        CREDIT_LIMIT=10000,
        PAYMENTS=500,
        MINIMUM_PAYMENTS=130,
        PRC_FULL_PAYMENT=0.22,
        TENURE=10,
    )

    # Default values for retail dataset features.
    retail = dict(
        Fresh=6000,
        Milk=9000,
        Grocery=9000,
        Frozen=4000,
        Detergents_Paper=4000,
        Delicassen=2000,
    )

    return (descriptions, insurance, banking, retail)
66
+
67
def preprocess_data(data):
    """Drop the CustID, Channel and Region columns, then filter outlier rows.

    Args:
        data: DataFrame to clean. Any of the columns ``CustID``,
            ``Channel`` and ``Region`` that are present are removed;
            absent ones are ignored.

    Returns:
        The result of ``remove_outliers`` applied to the remaining columns.
    """
    # Only ask pandas to drop columns that exist, so a frame without any of
    # them is handed on untouched.
    present = [name for name in ('CustID', 'Channel', 'Region') if name in data.columns]
    if present:
        data = data.drop(columns=present)
    return remove_outliers(data)
78
+
79
def remove_outliers(df, threshold=3):
    """Return the rows of ``df`` that contain no z-score outlier.

    A row is kept when, for every float/int column that actually varies,
    the absolute z-score ``|x - mean| / std`` is below ``threshold``.

    Columns whose standard deviation is zero or undefined (constant
    columns, all-NaN columns, single-row frames) are ignored: they would
    yield NaN z-scores, which never compare below the threshold and would
    therefore discard every row.

    Rows with a missing value in a varying numeric column are still
    dropped, because their z-score is NaN.

    Args:
        df: DataFrame to filter. Non-numeric columns are carried through
            unchanged and do not influence the filtering.
        threshold: Exclusive upper bound on the absolute z-score.

    Returns:
        A new DataFrame holding the retained rows with all original columns.
    """
    df_numeric = df.select_dtypes(include=[float, int])

    # ``NaN > 0`` is False, so columns with an undefined std are skipped too.
    has_spread = (df_numeric.std() > 0).to_numpy()
    informative = df_numeric.loc[:, has_spread]
    if informative.shape[1] == 0:
        # Nothing to score against: every row is kept.
        return df.copy()

    z_scores = np.abs((informative - informative.mean()) / informative.std())
    return df[(z_scores < threshold).all(axis=1)]