|
import pandas as pd |
|
import numpy as np |
|
from sklearn.impute import SimpleImputer |
|
from sklearn.preprocessing import StandardScaler |
|
|
|
class DataCleaner:
    """Clean the numeric columns of a pandas DataFrame in three steps.

    ``clean`` runs, in order: mean imputation of missing values, IQR-based
    outlier row removal, and standard scaling. Only columns selected by
    ``select_dtypes(include=[np.number])`` are modified; every other column
    is carried through unchanged.

    Every step returns a new DataFrame and leaves its argument untouched.
    The imputer and scaler are refit on each call (``fit_transform``), so
    statistics learned from one frame are not reused for the next.
    """

    def __init__(self):
        """Create the imputer and scaler used by the cleaning steps."""
        self.imputer = SimpleImputer(strategy='mean')
        self.scaler = StandardScaler()

    @staticmethod
    def _numeric_columns(
        data: pd.DataFrame, *, observed_only: bool = False
    ) -> pd.Index:
        """Return the labels of the numeric columns of *data*.

        With ``observed_only=True`` columns that hold no non-null value
        (all-NaN columns, or any column of a zero-row frame) are left out,
        because there is nothing to fit a mean or a variance on.
        """
        numeric = data.select_dtypes(include=[np.number])
        if not observed_only:
            return numeric.columns
        has_values = numeric.notna().any().to_numpy(dtype=bool)
        return numeric.columns[has_values]

    def clean(self, data: pd.DataFrame) -> pd.DataFrame:
        """Run the full pipeline and return the cleaned DataFrame.

        Args:
            data: Frame to clean. It is not modified.

        Returns:
            A new DataFrame with missing numeric values imputed, outlier
            rows removed and numeric columns standardized.
        """
        data = self.handle_missing_values(data)
        data = self.remove_outliers(data)
        data = self.normalize_data(data)
        return data

    def handle_missing_values(self, data: pd.DataFrame) -> pd.DataFrame:
        """Replace missing numeric values with their column mean.

        Args:
            data: Frame to impute. It is not modified.

        Returns:
            A copy of *data* whose numeric columns have been passed through
            ``self.imputer``. Numeric columns without any observed value are
            left as they are.
        """
        result = data.copy()
        # All-NaN columns are excluded: the mean strategy discards them by
        # default, so the transformed array would have fewer columns than
        # the selection it is assigned back to.
        columns = self._numeric_columns(result, observed_only=True)
        if columns.empty:
            # Nothing to fit on (no numeric columns or no rows); the
            # imputer would raise on an empty selection.
            return result
        result[columns] = self.imputer.fit_transform(result[columns])
        return result

    def remove_outliers(
        self, data: pd.DataFrame, *, iqr_multiplier: float = 1.5
    ) -> pd.DataFrame:
        """Drop rows whose numeric values fall outside the IQR fences.

        For each numeric column the fences are
        ``Q1 - iqr_multiplier * IQR`` and ``Q3 + iqr_multiplier * IQR``.
        Columns are processed one after another, so the quartiles of a
        column are computed on the rows that survived the previous columns.

        Missing values are not treated as outliers: a row is removed only
        when a value is strictly below the lower fence or strictly above
        the upper fence.

        Args:
            data: Frame to filter. It is not modified.
            iqr_multiplier: Width of the fences in units of the IQR.

        Returns:
            A new, independent DataFrame containing the retained rows.
        """
        result = data
        for column in self._numeric_columns(data):
            values = result[column]
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            spread = q3 - q1
            lower_bound = q1 - iqr_multiplier * spread
            upper_bound = q3 + iqr_multiplier * spread
            # Comparisons against NaN are False, so NaN values (and columns
            # whose quartiles are NaN) never mark a row as an outlier.
            is_outlier = (values < lower_bound) | (values > upper_bound)
            result = result[~is_outlier]
        # Copy so later column assignments on the result neither touch the
        # caller's frame nor warn about writing to a slice.
        return result.copy()

    def normalize_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """Standardize the numeric columns with ``self.scaler``.

        Args:
            data: Frame to scale. It is not modified.

        Returns:
            A copy of *data* whose numeric columns have been passed through
            ``self.scaler``. Numeric columns without any observed value are
            left as they are.
        """
        result = data.copy()
        columns = self._numeric_columns(result, observed_only=True)
        if columns.empty:
            # The scaler cannot be fitted on zero rows or zero columns.
            return result
        result[columns] = self.scaler.fit_transform(result[columns])
        return result