import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


class DataCleaner:
    """Clean the numeric columns of a DataFrame.

    The pipeline run by :meth:`clean` is: mean-impute missing values,
    drop rows outside the 1.5 * IQR fences, then standardize with
    ``StandardScaler``. Non-numeric columns are carried through untouched.

    Every public method returns a new DataFrame; the caller's frame is
    never modified.
    """

    def __init__(self):
        # Both estimators are re-fitted on every call (fit_transform), so
        # after a call they hold the statistics of the most recent frame.
        self.imputer = SimpleImputer(strategy='mean')
        self.scaler = StandardScaler()

    def clean(self, data: pd.DataFrame) -> pd.DataFrame:
        """Run the full pipeline and return the cleaned copy of *data*.

        Args:
            data: Frame to clean. It is not modified.

        Returns:
            A new DataFrame with numeric columns imputed, outlier rows
            removed, and numeric columns standardized.
        """
        data = self.handle_missing_values(data)
        data = self.remove_outliers(data)
        data = self.normalize_data(data)
        return data

    def handle_missing_values(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *data* with numeric NaNs replaced by column means.

        Numeric columns that contain no observed value at all are left as
        they are: a mean is undefined for them, and ``SimpleImputer`` would
        discard them from its output, which breaks the assignment back into
        the frame.
        """
        # Work on a copy so the caller's frame is not half-processed.
        data = data.copy()
        numeric_columns = data.select_dtypes(include=[np.number]).columns

        # Keep only columns with at least one non-missing value. This also
        # yields an empty selection for a zero-row frame, which the imputer
        # cannot be fitted on.
        has_values = data[numeric_columns].notna().any(axis=0).to_numpy()
        fillable = numeric_columns[has_values]
        if len(fillable) == 0:
            return data

        data[fillable] = self.imputer.fit_transform(data[fillable])
        return data

    def remove_outliers(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *data* without rows outside the 1.5 * IQR fences.

        Columns are processed one after another, and each column's
        quartiles are computed on the rows that survived the previous
        columns. Rows whose value is missing in a checked column fail the
        range comparison and are dropped as well.
        """
        numeric_columns = data.select_dtypes(include=[np.number]).columns
        for column in numeric_columns:
            q1 = data[column].quantile(0.25)
            q3 = data[column].quantile(0.75)
            iqr = q3 - q1
            if pd.isna(iqr):
                # No observed values in this column (or no rows left):
                # the fences are undefined, so skip it instead of
                # discarding every row.
                continue
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            in_range = (data[column] >= lower_bound) & (data[column] <= upper_bound)
            data = data[in_range]
        # Hand back an independent frame so later column assignment does
        # not operate on (or warn about) a slice of the input.
        return data.copy()

    def normalize_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *data* with numeric columns standardized.

        Frames with no numeric columns or no rows are returned unchanged
        (as a copy), because the scaler cannot be fitted on them.
        """
        data = data.copy()
        numeric_columns = data.select_dtypes(include=[np.number]).columns
        if len(numeric_columns) == 0 or data.empty:
            return data

        data[numeric_columns] = self.scaler.fit_transform(data[numeric_columns])
        return data