import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


class DataProcessor:
    """Clean a pandas DataFrame and inspect it for missing values and outliers.

    The frame is held in ``self.df``. ``clean_data`` rebinds ``self.df`` to a
    new, cleaned frame; the other methods only read it.
    """

    def __init__(self, df):
        """Store *df* as the frame to operate on (no copy is made here)."""
        self.df = df

    def _columns_with_data(self, include):
        """Return labels of columns matching *include* that hold at least one value.

        Columns that are entirely missing are left out because
        ``SimpleImputer`` drops such columns from its output, which would make
        the transformed array narrower than the columns it is assigned back to.
        """
        candidates = self.df.select_dtypes(include=include)
        has_data = candidates.notna().any().to_numpy(dtype=bool)
        return candidates.columns[has_data].tolist()

    def clean_data(self):
        """Deduplicate rows, impute missing values and standardize numeric columns.

        Steps, in order:

        1. Drop duplicate rows.
        2. Fill missing values in numeric columns with the column mean.
        3. Fill missing values in ``object`` columns with the most frequent value.
        4. Standardize the numeric columns with ``StandardScaler``.

        Columns with no observed values at all are skipped by steps 2-4 and
        keep their missing values. A step with no columns to work on is
        skipped instead of raising.

        Returns:
            The cleaned DataFrame, which is also stored in ``self.df``.
        """
        # Explicit copy: the column assignments below then write to an
        # independent frame instead of a possibly-flagged slice of the input.
        self.df = self.df.drop_duplicates().copy()

        numeric_columns = self._columns_with_data(include=[np.number])
        categorical_columns = self._columns_with_data(include=['object'])

        # The scikit-learn transformers reject zero-feature input, so every
        # step is guarded by a check that there is something to transform.
        if numeric_columns:
            num_imputer = SimpleImputer(strategy='mean')
            self.df[numeric_columns] = num_imputer.fit_transform(
                self.df[numeric_columns]
            )

        if categorical_columns:
            cat_imputer = SimpleImputer(strategy='most_frequent')
            self.df[categorical_columns] = cat_imputer.fit_transform(
                self.df[categorical_columns]
            )

        if numeric_columns:
            scaler = StandardScaler()
            self.df[numeric_columns] = scaler.fit_transform(
                self.df[numeric_columns]
            )

        return self.df

    def get_columns_with_missing_values(self):
        """Return a list of the column labels that contain at least one null."""
        return self.df.columns[self.df.isnull().any()].tolist()

    def detect_outliers(self, column, method='zscore', threshold=3, *,
                        iqr_multiplier=1.5):
        """Return the rows of ``self.df`` whose *column* value is an outlier.

        Args:
            column: Label of the column to test.
            method: ``'zscore'`` flags rows whose absolute z-score exceeds
                *threshold*; ``'iqr'`` flags rows lying outside
                ``[Q1 - k * IQR, Q3 + k * IQR]`` with ``k = iqr_multiplier``.
            threshold: Z-score cut-off. Only used when ``method='zscore'``.
            iqr_multiplier: Fence width in IQR units. Only used when
                ``method='iqr'``.

        Returns:
            A DataFrame holding the outlier rows (possibly empty).

        Raises:
            ValueError: If *method* is neither ``'zscore'`` nor ``'iqr'``.
        """
        if method not in ('zscore', 'iqr'):
            raise ValueError(
                f"Unknown outlier detection method {method!r}; "
                "expected 'zscore' or 'iqr'"
            )

        values = self.df[column]

        if method == 'zscore':
            # A constant column has zero spread, so its z-scores are NaN;
            # NaN never compares greater than the threshold, so no row is
            # flagged in that case.
            z_scores = np.abs((values - values.mean()) / values.std())
            return self.df[z_scores > threshold]

        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr
        return self.df[(values < lower_bound) | (values > upper_bound)]