|
import pandas as pd |
|
import numpy as np |
|
from sklearn.impute import SimpleImputer |
|
from sklearn.preprocessing import StandardScaler |
|
|
|
class DataProcessor:
    """Clean a pandas DataFrame and inspect it for missing values and outliers.

    The working frame lives in ``self.df``. ``clean_data`` rebinds that
    attribute to the cleaned result; the inspection methods only read it.
    """

    def __init__(self, df):
        """Store ``df`` as the frame the other methods operate on."""
        self.df = df

    def _columns_with_data(self, dtypes):
        """Return the names of columns of ``dtypes`` holding a non-null value."""
        selected = self.df.select_dtypes(include=dtypes)
        return [
            name
            for name in selected.columns
            if selected[name].notna().to_numpy().any()
        ]

    def clean_data(self):
        """Deduplicate rows, impute missing values and standardise numerics.

        Steps, in order:

        1. Drop duplicate rows.
        2. Fill missing numeric values with the column mean, then scale the
           numeric columns with ``StandardScaler``.
        3. Fill missing values in ``object`` columns with the most frequent
           value.

        Columns that contain no observed value at all (including every
        column of a frame with zero rows) are left untouched: the imputers
        cannot learn a fill value for them, and fitting on an empty column
        selection is rejected by scikit-learn.

        Returns:
            The cleaned frame, which is also stored in ``self.df``.
        """
        self.df = self.df.drop_duplicates()

        numeric_columns = self._columns_with_data([np.number])
        categorical_columns = self._columns_with_data(['object'])

        # Each step is skipped when it has nothing to work on, so frames
        # lacking numeric or object columns are handled instead of failing
        # inside the estimator's input validation.
        if numeric_columns:
            num_imputer = SimpleImputer(strategy='mean')
            self.df[numeric_columns] = num_imputer.fit_transform(
                self.df[numeric_columns]
            )

            scaler = StandardScaler()
            self.df[numeric_columns] = scaler.fit_transform(
                self.df[numeric_columns]
            )

        if categorical_columns:
            cat_imputer = SimpleImputer(strategy='most_frequent')
            self.df[categorical_columns] = cat_imputer.fit_transform(
                self.df[categorical_columns]
            )

        return self.df

    def get_columns_with_missing_values(self):
        """Return a list of the column names that contain a null value."""
        return self.df.columns[self.df.isnull().any()].tolist()

    def detect_outliers(self, column, method='zscore', threshold=3):
        """Return the rows of ``self.df`` whose ``column`` value is an outlier.

        Args:
            column: Name of the column to examine.
            method: ``'zscore'`` flags rows whose absolute z-score exceeds
                ``threshold``; ``'iqr'`` flags rows lying more than
                1.5 * IQR below the first quartile or above the third.
            threshold: Z-score cut-off. Only used by the ``'zscore'`` method.

        Returns:
            A DataFrame containing the outlying rows (possibly empty).

        Raises:
            ValueError: If ``method`` is neither ``'zscore'`` nor ``'iqr'``.
        """
        if method not in ('zscore', 'iqr'):
            # Previously an unrecognised method fell through and returned
            # None, which only surfaced later as a confusing error.
            raise ValueError(
                f"Unknown outlier detection method {method!r}; "
                "expected 'zscore' or 'iqr'"
            )

        values = self.df[column]

        if method == 'zscore':
            # A constant column has zero standard deviation; the resulting
            # NaN scores compare False, so no row is reported.
            z_scores = np.abs((values - values.mean()) / values.std())
            return self.df[z_scores > threshold]

        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        return self.df[(values < lower_bound) | (values > upper_bound)]