# File size: 1,742 Bytes
# 93fbf36 a7c36a5 93fbf36 a7c36a5 93fbf36 a7c36a5 |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
class DataProcessor:
    """Clean a pandas DataFrame and inspect it for missing values and outliers.

    The frame being processed lives in ``self.df``. ``clean_data`` rebinds
    ``self.df`` to the cleaned result; the other methods only read it.
    """

    def __init__(self, df: pd.DataFrame) -> None:
        """Store *df* as the frame to operate on."""
        self.df = df

    def clean_data(self) -> pd.DataFrame:
        """Deduplicate, impute and standardize ``self.df``.

        Steps, in order:

        1. Drop duplicate rows.
        2. Fill missing values in numeric columns with the column mean.
        3. Fill missing values in ``object``-dtype columns with the most
           frequent value.
        4. Standardize the numeric columns with ``StandardScaler``.

        Steps that have no columns to work on are skipped, and a frame with
        no rows or no columns is returned right after de-duplication, because
        scikit-learn estimators reject input with zero samples or zero
        features.

        Returns:
            The cleaned DataFrame (the same object now bound to ``self.df``).
        """
        self.df = self.df.drop_duplicates()

        # Nothing to impute or scale; fitting on an empty frame would raise.
        if self.df.empty:
            return self.df

        numeric_columns = self.df.select_dtypes(include=[np.number]).columns
        categorical_columns = self.df.select_dtypes(include=['object']).columns

        if len(numeric_columns) > 0:
            # NOTE(review): by default SimpleImputer drops a column that is
            # entirely NaN, which would make this assignment fail on a shape
            # mismatch -- confirm such columns cannot reach this point.
            num_imputer = SimpleImputer(strategy='mean')
            self.df[numeric_columns] = num_imputer.fit_transform(self.df[numeric_columns])

        if len(categorical_columns) > 0:
            cat_imputer = SimpleImputer(strategy='most_frequent')
            self.df[categorical_columns] = cat_imputer.fit_transform(
                self.df[categorical_columns]
            )

        if len(numeric_columns) > 0:
            # Scaling runs after imputation so the scaler never sees NaN.
            scaler = StandardScaler()
            self.df[numeric_columns] = scaler.fit_transform(self.df[numeric_columns])

        return self.df

    def get_columns_with_missing_values(self) -> list:
        """Return the labels of all columns that contain at least one null."""
        has_missing = self.df.isnull().any()
        return self.df.columns[has_missing].tolist()

    def detect_outliers(self, column, method='zscore', threshold=3):
        """Return the rows of ``self.df`` whose *column* value is an outlier.

        Args:
            column: Label of the column to examine.
            method: ``'zscore'`` flags rows whose absolute z-score exceeds
                *threshold*; ``'iqr'`` flags rows lying more than 1.5 times
                the interquartile range below Q1 or above Q3.
            threshold: Z-score cut-off. Only used when *method* is
                ``'zscore'``.

        Returns:
            A DataFrame containing the outlying rows (possibly empty).

        Raises:
            ValueError: If *method* is neither ``'zscore'`` nor ``'iqr'``.
        """
        values = self.df[column]

        if method == 'zscore':
            z_scores = np.abs((values - values.mean()) / values.std())
            return self.df[z_scores > threshold]

        if method == 'iqr':
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            return self.df[(values < lower_bound) | (values > upper_bound)]

        # Previously an unknown method silently returned None.
        raise ValueError(
            f"Unknown outlier detection method {method!r}; expected 'zscore' or 'iqr'"
        )