hackathon / data_cleaning.py
Ashar086's picture
Create data_cleaning.py
1573ecb verified
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
class DataCleaner:
    """Impute, de-outlier and standardize the numeric columns of a DataFrame.

    Non-numeric columns are carried through untouched. Every public method
    returns a new DataFrame and leaves the frame passed in unmodified.
    """

    def __init__(self):
        # Missing numeric values are replaced with the mean of their column.
        self.imputer = SimpleImputer(strategy='mean')
        # Numeric columns are standardized (mean removed, scaled to unit
        # variance) with statistics fitted on the data being cleaned.
        self.scaler = StandardScaler()

    def clean(self, data):
        """Run the full pipeline: impute, drop outlier rows, then standardize.

        Args:
            data: pandas DataFrame to clean. It is not modified.

        Returns:
            A new DataFrame with the same columns and possibly fewer rows.
        """
        # Order matters: imputation first so the IQR bounds are computed on
        # complete columns, scaling last so it is fitted without the outliers.
        data = self.handle_missing_values(data)
        data = self.remove_outliers(data)
        data = self.normalize_data(data)
        return data

    def handle_missing_values(self, data):
        """Fill missing values in numeric columns with the column mean.

        Numeric columns that contain no observed value at all are left as
        they are: there is no mean to fill them with, and the mean imputer
        would drop them from its output, which breaks the column assignment.

        Args:
            data: pandas DataFrame. It is not modified.

        Returns:
            A copy of ``data`` with the imputable numeric columns filled.
        """
        cleaned = data.copy()
        numeric_columns = cleaned.select_dtypes(include=[np.number]).columns
        imputable = [
            column for column in numeric_columns
            if cleaned[column].notna().any()
        ]
        # Nothing to fit on (no numeric columns, no rows, or only all-missing
        # columns): the imputer rejects such input, so return the copy as is.
        if not imputable:
            return cleaned
        cleaned[imputable] = self.imputer.fit_transform(cleaned[imputable])
        return cleaned

    def remove_outliers(self, data):
        """Drop rows whose numeric values fall outside the 1.5 * IQR fences.

        Columns are processed one after another, so the quartiles of each
        column are computed on the rows that survived the previous columns.
        Missing values are not treated as outliers; rows containing them are
        kept.

        Args:
            data: pandas DataFrame. It is not modified.

        Returns:
            A new DataFrame holding only the rows that passed every column's
            check, with the original index labels preserved.
        """
        numeric_columns = data.select_dtypes(include=[np.number]).columns
        for column in numeric_columns:
            values = data[column]
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            reach = 1.5 * (q3 - q1)
            # ``between`` is inclusive on both ends, matching >= / <=.
            within_fences = values.between(q1 - reach, q3 + reach)
            # NaN compares False against any bound; keep those rows explicitly
            # instead of discarding them as if they were outliers.
            data = data[within_fences | values.isna()]
        # Hand back an independent frame so later column assignments do not
        # operate on a slice of the caller's data.
        return data.copy()

    def normalize_data(self, data):
        """Standardize the numeric columns with the instance's scaler.

        The scaler is re-fitted on ``data`` at every call.

        Args:
            data: pandas DataFrame. It is not modified.

        Returns:
            A copy of ``data`` whose numeric columns have been standardized.
        """
        scaled = data.copy()
        numeric_columns = scaled.select_dtypes(include=[np.number]).columns
        # The scaler cannot be fitted on zero rows or zero columns.
        if len(scaled) == 0 or len(numeric_columns) == 0:
            return scaled
        scaled[numeric_columns] = self.scaler.fit_transform(scaled[numeric_columns])
        return scaled