File size: 1,742 Bytes
93fbf36
 
a7c36a5
 
93fbf36
 
 
 
 
a7c36a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93fbf36
 
 
a7c36a5
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

class DataProcessor:
    """Cleaning and outlier-inspection helpers around a pandas DataFrame.

    The wrapped frame is kept in ``self.df``; ``clean_data`` rebinds that
    attribute to the cleaned result.
    """

    def __init__(self, df):
        """Store *df* as the frame the other methods operate on."""
        self.df = df

    def _columns_with_values(self, columns):
        """Return the entries of *columns* that hold at least one non-missing value."""
        return [col for col in columns if self.df[col].notna().any()]

    def clean_data(self):
        """Deduplicate, impute and standardize ``self.df``.

        Steps, in order:

        1. Drop duplicate rows.
        2. Fill missing values in numeric columns with the column mean.
        3. Fill missing values in ``object`` columns with the most
           frequent value.
        4. Standardize the numeric columns with ``StandardScaler``.

        Columns that contain no observed value at all are left untouched,
        because no mean or mode can be derived for them. A frame that is
        empty after deduplication is returned as is.

        Returns:
            The cleaned DataFrame, which is also stored back on ``self.df``.
        """
        self.df = self.df.drop_duplicates()

        # The scikit-learn estimators reject input without rows or columns,
        # and there is nothing to clean in that case anyway.
        if self.df.empty:
            return self.df

        # Restrict each group to columns with at least one observed value:
        # the imputer cannot derive a statistic for a fully-missing column
        # and would hand back fewer columns than were assigned to.
        numeric_columns = self._columns_with_values(
            self.df.select_dtypes(include=[np.number]).columns
        )
        categorical_columns = self._columns_with_values(
            self.df.select_dtypes(include=['object']).columns
        )

        # Each group is skipped when empty, since fitting on a zero-column
        # selection raises instead of being a no-op.
        if categorical_columns:
            cat_imputer = SimpleImputer(strategy='most_frequent')
            self.df[categorical_columns] = cat_imputer.fit_transform(
                self.df[categorical_columns]
            )

        if numeric_columns:
            num_imputer = SimpleImputer(strategy='mean')
            self.df[numeric_columns] = num_imputer.fit_transform(
                self.df[numeric_columns]
            )

            # Standardize only after imputation so the scaler sees no gaps.
            scaler = StandardScaler()
            self.df[numeric_columns] = scaler.fit_transform(
                self.df[numeric_columns]
            )

        return self.df

    def get_columns_with_missing_values(self):
        """Return the names of the columns of ``self.df`` containing any null value."""
        return self.df.columns[self.df.isnull().any()].tolist()

    def detect_outliers(self, column, method='zscore', threshold=3):
        """Return the rows of ``self.df`` whose *column* value is an outlier.

        Args:
            column: Name of the column to inspect.
            method: ``'zscore'`` flags rows whose absolute z-score exceeds
                *threshold*; ``'iqr'`` flags rows lying more than 1.5 times
                the interquartile range below the first or above the third
                quartile.
            threshold: Z-score cut-off. Only used when *method* is
                ``'zscore'``.

        Returns:
            A DataFrame holding the outlier rows (possibly empty).

        Raises:
            ValueError: If *method* is neither ``'zscore'`` nor ``'iqr'``.
        """
        values = self.df[column]

        if method == 'zscore':
            z_scores = np.abs((values - values.mean()) / values.std())
            return self.df[z_scores > threshold]

        if method == 'iqr':
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            return self.df[(values < lower_bound) | (values > upper_bound)]

        # Fail loudly instead of implicitly returning None for a typo.
        raise ValueError(
            f"Unknown outlier detection method {method!r}; "
            "expected 'zscore' or 'iqr'"
        )