Ashar086 committed on
Commit
a7c36a5
·
verified ·
1 Parent(s): f158d04

Update data_processor.py

Browse files
Files changed (1) hide show
  1. data_processor.py +35 -11
data_processor.py CHANGED
@@ -1,21 +1,45 @@
1
  import pandas as pd
2
  import numpy as np
 
 
3
 
4
  class DataProcessor:
5
  def __init__(self, df):
6
  self.df = df
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  def get_columns_with_missing_values(self):
9
  return self.df.columns[self.df.isnull().any()].tolist()
10
 
11
- def clean_data(self):
12
- # Remove rows with any missing values
13
- df_cleaned = self.df.dropna()
14
-
15
- # Remove duplicate rows
16
- df_cleaned = df_cleaned.drop_duplicates()
17
-
18
- # Reset the index
19
- df_cleaned = df_cleaned.reset_index(drop=True)
20
-
21
- return df_cleaned
 
1
  import pandas as pd
2
  import numpy as np
3
+ from sklearn.impute import SimpleImputer
4
+ from sklearn.preprocessing import StandardScaler
5
 
6
  class DataProcessor:
7
  def __init__(self, df):
8
  self.df = df
9
 
10
+ def clean_data(self):
11
+ # Remove duplicates
12
+ self.df = self.df.drop_duplicates()
13
+
14
+ # Handle missing values
15
+ numeric_columns = self.df.select_dtypes(include=[np.number]).columns
16
+ categorical_columns = self.df.select_dtypes(include=['object']).columns
17
+
18
+ # Impute numeric columns with mean
19
+ num_imputer = SimpleImputer(strategy='mean')
20
+ self.df[numeric_columns] = num_imputer.fit_transform(self.df[numeric_columns])
21
+
22
+ # Impute categorical columns with mode
23
+ cat_imputer = SimpleImputer(strategy='most_frequent')
24
+ self.df[categorical_columns] = cat_imputer.fit_transform(self.df[categorical_columns])
25
+
26
+ # Normalize numeric columns
27
+ scaler = StandardScaler()
28
+ self.df[numeric_columns] = scaler.fit_transform(self.df[numeric_columns])
29
+
30
+ return self.df
31
+
32
  def get_columns_with_missing_values(self):
33
  return self.df.columns[self.df.isnull().any()].tolist()
34
 
35
+ def detect_outliers(self, column, method='zscore', threshold=3):
36
+ if method == 'zscore':
37
+ z_scores = np.abs((self.df[column] - self.df[column].mean()) / self.df[column].std())
38
+ return self.df[z_scores > threshold]
39
+ elif method == 'iqr':
40
+ Q1 = self.df[column].quantile(0.25)
41
+ Q3 = self.df[column].quantile(0.75)
42
+ IQR = Q3 - Q1
43
+ lower_bound = Q1 - 1.5 * IQR
44
+ upper_bound = Q3 + 1.5 * IQR
45
+ return self.df[(self.df[column] < lower_bound) | (self.df[column] > upper_bound)]