import os
import sys
from dataclasses import dataclass

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from src.exception import CustomException
from src.logger import logging
from src.utils import save_object


@dataclass
class Data_transformation_config:
    """Configuration for the data-transformation step."""

    # Path handed to ``save_object`` for the fitted preprocessor.
    # The annotation is required for ``@dataclass`` to treat this as a field;
    # without it the name is only a plain class attribute.
    Preprocessor_obj_file: str = os.path.join("artifact", "Preprocessor.pkl")


def _to_dense(features):
    """Return *features* as a dense array when it exposes ``toarray()``.

    ``ColumnTransformer`` may return a SciPy sparse matrix, which ``np.c_``
    cannot concatenate column-wise with a dense target vector.
    """
    if hasattr(features, "toarray"):
        return features.toarray()
    return features


class Data_transformation:
    """Build, fit and persist the feature preprocessor for the dataset."""

    def __init__(self) -> None:
        self.data_transformation_config = Data_transformation_config()

    def get_data_transformer_object(self):
        """Create the (unfitted) preprocessing ``ColumnTransformer``.

        Numerical columns are median-imputed and standardised. Categorical
        columns are imputed with the most frequent value, one-hot encoded and
        scaled without centring.

        Returns:
            An unfitted ``ColumnTransformer`` combining both pipelines.

        Raises:
            CustomException: If building the transformer fails.
        """
        try:
            numerical_columns = ["writing_score", "reading_score"]
            categorical_columns = [
                "gender",
                "race_ethnicity",
                "parental_level_of_education",
                "lunch",
                "test_preparation_course",
            ]

            num_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler()),
                ]
            )

            cat_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    # Ignore categories unseen during fit so that transforming
                    # the test split (or later inference data) does not raise.
                    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
                    # with_mean=False: centring is not supported on the
                    # sparse output of the one-hot encoder.
                    ("scaler", StandardScaler(with_mean=False)),
                ]
            )

            logging.info(f"Categorical Columns:{categorical_columns}")
            logging.info(f"Numerical Columns:{numerical_columns}")

            preprocessor = ColumnTransformer(
                [
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pipeline", cat_pipeline, categorical_columns),
                ]
            )
            return preprocessor
        except Exception as e:
            raise CustomException(e, sys) from e

    def initiate_data_transformation(self, train_path, test_path):
        """Fit the preprocessor on the training CSV and transform both splits.

        Args:
            train_path: Path to the training CSV file.
            test_path: Path to the test CSV file.

        Returns:
            A tuple ``(train_arr, test_arr, preprocessor_path)``. Each array
            holds the transformed features with the ``math_score`` target
            appended as the last column; ``preprocessor_path`` is the
            configured ``Preprocessor_obj_file``.

        Raises:
            CustomException: If reading, transforming or saving fails.
        """
        try:
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)
            logging.info("Read train and test data completed")

            logging.info("Obtaining preprocessing object")
            preprocessor_obj = self.get_data_transformer_object()

            target_column_name = "math_score"

            input_feature_train_df = train_df.drop(columns=[target_column_name])
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name])
            target_feature_test_df = test_df[target_column_name]

            logging.info(
                "Applying preprocessing object on training dataframe and testing dataframe."
            )

            # Fit on the training split only; the test split is transformed
            # with the statistics learned from training data.
            input_feature_train_arr = _to_dense(
                preprocessor_obj.fit_transform(input_feature_train_df)
            )
            input_feature_test_arr = _to_dense(
                preprocessor_obj.transform(input_feature_test_df)
            )

            train_arr = np.c_[
                input_feature_train_arr, np.array(target_feature_train_df)
            ]
            test_arr = np.c_[
                input_feature_test_arr, np.array(target_feature_test_df)
            ]

            save_object(
                file_path=self.data_transformation_config.Preprocessor_obj_file,
                obj=preprocessor_obj,
            )
            # Logged only after save_object returns, so the message is accurate.
            logging.info("Saved preprocessing object.")

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.Preprocessor_obj_file,
            )
        except Exception as e:
            raise CustomException(e, sys) from e