# %% 1.1 Import data and required packages
# Importing Pandas, NumPy, Matplotlib, Seaborn and the warnings library.
# All imports live in this one cell so a fresh kernel can Run All top to bottom.

# Basic Import
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# %% Load the raw dataset
df = pd.read_csv("artifact/raw.csv")

# %% List the distinct values of every object-dtype (categorical) column
for column_name in df.columns:
    if df[column_name].dtype == "object":
        print("{} =>".format(column_name), df[column_name].unique())
        print("")

# %% Separate predictors from the target
# The raw predictors keep their own name so this cell can be re-run safely;
# the original overwrote X with its transformed version.
X_raw = df.drop(columns=["math_score"])
y = df["math_score"]

# %% Column transformer with two transformers: one-hot for categoricals,
# standard scaling for numerics
num_features = X_raw.select_dtypes(exclude="object").columns
cat_features = X_raw.select_dtypes(include="object").columns

numeric_transformer = StandardScaler()
# The encoder is now fitted on the training rows only, so a category that
# happens to appear only in the test rows must not raise at transform time.
oh_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

# %% Separate dataset into train and test BEFORE fitting the preprocessor
# Fitting the scaler/encoder on all rows would leak test-set statistics into
# training. Same test_size and random_state as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)

# %% Fit preprocessing on the training rows, then apply it to both splits
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full transformed matrix under its original name, for any later cell that
# still expects `X`; it uses the train-fitted preprocessor.
X = preprocessor.transform(X_raw)
X.shape

# %%
X_train.shape, X_test.shape


# %% Create an evaluate function to give all metrics after model training
def evaluate_model(true, predicted):
    """Score predictions against ground truth.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # MSE is computed once; the original computed it twice and left one copy unused.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square
# %% Train every candidate regressor and report train/test metrics
# Seed for the stochastic estimators so the leaderboard is reproducible on
# Restart & Run All; 42 matches the seed used for the train/test split.
RANDOM_STATE = 42

models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random Forest Regressor": RandomForestRegressor(random_state=RANDOM_STATE),
    "XGBRegressor": XGBRegressor(random_state=RANDOM_STATE),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False, random_seed=RANDOM_STATE),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=RANDOM_STATE),
}

# Parallel lists consumed by the results cell: model names and their test R2.
model_list = []
r2_list = []


def _print_split_metrics(heading, mae, rmse, r2):
    """Print the heading and the three metrics for one data split."""
    print(heading)
    print("- Root Mean Squared Error: {:.4f}".format(rmse))
    print("- Mean Absolute Error: {:.4f}".format(mae))
    print("- R2 Score: {:.4f}".format(r2))


# Iterate name/estimator pairs directly instead of rebuilding key and value
# lists on every pass; dict insertion order keeps the report order unchanged.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    _print_split_metrics(
        'Model performance for Training set',
        model_train_mae, model_train_rmse, model_train_r2,
    )

    print('----------------------------------')

    _print_split_metrics(
        'Model performance for Test set',
        model_test_mae, model_test_rmse, model_test_r2,
    )
    # Only the held-out R2 feeds the ranking table.
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')
\n", " | Model Name | \n", "R2_Score | \n", "
---|---|---|
2 | \n", "Ridge | \n", "0.880593 | \n", "
0 | \n", "Linear Regression | \n", "0.880345 | \n", "
8 | \n", "AdaBoost Regressor | \n", "0.854710 | \n", "
5 | \n", "Random Forest Regressor | \n", "0.852094 | \n", "
7 | \n", "CatBoosting Regressor | \n", "0.851632 | \n", "
6 | \n", "XGBRegressor | \n", "0.827797 | \n", "
1 | \n", "Lasso | \n", "0.825320 | \n", "
3 | \n", "K-Neighbors Regressor | \n", "0.783813 | \n", "
4 | \n", "Decision Tree | \n", "0.751354 | \n", "