{ "cells": [ { "cell_type": "markdown", "source": [ "## Libraries" ], "metadata": { "id": "8Vjfbt-sDp13" }, "id": "8Vjfbt-sDp13" }, { "cell_type": "code", "source": [ "!pip install shap" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Gxk7Ljc_AEgq", "outputId": "5ebe547a-6ce8-4303-a92b-750460e213bf" }, "id": "Gxk7Ljc_AEgq", "execution_count": 69, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: shap in /usr/local/lib/python3.10/dist-packages (0.42.1)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from shap) (1.22.4)\n", "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from shap) (1.10.1)\n", "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from shap) (1.2.2)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from shap) (1.5.3)\n", "Requirement already satisfied: tqdm>=4.27.0 in /usr/local/lib/python3.10/dist-packages (from shap) (4.65.0)\n", "Requirement already satisfied: packaging>20.9 in /usr/local/lib/python3.10/dist-packages (from shap) (23.1)\n", "Requirement already satisfied: slicer==0.0.7 in /usr/local/lib/python3.10/dist-packages (from shap) (0.0.7)\n", "Requirement already satisfied: numba in /usr/local/lib/python3.10/dist-packages (from shap) (0.56.4)\n", "Requirement already satisfied: cloudpickle in /usr/local/lib/python3.10/dist-packages (from shap) (2.2.1)\n", "Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba->shap) (0.39.1)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from numba->shap) (67.7.2)\n", "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->shap) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->shap) (2022.7.1)\n", "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->shap) (1.3.1)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->shap) (3.2.0)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->shap) (1.16.0)\n" ] } ] }, { "cell_type": "code", "source": [ "pip install sklearn" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tFbt2zHi4buJ", "outputId": "876a767d-e4f9-41b3-fa9c-8ee60d63586e" }, "id": "tFbt2zHi4buJ", "execution_count": 70, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting sklearn\n", " Using cached sklearn-0.0.post7.tar.gz (3.6 kB)\n", " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "Building wheels for collected packages: sklearn\n", " Building wheel for sklearn (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for sklearn: filename=sklearn-0.0.post7-py3-none-any.whl size=2952 sha256=760c4c51afa8676f9b309a9c35e380db9ae8fe6162a7eddf9075ce800939f557\n", " Stored in directory: /root/.cache/pip/wheels/c8/9c/85/72901eb50bc4bc6e3b2629378d172384ea3dfd19759c77fd2c\n", "Successfully built sklearn\n", "Installing collected packages: sklearn\n", "Successfully installed sklearn-0.0.post7\n" ] } ] }, { "cell_type": "code", "source": [ "pip install optuna" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tZjaFtQeIToL", "outputId": "f2afef8b-d601-4212-c6a8-3b8d727b6933" }, "id": "tZjaFtQeIToL", "execution_count": 71, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: optuna in /usr/local/lib/python3.10/dist-packages (3.2.0)\n", "Requirement already satisfied: alembic>=1.5.0 in /usr/local/lib/python3.10/dist-packages (from optuna) (1.11.1)\n", "Requirement already satisfied: cmaes>=0.9.1 in /usr/local/lib/python3.10/dist-packages (from optuna) (0.10.0)\n", "Requirement already satisfied: colorlog in /usr/local/lib/python3.10/dist-packages (from optuna) (6.7.0)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from optuna) (1.22.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from optuna) (23.1)\n", "Requirement already satisfied: sqlalchemy>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from optuna) (2.0.19)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from optuna) (4.65.0)\n", "Requirement already satisfied: PyYAML in /usr/local/lib/python3.10/dist-packages (from optuna) (6.0.1)\n", "Requirement already satisfied: Mako in /usr/local/lib/python3.10/dist-packages (from alembic>=1.5.0->optuna) (1.2.4)\n", "Requirement already satisfied: typing-extensions>=4 in /usr/local/lib/python3.10/dist-packages (from alembic>=1.5.0->optuna) (4.7.1)\n", "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy>=1.3.0->optuna) (2.0.2)\n", "Requirement already satisfied: MarkupSafe>=0.9.2 in /usr/local/lib/python3.10/dist-packages (from Mako->alembic>=1.5.0->optuna) (2.1.3)\n" ] } ] }, { "cell_type": "code", "execution_count": 131, "id": "c5e31cf2", "metadata": { "id": "c5e31cf2" }, "outputs": [], "source": [ "# import libraries\n", "import math\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import graphviz\n", "import xgboost as xgb\n", "import shap\n", "from math import sqrt\n", "from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error\n", "from sklearn.model_selection import train_test_split, KFold\n", "\n", "%matplotlib inline\n", "import lightgbm as lgbm\n", "from lightgbm import log_evaluation, early_stopping\n", "import optuna\n", "from optuna.integration import LightGBMPruningCallback\n", "\n", "from pickle import load" ] }, { "cell_type": "markdown", "id": "ce7d0a75", "metadata": { "id": "ce7d0a75" }, "source": [ "## Data Processing and Feature Selection\n", "\n", "For the feature selection, I started off with dropping columns that have low correlation (< 0.4) with SalePrice. I then dropped columns with low variances (< 1). After that I checked the correlation matrix between columns to dropped selected columns that have correlation greater than 0.5 but with consideration for domain knowledge. After that I checked for NAs in the numerical columns. Then, based on the result, I used domain knowledge to fill the NAs with appropriate value. In this case, I used 0 to fill the NAs as it was the most relevant value. As for the categorical NAs, they were replaced with ‘None’. Once, all the NAs were taken cared of, I used LabelEncoder to encode the categorical values. I, then, checked for correlation between columns and dropped them based on domain knowledge." ] }, { "cell_type": "markdown", "source": [ "link to the data: https://drive.google.com/drive/folders/1oml9pTxlzrMBt7qZRe2KSV8dkNkbEXvK?usp=sharing" ], "metadata": { "id": "Ku3MSqwIF58K" }, "id": "Ku3MSqwIF58K" }, { "cell_type": "markdown", "id": "74abfbd7", "metadata": { "id": "74abfbd7" }, "source": [ "#### Importing Data" ] }, { "cell_type": "code", "execution_count": 73, "id": "e13fb5d4", "metadata": { "id": "e13fb5d4" }, "outputs": [], "source": [ "dataset = pd.read_csv('train.csv')\n", "testset = pd.read_csv('test.csv')\n", "test_results = pd.read_csv('sample_submission.csv')" ] }, { "cell_type": "markdown", "id": "f5e94266", "metadata": { "id": "f5e94266" }, "source": [ "#### Examining train dataset" ] }, { "cell_type": "code", "execution_count": 74, "id": "d916ab5d", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "d916ab5d", "outputId": "0458cc8d-730b-44f3-f6b0-3fa7f724fc73" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "RangeIndex: 1460 entries, 0 to 1459\n", "Data columns (total 81 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Id 1460 non-null int64 \n", " 1 MSSubClass 1460 non-null int64 \n", " 2 MSZoning 1460 non-null object \n", " 3 LotFrontage 1201 non-null float64\n", " 4 LotArea 1460 non-null int64 \n", " 5 Street 1460 non-null object \n", " 6 Alley 91 non-null object \n", " 7 LotShape 1460 non-null object \n", " 8 LandContour 1460 non-null object \n", " 9 Utilities 1460 non-null object \n", " 10 LotConfig 1460 non-null object \n", " 11 LandSlope 1460 non-null object \n", " 12 Neighborhood 1460 non-null object \n", " 13 Condition1 1460 non-null object \n", " 14 Condition2 1460 non-null object \n", " 15 BldgType 1460 non-null object \n", " 16 HouseStyle 1460 non-null object \n", " 17 OverallQual 1460 non-null int64 \n", " 18 OverallCond 1460 non-null int64 \n", " 19 YearBuilt 1460 non-null int64 \n", " 20 YearRemodAdd 1460 non-null int64 \n", " 21 RoofStyle 1460 non-null object \n", " 22 RoofMatl 1460 non-null object \n", " 23 Exterior1st 1460 non-null object \n", " 24 Exterior2nd 1460 non-null object \n", " 25 MasVnrType 1452 non-null object \n", " 26 MasVnrArea 1452 non-null float64\n", " 27 ExterQual 1460 non-null object \n", " 28 ExterCond 1460 non-null object \n", " 29 Foundation 1460 non-null object \n", " 30 BsmtQual 1423 non-null object \n", " 31 BsmtCond 1423 non-null object \n", " 32 BsmtExposure 1422 non-null object \n", " 33 BsmtFinType1 1423 non-null object \n", " 34 BsmtFinSF1 1460 non-null int64 \n", " 35 BsmtFinType2 1422 non-null object \n", " 36 BsmtFinSF2 1460 non-null int64 \n", " 37 BsmtUnfSF 1460 non-null int64 \n", " 38 TotalBsmtSF 1460 non-null int64 \n", " 39 Heating 1460 non-null object \n", " 40 HeatingQC 1460 non-null object \n", " 41 CentralAir 1460 non-null object \n", " 42 Electrical 1459 non-null object \n", " 43 1stFlrSF 1460 non-null int64 \n", " 44 2ndFlrSF 1460 non-null int64 \n", " 45 LowQualFinSF 1460 non-null int64 \n", " 46 GrLivArea 1460 non-null int64 \n", " 47 BsmtFullBath 1460 non-null int64 \n", " 48 BsmtHalfBath 1460 non-null int64 \n", " 49 FullBath 1460 non-null int64 \n", " 50 HalfBath 1460 non-null int64 \n", " 51 BedroomAbvGr 1460 non-null int64 \n", " 52 KitchenAbvGr 1460 non-null int64 \n", " 53 KitchenQual 1460 non-null object \n", " 54 TotRmsAbvGrd 1460 non-null int64 \n", " 55 Functional 1460 non-null object \n", " 56 Fireplaces 1460 non-null int64 \n", " 57 FireplaceQu 770 non-null object \n", " 58 GarageType 1379 non-null object \n", " 59 GarageYrBlt 1379 non-null float64\n", " 60 GarageFinish 1379 non-null object \n", " 61 GarageCars 1460 non-null int64 \n", " 62 GarageArea 1460 non-null int64 \n", " 63 GarageQual 1379 non-null object \n", " 64 GarageCond 1379 non-null object \n", " 65 PavedDrive 1460 non-null object \n", " 66 WoodDeckSF 1460 non-null int64 \n", " 67 OpenPorchSF 1460 non-null int64 \n", " 68 EnclosedPorch 1460 non-null int64 \n", " 69 3SsnPorch 1460 non-null int64 \n", " 70 ScreenPorch 1460 non-null int64 \n", " 71 PoolArea 1460 non-null int64 \n", " 72 PoolQC 7 non-null object \n", " 73 Fence 281 non-null object \n", " 74 MiscFeature 54 non-null object \n", " 75 MiscVal 1460 non-null int64 \n", " 76 MoSold 1460 non-null int64 \n", " 77 YrSold 1460 non-null int64 \n", " 78 SaleType 1460 non-null object \n", " 79 SaleCondition 1460 non-null object \n", " 80 SalePrice 1460 non-null int64 \n", "dtypes: float64(3), int64(35), object(43)\n", "memory usage: 924.0+ KB\n" ] } ], "source": [ "dataset.info()" ] }, { "cell_type": "code", "source": [ "testset.info()" ], "metadata": { "id": "Au55nOoS9ZfB", "outputId": "890ae45c-6b7f-414d-c575-1773952eadc2", "colab": { "base_uri": "https://localhost:8080/" } }, "id": "Au55nOoS9ZfB", "execution_count": 75, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "RangeIndex: 1459 entries, 0 to 1458\n", "Data columns (total 80 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Id 1459 non-null int64 \n", " 1 MSSubClass 1459 non-null int64 \n", " 2 MSZoning 1455 non-null object \n", " 3 LotFrontage 1232 non-null float64\n", " 4 LotArea 1459 non-null int64 \n", " 5 Street 1459 non-null object \n", " 6 Alley 107 non-null object \n", " 7 LotShape 1459 non-null object \n", " 8 LandContour 1459 non-null object \n", " 9 Utilities 1457 non-null object \n", " 10 LotConfig 1459 non-null object \n", " 11 LandSlope 1459 non-null object \n", " 12 Neighborhood 1459 non-null object \n", " 13 Condition1 1459 non-null object \n", " 14 Condition2 1459 non-null object \n", " 15 BldgType 1459 non-null object \n", " 16 HouseStyle 1459 non-null object \n", " 17 OverallQual 1459 non-null int64 \n", " 18 OverallCond 1459 non-null int64 \n", " 19 YearBuilt 1459 non-null int64 \n", " 20 YearRemodAdd 1459 non-null int64 \n", " 21 RoofStyle 1459 non-null object \n", " 22 RoofMatl 1459 non-null object \n", " 23 Exterior1st 1458 non-null object \n", " 24 Exterior2nd 1458 non-null object \n", " 25 MasVnrType 1443 non-null object \n", " 26 MasVnrArea 1444 non-null float64\n", " 27 ExterQual 1459 non-null object \n", " 28 ExterCond 1459 non-null object \n", " 29 Foundation 1459 non-null object \n", " 30 BsmtQual 1415 non-null object \n", " 31 BsmtCond 1414 non-null object \n", " 32 BsmtExposure 1415 non-null object \n", " 33 BsmtFinType1 1417 non-null object \n", " 34 BsmtFinSF1 1458 non-null float64\n", " 35 BsmtFinType2 1417 non-null object \n", " 36 BsmtFinSF2 1458 non-null float64\n", " 37 BsmtUnfSF 1458 non-null float64\n", " 38 TotalBsmtSF 1458 non-null float64\n", " 39 Heating 1459 non-null object \n", " 40 HeatingQC 1459 non-null object \n", " 41 CentralAir 1459 non-null object \n", " 42 Electrical 1459 non-null object \n", " 43 1stFlrSF 1459 non-null int64 \n", " 44 2ndFlrSF 1459 non-null int64 \n", " 45 LowQualFinSF 1459 non-null int64 \n", " 46 GrLivArea 1459 non-null int64 \n", " 47 BsmtFullBath 1457 non-null float64\n", " 48 BsmtHalfBath 1457 non-null float64\n", " 49 FullBath 1459 non-null int64 \n", " 50 HalfBath 1459 non-null int64 \n", " 51 BedroomAbvGr 1459 non-null int64 \n", " 52 KitchenAbvGr 1459 non-null int64 \n", " 53 KitchenQual 1458 non-null object \n", " 54 TotRmsAbvGrd 1459 non-null int64 \n", " 55 Functional 1457 non-null object \n", " 56 Fireplaces 1459 non-null int64 \n", " 57 FireplaceQu 729 non-null object \n", " 58 GarageType 1383 non-null object \n", " 59 GarageYrBlt 1381 non-null float64\n", " 60 GarageFinish 1381 non-null object \n", " 61 GarageCars 1458 non-null float64\n", " 62 GarageArea 1458 non-null float64\n", " 63 GarageQual 1381 non-null object \n", " 64 GarageCond 1381 non-null object \n", " 65 PavedDrive 1459 non-null object \n", " 66 WoodDeckSF 1459 non-null int64 \n", " 67 OpenPorchSF 1459 non-null int64 \n", " 68 EnclosedPorch 1459 non-null int64 \n", " 69 3SsnPorch 1459 non-null int64 \n", " 70 ScreenPorch 1459 non-null int64 \n", " 71 PoolArea 1459 non-null int64 \n", " 72 PoolQC 3 non-null object \n", " 73 Fence 290 non-null object \n", " 74 MiscFeature 51 non-null object \n", " 75 MiscVal 1459 non-null int64 \n", " 76 MoSold 1459 non-null int64 \n", " 77 YrSold 1459 non-null int64 \n", " 78 SaleType 1458 non-null object \n", " 79 SaleCondition 1459 non-null object \n", "dtypes: float64(11), int64(26), object(43)\n", "memory usage: 912.0+ KB\n" ] } ] }, { "cell_type": "markdown", "id": "43ab061c", "metadata": { "id": "43ab061c" }, "source": [ "#### Setting y to the label column (numpy array)" ] }, { "cell_type": "code", "execution_count": 76, "id": "ac8eb354", "metadata": { "id": "ac8eb354" }, "outputs": [], "source": [ "y = dataset['SalePrice'].values\n", "#type(y)" ] }, { "cell_type": "code", "source": [ "y_test = test_results['SalePrice'].values" ], "metadata": { "id": "IjK-_p4_9P5u" }, "id": "IjK-_p4_9P5u", "execution_count": 77, "outputs": [] }, { "cell_type": "markdown", "id": "d1f6fcaa", "metadata": { "id": "d1f6fcaa" }, "source": [ "#### Making a new dataframe without SalePrice" ] }, { "cell_type": "code", "execution_count": 78, "id": "5bba9f18", "metadata": { "id": "5bba9f18", "colab": { "base_uri": "https://localhost:8080/", "height": 300 }, "outputId": "fa125817-7212-4b6e-d006-c6c69fb29594" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", "0 1 60 RL 65.0 8450 Pave NaN Reg \n", "1 2 20 RL 80.0 9600 Pave NaN Reg \n", "2 3 60 RL 68.0 11250 Pave NaN IR1 \n", "3 4 70 RL 60.0 9550 Pave NaN IR1 \n", "4 5 60 RL 84.0 14260 Pave NaN IR1 \n", "\n", " LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence MiscFeature \\\n", "0 Lvl AllPub ... 0 0 NaN NaN NaN \n", "1 Lvl AllPub ... 0 0 NaN NaN NaN \n", "2 Lvl AllPub ... 0 0 NaN NaN NaN \n", "3 Lvl AllPub ... 0 0 NaN NaN NaN \n", "4 Lvl AllPub ... 0 0 NaN NaN NaN \n", "\n", " MiscVal MoSold YrSold SaleType SaleCondition \n", "0 0 2 2008 WD Normal \n", "1 0 5 2007 WD Normal \n", "2 0 9 2008 WD Normal \n", "3 0 2 2006 WD Abnorml \n", "4 0 12 2008 WD Normal \n", "\n", "[5 rows x 80 columns]" ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...ScreenPorchPoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleCondition
0160RL65.08450PaveNaNRegLvlAllPub...00NaNNaNNaN022008WDNormal
1220RL80.09600PaveNaNRegLvlAllPub...00NaNNaNNaN052007WDNormal
2360RL68.011250PaveNaNIR1LvlAllPub...00NaNNaNNaN092008WDNormal
3470RL60.09550PaveNaNIR1LvlAllPub...00NaNNaNNaN022006WDAbnorml
4560RL84.014260PaveNaNIR1LvlAllPub...00NaNNaNNaN0122008WDNormal
\n", "

5 rows × 80 columns

\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 78 } ], "source": [ "X_start = dataset.drop(['SalePrice'], axis = 1)\n", "X_start.head()" ] }, { "cell_type": "markdown", "id": "0e0e3e2d", "metadata": { "id": "0e0e3e2d" }, "source": [ "#### Checking for columns with low correlation (< 0.4) with SalePrice and dropping them" ] }, { "cell_type": "code", "execution_count": 79, "id": "213d8d98", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "213d8d98", "outputId": "eb7c6818-2815-4784-95ba-5b3b6166b2fa" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n" ] } ], "source": [ "price_corr = dataset.corr()['SalePrice']" ] }, { "cell_type": "code", "execution_count": 80, "id": "dd70b06c", "metadata": { "id": "dd70b06c" }, "outputs": [], "source": [ "low_corr = price_corr[abs(price_corr) < 0.4].sort_values(ascending=False)" ] }, { "cell_type": "code", "execution_count": 81, "id": "e027ed66", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "e027ed66", "outputId": "43662ed2-24ff-4829-9358-b375dd9d5d81" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "BsmtFinSF1 0.386420\n", "LotFrontage 0.351799\n", "WoodDeckSF 0.324413\n", "2ndFlrSF 0.319334\n", "OpenPorchSF 0.315856\n", "HalfBath 0.284108\n", "LotArea 0.263843\n", "BsmtFullBath 0.227122\n", "BsmtUnfSF 0.214479\n", "BedroomAbvGr 0.168213\n", "ScreenPorch 0.111447\n", "PoolArea 0.092404\n", "MoSold 0.046432\n", "3SsnPorch 0.044584\n", "BsmtFinSF2 -0.011378\n", "BsmtHalfBath -0.016844\n", "MiscVal -0.021190\n", "Id -0.021917\n", "LowQualFinSF -0.025606\n", "YrSold -0.028923\n", "OverallCond -0.077856\n", "MSSubClass -0.084284\n", "EnclosedPorch -0.128578\n", "KitchenAbvGr -0.135907\n", "Name: SalePrice, dtype: float64" ] }, "metadata": {}, "execution_count": 81 } ], "source": [ "low_corr" ] }, { "cell_type": "code", "execution_count": 82, "id": "978aa742", "metadata": { "scrolled": true, "id": "978aa742" }, "outputs": [], "source": [ "for i in low_corr.index:\n", " X_start.drop(i, axis = 1, inplace = True)\n", " testset.drop(i, axis = 1, inplace = True)" ] }, { "cell_type": "code", "execution_count": 83, "id": "568174fb", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 300 }, "id": "568174fb", "outputId": "6310a184-710c-4de3-89ae-5c044a53662a" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope \\\n", "0 RL Pave NaN Reg Lvl AllPub Inside Gtl \n", "1 RL Pave NaN Reg Lvl AllPub FR2 Gtl \n", "2 RL Pave NaN IR1 Lvl AllPub Inside Gtl \n", "3 RL Pave NaN IR1 Lvl AllPub Corner Gtl \n", "4 RL Pave NaN IR1 Lvl AllPub FR2 Gtl \n", "\n", " Neighborhood Condition1 ... GarageCars GarageArea GarageQual GarageCond \\\n", "0 CollgCr Norm ... 2 548 TA TA \n", "1 Veenker Feedr ... 2 460 TA TA \n", "2 CollgCr Norm ... 2 608 TA TA \n", "3 Crawfor Norm ... 3 642 TA TA \n", "4 NoRidge Norm ... 3 836 TA TA \n", "\n", " PavedDrive PoolQC Fence MiscFeature SaleType SaleCondition \n", "0 Y NaN NaN NaN WD Normal \n", "1 Y NaN NaN NaN WD Normal \n", "2 Y NaN NaN NaN WD Normal \n", "3 Y NaN NaN NaN WD Abnorml \n", "4 Y NaN NaN NaN WD Normal \n", "\n", "[5 rows x 56 columns]" ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MSZoningStreetAlleyLotShapeLandContourUtilitiesLotConfigLandSlopeNeighborhoodCondition1...GarageCarsGarageAreaGarageQualGarageCondPavedDrivePoolQCFenceMiscFeatureSaleTypeSaleCondition
0RLPaveNaNRegLvlAllPubInsideGtlCollgCrNorm...2548TATAYNaNNaNNaNWDNormal
1RLPaveNaNRegLvlAllPubFR2GtlVeenkerFeedr...2460TATAYNaNNaNNaNWDNormal
2RLPaveNaNIR1LvlAllPubInsideGtlCollgCrNorm...2608TATAYNaNNaNNaNWDNormal
3RLPaveNaNIR1LvlAllPubCornerGtlCrawforNorm...3642TATAYNaNNaNNaNWDAbnorml
4RLPaveNaNIR1LvlAllPubFR2GtlNoRidgeNorm...3836TATAYNaNNaNNaNWDNormal
\n", "

5 rows × 56 columns

\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 83 } ], "source": [ "X_start.head()" ] }, { "cell_type": "markdown", "id": "42da68a9", "metadata": { "id": "42da68a9" }, "source": [ "#### Checking for columns with low variance (< 1) and dropping them" ] }, { "cell_type": "code", "execution_count": 84, "id": "e761e84e", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "e761e84e", "outputId": "812688cc-a24c-4805-85f2-407228a7e9d6" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "The default value of numeric_only in DataFrame.var is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.\n" ] } ], "source": [ "variance = X_start.var()" ] }, { "cell_type": "code", "execution_count": 85, "id": "64855097", "metadata": { "id": "64855097" }, "outputs": [], "source": [ "low_var = variance[(variance) < 1].sort_values(ascending = True)" ] }, { "cell_type": "code", "execution_count": 86, "id": "32be86a0", "metadata": { "scrolled": true, "colab": { "base_uri": "https://localhost:8080/" }, "id": "32be86a0", "outputId": "00f1302b-f12e-4a66-c1f4-a4267eae2645" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "FullBath 0.303508\n", "Fireplaces 0.415595\n", "GarageCars 0.558480\n", "dtype: float64" ] }, "metadata": {}, "execution_count": 86 } ], "source": [ "low_var" ] }, { "cell_type": "code", "execution_count": 87, "id": "28340bfa", "metadata": { "id": "28340bfa" }, "outputs": [], "source": [ "for i in low_var.index:\n", " X_start.drop(i, axis = 1, inplace = True)\n", " testset.drop(i, axis = 1, inplace = True)" ] }, { "cell_type": "code", "execution_count": 88, "id": "e79a1ccd", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 0 }, "id": "e79a1ccd", "outputId": "995023de-9cd5-4815-a219-b7903073cb9e" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope \\\n", "0 RL Pave NaN Reg Lvl AllPub Inside Gtl \n", "1 RL Pave NaN Reg Lvl AllPub FR2 Gtl \n", "2 RL Pave NaN IR1 Lvl AllPub Inside Gtl \n", "3 RL Pave NaN IR1 Lvl AllPub Corner Gtl \n", "4 RL Pave NaN IR1 Lvl AllPub FR2 Gtl \n", "\n", " Neighborhood Condition1 ... GarageFinish GarageArea GarageQual GarageCond \\\n", "0 CollgCr Norm ... RFn 548 TA TA \n", "1 Veenker Feedr ... RFn 460 TA TA \n", "2 CollgCr Norm ... RFn 608 TA TA \n", "3 Crawfor Norm ... Unf 642 TA TA \n", "4 NoRidge Norm ... RFn 836 TA TA \n", "\n", " PavedDrive PoolQC Fence MiscFeature SaleType SaleCondition \n", "0 Y NaN NaN NaN WD Normal \n", "1 Y NaN NaN NaN WD Normal \n", "2 Y NaN NaN NaN WD Normal \n", "3 Y NaN NaN NaN WD Abnorml \n", "4 Y NaN NaN NaN WD Normal \n", "\n", "[5 rows x 53 columns]" ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MSZoningStreetAlleyLotShapeLandContourUtilitiesLotConfigLandSlopeNeighborhoodCondition1...GarageFinishGarageAreaGarageQualGarageCondPavedDrivePoolQCFenceMiscFeatureSaleTypeSaleCondition
0RLPaveNaNRegLvlAllPubInsideGtlCollgCrNorm...RFn548TATAYNaNNaNNaNWDNormal
1RLPaveNaNRegLvlAllPubFR2GtlVeenkerFeedr...RFn460TATAYNaNNaNNaNWDNormal
2RLPaveNaNIR1LvlAllPubInsideGtlCollgCrNorm...RFn608TATAYNaNNaNNaNWDNormal
3RLPaveNaNIR1LvlAllPubCornerGtlCrawforNorm...Unf642TATAYNaNNaNNaNWDAbnorml
4RLPaveNaNIR1LvlAllPubFR2GtlNoRidgeNorm...RFn836TATAYNaNNaNNaNWDNormal
\n", "

5 rows × 53 columns

\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 88 } ], "source": [ "X_start.head()" ] }, { "cell_type": "markdown", "id": "4d3cd6a1", "metadata": { "id": "4d3cd6a1" }, "source": [ "#### Checking to correlation between columns and dropping selected columns based on domain knowledge" ] }, { "cell_type": "code", "execution_count": 89, "id": "9be646b3", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9be646b3", "outputId": "8239eb19-ae33-44be-d669-3fc4212820b4" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.\n" ] } ], "source": [ "correlation = X_start.corr().abs()\n", "corr_list = (correlation.where(np.triu(np.ones(correlation.shape), k=1).astype(bool))\n", " .stack())\n", "high_corr = corr_list.loc[corr_list > 0.5]" ] }, { "cell_type": "code", "execution_count": 90, "id": "7aa53645", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7aa53645", "outputId": "fcb2d06c-275f-4efe-e17f-353d806698c3" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "OverallQual YearBuilt 0.572323\n", " YearRemodAdd 0.550684\n", " TotalBsmtSF 0.537808\n", " GrLivArea 0.593007\n", " GarageYrBlt 0.547766\n", " GarageArea 0.562022\n", "YearBuilt YearRemodAdd 0.592855\n", " GarageYrBlt 0.825667\n", "YearRemodAdd GarageYrBlt 0.642277\n", "TotalBsmtSF 1stFlrSF 0.819530\n", "1stFlrSF GrLivArea 0.566024\n", "GrLivArea TotRmsAbvGrd 0.825489\n", "GarageYrBlt GarageArea 0.564567\n", "dtype: float64" ] }, "metadata": {}, "execution_count": 90 } ], "source": [ "high_corr" ] }, { "cell_type": "code", "execution_count": 91, "id": "2d30f2f6", "metadata": { "id": "2d30f2f6" }, "outputs": [], "source": [ "drop_hico = ['GarageArea', 'TotRmsAbvGrd', '1stFlrSF', 'GarageYrBlt', 'YearRemodAdd']" ] }, { "cell_type": "code", "execution_count": 92, "id": "1c29f6db", "metadata": { "id": "1c29f6db" }, "outputs": [], "source": [ "X_start = X_start.drop(drop_hico, axis = 1)\n", "testset = testset.drop(drop_hico, axis = 1)" ] }, { "cell_type": "code", "execution_count": 93, "id": "46e4fdc1", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 0 }, "id": "46e4fdc1", "outputId": "410292ce-51f6-4fdd-9959-683362bc75ca" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope \\\n", "0 RL Pave NaN Reg Lvl AllPub Inside Gtl \n", "1 RL Pave NaN Reg Lvl AllPub FR2 Gtl \n", "2 RL Pave NaN IR1 Lvl AllPub Inside Gtl \n", "3 RL Pave NaN IR1 Lvl AllPub Corner Gtl \n", "4 RL Pave NaN IR1 Lvl AllPub FR2 Gtl \n", "\n", " Neighborhood Condition1 ... GarageType GarageFinish GarageQual GarageCond \\\n", "0 CollgCr Norm ... Attchd RFn TA TA \n", "1 Veenker Feedr ... Attchd RFn TA TA \n", "2 CollgCr Norm ... Attchd RFn TA TA \n", "3 Crawfor Norm ... Detchd Unf TA TA \n", "4 NoRidge Norm ... Attchd RFn TA TA \n", "\n", " PavedDrive PoolQC Fence MiscFeature SaleType SaleCondition \n", "0 Y NaN NaN NaN WD Normal \n", "1 Y NaN NaN NaN WD Normal \n", "2 Y NaN NaN NaN WD Normal \n", "3 Y NaN NaN NaN WD Abnorml \n", "4 Y NaN NaN NaN WD Normal \n", "\n", "[5 rows x 48 columns]" ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MSZoningStreetAlleyLotShapeLandContourUtilitiesLotConfigLandSlopeNeighborhoodCondition1...GarageTypeGarageFinishGarageQualGarageCondPavedDrivePoolQCFenceMiscFeatureSaleTypeSaleCondition
0RLPaveNaNRegLvlAllPubInsideGtlCollgCrNorm...AttchdRFnTATAYNaNNaNNaNWDNormal
1RLPaveNaNRegLvlAllPubFR2GtlVeenkerFeedr...AttchdRFnTATAYNaNNaNNaNWDNormal
2RLPaveNaNIR1LvlAllPubInsideGtlCollgCrNorm...AttchdRFnTATAYNaNNaNNaNWDNormal
3RLPaveNaNIR1LvlAllPubCornerGtlCrawforNorm...DetchdUnfTATAYNaNNaNNaNWDAbnorml
4RLPaveNaNIR1LvlAllPubFR2GtlNoRidgeNorm...AttchdRFnTATAYNaNNaNNaNWDNormal
\n", "

5 rows × 48 columns

\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 93 } ], "source": [ "X_start.head()" ] }, { "cell_type": "markdown", "id": "f1827825", "metadata": { "id": "f1827825" }, "source": [ "#### Identifiying numerical and categorical values for replacing NAs with appropriate values" ] }, { "cell_type": "code", "execution_count": 94, "id": "6fe2d4e7", "metadata": { "id": "6fe2d4e7" }, "outputs": [], "source": [ "numerical = X_start.select_dtypes(include=['number'])\n", "categorical = X_start.select_dtypes(include=['object'])\n", "t_numerical = testset.select_dtypes(include=['number'])\n", "t_categorical = testset.select_dtypes(include=['object'])" ] }, { "cell_type": "code", "execution_count": 95, "id": "6ab315bc", "metadata": { "scrolled": true, "colab": { "base_uri": "https://localhost:8080/", "height": 0 }, "id": "6ab315bc", "outputId": "9654ae99-8ed3-438c-d71a-7fd65638ffc3" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " OverallQual YearBuilt MasVnrArea TotalBsmtSF GrLivArea\n", "0 7 2003 196.0 856 1710\n", "1 6 1976 0.0 1262 1262\n", "2 7 2001 162.0 920 1786\n", "3 7 1915 0.0 756 1717\n", "4 8 2000 350.0 1145 2198" ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OverallQualYearBuiltMasVnrAreaTotalBsmtSFGrLivArea
072003196.08561710
1619760.012621262
272001162.09201786
3719150.07561717
482000350.011452198
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 95 } ], "source": [ "numerical.head()" ] }, { "cell_type": "code", "source": [ "categorical" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 0 }, "id": "SEZM6dlIcePb", "outputId": "3793b45d-b504-42ff-d14f-739d40a94707" }, "id": "SEZM6dlIcePb", "execution_count": 96, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope \\\n", "0 RL Pave NaN Reg Lvl AllPub Inside Gtl \n", "1 RL Pave NaN Reg Lvl AllPub FR2 Gtl \n", "2 RL Pave NaN IR1 Lvl AllPub Inside Gtl \n", "3 RL Pave NaN IR1 Lvl AllPub Corner Gtl \n", "4 RL Pave NaN IR1 Lvl AllPub FR2 Gtl \n", "... ... ... ... ... ... ... ... ... \n", "1455 RL Pave NaN Reg Lvl AllPub Inside Gtl \n", "1456 RL Pave NaN Reg Lvl AllPub Inside Gtl \n", "1457 RL Pave NaN Reg Lvl AllPub Inside Gtl \n", "1458 RL Pave NaN Reg Lvl AllPub Inside Gtl \n", "1459 RL Pave NaN Reg Lvl AllPub Inside Gtl \n", "\n", " Neighborhood Condition1 ... GarageType GarageFinish GarageQual \\\n", "0 CollgCr Norm ... Attchd RFn TA \n", "1 Veenker Feedr ... Attchd RFn TA \n", "2 CollgCr Norm ... Attchd RFn TA \n", "3 Crawfor Norm ... Detchd Unf TA \n", "4 NoRidge Norm ... Attchd RFn TA \n", "... ... ... ... ... ... ... \n", "1455 Gilbert Norm ... Attchd RFn TA \n", "1456 NWAmes Norm ... Attchd Unf TA \n", "1457 Crawfor Norm ... Attchd RFn TA \n", "1458 NAmes Norm ... Attchd Unf TA \n", "1459 Edwards Norm ... Attchd Fin TA \n", "\n", " GarageCond PavedDrive PoolQC Fence MiscFeature SaleType SaleCondition \n", "0 TA Y NaN NaN NaN WD Normal \n", "1 TA Y NaN NaN NaN WD Normal \n", "2 TA Y NaN NaN NaN WD Normal \n", "3 TA Y NaN NaN NaN WD Abnorml \n", "4 TA Y NaN NaN NaN WD Normal \n", "... ... ... ... ... ... ... ... \n", "1455 TA Y NaN NaN NaN WD Normal \n", "1456 TA Y NaN MnPrv NaN WD Normal \n", "1457 TA Y NaN GdPrv Shed WD Normal \n", "1458 TA Y NaN NaN NaN WD Normal \n", "1459 TA Y NaN NaN NaN WD Normal \n", "\n", "[1460 rows x 43 columns]" ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MSZoningStreetAlleyLotShapeLandContourUtilitiesLotConfigLandSlopeNeighborhoodCondition1...GarageTypeGarageFinishGarageQualGarageCondPavedDrivePoolQCFenceMiscFeatureSaleTypeSaleCondition
0RLPaveNaNRegLvlAllPubInsideGtlCollgCrNorm...AttchdRFnTATAYNaNNaNNaNWDNormal
1RLPaveNaNRegLvlAllPubFR2GtlVeenkerFeedr...AttchdRFnTATAYNaNNaNNaNWDNormal
2RLPaveNaNIR1LvlAllPubInsideGtlCollgCrNorm...AttchdRFnTATAYNaNNaNNaNWDNormal
3RLPaveNaNIR1LvlAllPubCornerGtlCrawforNorm...DetchdUnfTATAYNaNNaNNaNWDAbnorml
4RLPaveNaNIR1LvlAllPubFR2GtlNoRidgeNorm...AttchdRFnTATAYNaNNaNNaNWDNormal
..................................................................
1455RLPaveNaNRegLvlAllPubInsideGtlGilbertNorm...AttchdRFnTATAYNaNNaNNaNWDNormal
1456RLPaveNaNRegLvlAllPubInsideGtlNWAmesNorm...AttchdUnfTATAYNaNMnPrvNaNWDNormal
1457RLPaveNaNRegLvlAllPubInsideGtlCrawforNorm...AttchdRFnTATAYNaNGdPrvShedWDNormal
1458RLPaveNaNRegLvlAllPubInsideGtlNAmesNorm...AttchdUnfTATAYNaNNaNNaNWDNormal
1459RLPaveNaNRegLvlAllPubInsideGtlEdwardsNorm...AttchdFinTATAYNaNNaNNaNWDNormal
\n", "

1460 rows × 43 columns

\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 96 } ] }, { "cell_type": "code", "execution_count": 97, "id": "075dca0e", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "075dca0e", "outputId": "aaf80dd8-190e-4204-9714-16fab3cf8a19" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Index(['MasVnrArea'], dtype='object')" ] }, "metadata": {}, "execution_count": 97 } ], "source": [ "num_na = numerical.columns[numerical.isnull().any()]\n", "num_na" ] }, { "cell_type": "markdown", "id": "58ba1209", "metadata": { "id": "58ba1209" }, "source": [ "#### Based on domain knowledge, NAs in MasVrArea is replaced with 0" ] }, { "cell_type": "code", "execution_count": 98, "id": "765e417a", "metadata": { "id": "765e417a" }, "outputs": [], "source": [ "for n in [num_na]:\n", " X_start[n] = X_start[n].fillna(0)" ] }, { "cell_type": "code", "execution_count": 99, "id": "87c1a73e", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "87c1a73e", "outputId": "a96e15a8-0e1a-4a2e-e616-63a978124893" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Index(['MasVnrArea', 'TotalBsmtSF'], dtype='object')" ] }, "metadata": {}, "execution_count": 99 } ], "source": [ "t_num_na = t_numerical.columns[t_numerical.isnull().any()]\n", "t_num_na" ] }, { "cell_type": "code", "execution_count": 100, "id": "07bd4e08", "metadata": { "id": "07bd4e08" }, "outputs": [], "source": [ "for n in [t_num_na]:\n", " testset[n] = testset[n].fillna(0)" ] }, { "cell_type": "code", "execution_count": 101, "id": "a003e75b", "metadata": { "scrolled": false, "colab": { "base_uri": "https://localhost:8080/", "height": 0 }, "id": "a003e75b", "outputId": "70a52814-9a4d-478a-e2a9-80fb6abef6ee" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope \\\n", "0 RL Pave NaN Reg Lvl AllPub Inside Gtl \n", "1 RL Pave NaN Reg Lvl AllPub FR2 Gtl \n", "2 RL Pave NaN IR1 Lvl AllPub Inside Gtl \n", "3 RL Pave NaN IR1 Lvl AllPub Corner Gtl \n", "4 RL Pave NaN IR1 Lvl AllPub FR2 Gtl \n", "\n", " Neighborhood Condition1 ... GarageType GarageFinish GarageQual GarageCond \\\n", "0 CollgCr Norm ... Attchd RFn TA TA \n", "1 Veenker Feedr ... Attchd RFn TA TA \n", "2 CollgCr Norm ... Attchd RFn TA TA \n", "3 Crawfor Norm ... Detchd Unf TA TA \n", "4 NoRidge Norm ... Attchd RFn TA TA \n", "\n", " PavedDrive PoolQC Fence MiscFeature SaleType SaleCondition \n", "0 Y NaN NaN NaN WD Normal \n", "1 Y NaN NaN NaN WD Normal \n", "2 Y NaN NaN NaN WD Normal \n", "3 Y NaN NaN NaN WD Abnorml \n", "4 Y NaN NaN NaN WD Normal \n", "\n", "[5 rows x 43 columns]" ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MSZoningStreetAlleyLotShapeLandContourUtilitiesLotConfigLandSlopeNeighborhoodCondition1...GarageTypeGarageFinishGarageQualGarageCondPavedDrivePoolQCFenceMiscFeatureSaleTypeSaleCondition
0RLPaveNaNRegLvlAllPubInsideGtlCollgCrNorm...AttchdRFnTATAYNaNNaNNaNWDNormal
1RLPaveNaNRegLvlAllPubFR2GtlVeenkerFeedr...AttchdRFnTATAYNaNNaNNaNWDNormal
2RLPaveNaNIR1LvlAllPubInsideGtlCollgCrNorm...AttchdRFnTATAYNaNNaNNaNWDNormal
3RLPaveNaNIR1LvlAllPubCornerGtlCrawforNorm...DetchdUnfTATAYNaNNaNNaNWDAbnorml
4RLPaveNaNIR1LvlAllPubFR2GtlNoRidgeNorm...AttchdRFnTATAYNaNNaNNaNWDNormal
\n", "

5 rows × 43 columns

\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 101 } ], "source": [ "categorical.head()" ] }, { "cell_type": "markdown", "id": "d02aa749", "metadata": { "id": "d02aa749" }, "source": [ "#### For categorical NAs, they were replaced with None" ] }, { "cell_type": "code", "execution_count": 102, "id": "2345bc44", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2345bc44", "outputId": "b11ba251-bb1c-40e6-fc99-a499c42b8f42" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Index(['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure',\n", " 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu',\n", " 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC',\n", " 'Fence', 'MiscFeature'],\n", " dtype='object')" ] }, "metadata": {}, "execution_count": 102 } ], "source": [ "cat_na = categorical.columns[categorical.isnull().any()]\n", "cat_na" ] }, { "cell_type": "code", "execution_count": 103, "id": "76063429", "metadata": { "scrolled": true, "id": "76063429" }, "outputs": [], "source": [ "for c in [cat_na]:\n", " X_start[c] = X_start[c].fillna('None')\n", " categorical[c] = categorical[c].fillna('None')" ] }, { "cell_type": "code", "execution_count": 104, "id": "52ec4ee2", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "52ec4ee2", "outputId": "ace21bb6-2e16-4730-c627-a97dbebffd13" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Index(['MSZoning', 'Alley', 'Utilities', 'Exterior1st', 'Exterior2nd',\n", " 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',\n", " 'BsmtFinType2', 'KitchenQual', 'Functional', 'FireplaceQu',\n", " 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC',\n", " 'Fence', 'MiscFeature', 'SaleType'],\n", " dtype='object')" ] }, "metadata": {}, "execution_count": 104 } ], "source": [ "t_cat_na = t_categorical.columns[t_categorical.isnull().any()]\n", "t_cat_na" ] }, { "cell_type": "code", "execution_count": 105, "id": "ec3ffa70", "metadata": { "id": "ec3ffa70" }, "outputs": [], "source": [ "for c in [t_cat_na]:\n", " testset[c] = testset[c].fillna('None')\n", " t_categorical[c] = t_categorical[c].fillna('None')" ] }, { "cell_type": "code", "execution_count": 106, "id": "ed9753fe", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 0 }, "id": "ed9753fe", "outputId": "b995fdcf-cf74-425e-b574-b66f517b5359" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope \\\n", "0 RL Pave None Reg Lvl AllPub Inside Gtl \n", "1 RL Pave None Reg Lvl AllPub FR2 Gtl \n", "2 RL Pave None IR1 Lvl AllPub Inside Gtl \n", "3 RL Pave None IR1 Lvl AllPub Corner Gtl \n", "4 RL Pave None IR1 Lvl AllPub FR2 Gtl \n", "\n", " Neighborhood Condition1 ... GarageType GarageFinish GarageQual GarageCond \\\n", "0 CollgCr Norm ... Attchd RFn TA TA \n", "1 Veenker Feedr ... Attchd RFn TA TA \n", "2 CollgCr Norm ... Attchd RFn TA TA \n", "3 Crawfor Norm ... Detchd Unf TA TA \n", "4 NoRidge Norm ... Attchd RFn TA TA \n", "\n", " PavedDrive PoolQC Fence MiscFeature SaleType SaleCondition \n", "0 Y None None None WD Normal \n", "1 Y None None None WD Normal \n", "2 Y None None None WD Normal \n", "3 Y None None None WD Abnorml \n", "4 Y None None None WD Normal \n", "\n", "[5 rows x 43 columns]" ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MSZoningStreetAlleyLotShapeLandContourUtilitiesLotConfigLandSlopeNeighborhoodCondition1...GarageTypeGarageFinishGarageQualGarageCondPavedDrivePoolQCFenceMiscFeatureSaleTypeSaleCondition
0RLPaveNoneRegLvlAllPubInsideGtlCollgCrNorm...AttchdRFnTATAYNoneNoneNoneWDNormal
1RLPaveNoneRegLvlAllPubFR2GtlVeenkerFeedr...AttchdRFnTATAYNoneNoneNoneWDNormal
2RLPaveNoneIR1LvlAllPubInsideGtlCollgCrNorm...AttchdRFnTATAYNoneNoneNoneWDNormal
3RLPaveNoneIR1LvlAllPubCornerGtlCrawforNorm...DetchdUnfTATAYNoneNoneNoneWDAbnorml
4RLPaveNoneIR1LvlAllPubFR2GtlNoRidgeNorm...AttchdRFnTATAYNoneNoneNoneWDNormal
\n", "

5 rows × 43 columns

\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 106 } ], "source": [ "categorical.head()" ] }, { "cell_type": "markdown", "id": "08eb4efb", "metadata": { "id": "08eb4efb" }, "source": [ "#### Checking to see if they are any NAs left" ] }, { "cell_type": "code", "execution_count": 107, "id": "68242cc3", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "68242cc3", "outputId": "4a8ecee4-5b9f-4796-c22f-5881561c9718" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "False" ] }, "metadata": {}, "execution_count": 107 } ], "source": [ "X_start.isnull().values.any()" ] }, { "cell_type": "code", "execution_count": 108, "id": "fff1637b", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "fff1637b", "outputId": "353178ed-ae97-47fe-f153-9e23ca29fd8a" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "False" ] }, "metadata": {}, "execution_count": 108 } ], "source": [ "testset.isnull().values.any()" ] }, { "cell_type": "markdown", "id": "e572f249", "metadata": { "id": "e572f249" }, "source": [ "#### Label encoding of categorical data" ] }, { "cell_type": "code", "execution_count": 109, "id": "13b81bb3", "metadata": { "scrolled": true, "id": "13b81bb3" }, "outputs": [], "source": [ "from sklearn.preprocessing import LabelEncoder\n", "le = LabelEncoder()\n", "\n", "for feature in categorical.columns:\n", " X_start[feature]=le.fit_transform(X_start[feature])\n", " categorical[feature]=le.fit_transform(categorical[feature])\n", " testset[feature]=le.fit_transform(testset[feature])\n", " t_categorical[feature]=le.fit_transform(t_categorical[feature])" ] }, { "cell_type": "code", "execution_count": 110, "id": "eef8457a", "metadata": { "scrolled": true, "colab": { "base_uri": "https://localhost:8080/", "height": 0 }, "id": "eef8457a", "outputId": "39ced0af-35f8-4a0c-e91b-713805ed6dc6" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " MSZoning Street Alley LotShape LandContour Utilities LotConfig \\\n", "0 3 1 1 3 3 0 4 \n", "1 3 1 1 3 3 0 2 \n", "2 3 1 1 0 3 0 4 \n", "3 3 1 1 0 3 0 0 \n", "4 3 1 1 0 3 0 2 \n", "\n", " LandSlope Neighborhood Condition1 ... GarageType GarageFinish \\\n", "0 0 5 2 ... 1 2 \n", "1 0 24 1 ... 1 2 \n", "2 0 5 2 ... 1 2 \n", "3 0 6 2 ... 5 3 \n", "4 0 15 2 ... 1 2 \n", "\n", " GarageQual GarageCond PavedDrive PoolQC Fence MiscFeature SaleType \\\n", "0 5 5 2 3 4 1 8 \n", "1 5 5 2 3 4 1 8 \n", "2 5 5 2 3 4 1 8 \n", "3 5 5 2 3 4 1 8 \n", "4 5 5 2 3 4 1 8 \n", "\n", " SaleCondition \n", "0 4 \n", "1 4 \n", "2 4 \n", "3 0 \n", "4 4 \n", "\n", "[5 rows x 43 columns]" ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MSZoningStreetAlleyLotShapeLandContourUtilitiesLotConfigLandSlopeNeighborhoodCondition1...GarageTypeGarageFinishGarageQualGarageCondPavedDrivePoolQCFenceMiscFeatureSaleTypeSaleCondition
03113304052...1255234184
131133020241...1255234184
23110304052...1255234184
33110300062...5355234180
431103020152...1255234184
\n", "

5 rows × 43 columns

\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 110 } ], "source": [ "categorical.head()" ] }, { "cell_type": "markdown", "id": "e81d0983", "metadata": { "id": "e81d0983" }, "source": [ "#### Checking for correlation between categorial columns and dropping highly correlated items based on domain knowledge" ] }, { "cell_type": "code", "execution_count": 111, "id": "d41f8b8a", "metadata": { "id": "d41f8b8a" }, "outputs": [], "source": [ "cat_corr = categorical.corr().abs()\n", "cat_corr_list = (cat_corr.where(np.triu(np.ones(cat_corr.shape), k=1).astype(bool))\n", " .stack())\n", "high_cat_corr = cat_corr_list.loc[cat_corr_list > 0.5]" ] }, { "cell_type": "code", "execution_count": 112, "id": "39405297", "metadata": { "scrolled": true, "colab": { "base_uri": "https://localhost:8080/" }, "id": "39405297", "outputId": "6ef3002b-9547-41e5-8071-5ba2e321de6c" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Exterior1st Exterior2nd 0.854163\n", "ExterQual BsmtQual 0.572327\n", " KitchenQual 0.641584\n", "GarageQual GarageCond 0.618383\n", "dtype: float64" ] }, "metadata": {}, "execution_count": 112 } ], "source": [ "high_cat_corr" ] }, { "cell_type": "code", "execution_count": 113, "id": "1b8d38aa", "metadata": { "id": "1b8d38aa" }, "outputs": [], "source": [ "drop_hcc = ['GarageCond', 'BsmtQual', 'KitchenQual', 'Exterior2nd']" ] }, { "cell_type": "code", "execution_count": 114, "id": "afd1fc17", "metadata": { "id": "afd1fc17" }, "outputs": [], "source": [ "X_start = X_start.drop(drop_hcc, axis = 1)\n", "testset = testset.drop(drop_hcc, axis = 1)" ] }, { "cell_type": "code", "execution_count": 115, "id": "6cdfd930", "metadata": { "scrolled": true, "colab": { "base_uri": "https://localhost:8080/", "height": 300 }, "id": "6cdfd930", "outputId": "562d6caf-1b61-484d-ea66-6335c2ffee85" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " MSZoning Street Alley LotShape LandContour Utilities LotConfig \\\n", "0 3 1 1 3 3 0 4 \n", "1 3 1 1 3 3 0 2 \n", "2 3 1 1 0 3 0 4 \n", "3 3 1 1 0 3 0 0 \n", "4 3 1 1 0 3 0 2 \n", "\n", " LandSlope Neighborhood Condition1 ... FireplaceQu GarageType \\\n", "0 0 5 2 ... 3 1 \n", "1 0 24 1 ... 5 1 \n", "2 0 5 2 ... 5 1 \n", "3 0 6 2 ... 2 5 \n", "4 0 15 2 ... 5 1 \n", "\n", " GarageFinish GarageQual PavedDrive PoolQC Fence MiscFeature SaleType \\\n", "0 2 5 2 3 4 1 8 \n", "1 2 5 2 3 4 1 8 \n", "2 2 5 2 3 4 1 8 \n", "3 3 5 2 3 4 1 8 \n", "4 2 5 2 3 4 1 8 \n", "\n", " SaleCondition \n", "0 4 \n", "1 4 \n", "2 4 \n", "3 0 \n", "4 4 \n", "\n", "[5 rows x 44 columns]" ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MSZoningStreetAlleyLotShapeLandContourUtilitiesLotConfigLandSlopeNeighborhoodCondition1...FireplaceQuGarageTypeGarageFinishGarageQualPavedDrivePoolQCFenceMiscFeatureSaleTypeSaleCondition
03113304052...3125234184
131133020241...5125234184
23110304052...5125234184
33110300062...2535234180
431103020152...5125234184
\n", "

5 rows × 44 columns

\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 115 } ], "source": [ "X_start.head()" ] }, { "cell_type": "markdown", "id": "e30cfa3b", "metadata": { "id": "e30cfa3b" }, "source": [ "#### Normalizing data" ] }, { "cell_type": "code", "execution_count": 116, "id": "b8b4eb75", "metadata": { "id": "b8b4eb75" }, "outputs": [], "source": [ "X_start = (X_start - X_start.min()) / (X_start.max() - X_start.min())\n", "testset = (testset - testset.min()) / (testset.max() - testset.min())" ] }, { "cell_type": "code", "execution_count": 117, "id": "ea423b42", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 300 }, "id": "ea423b42", "outputId": "1565a2c5-c21d-4f36-ddcf-53dfcaa5a982" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", "0 1 60 RL 65.0 8450 Pave NaN Reg \n", "1 2 20 RL 80.0 9600 Pave NaN Reg \n", "2 3 60 RL 68.0 11250 Pave NaN IR1 \n", "3 4 70 RL 60.0 9550 Pave NaN IR1 \n", "4 5 60 RL 84.0 14260 Pave NaN IR1 \n", "\n", " LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold \\\n", "0 Lvl AllPub ... 0 NaN NaN NaN 0 2 \n", "1 Lvl AllPub ... 0 NaN NaN NaN 0 5 \n", "2 Lvl AllPub ... 0 NaN NaN NaN 0 9 \n", "3 Lvl AllPub ... 0 NaN NaN NaN 0 2 \n", "4 Lvl AllPub ... 0 NaN NaN NaN 0 12 \n", "\n", " YrSold SaleType SaleCondition SalePrice \n", "0 2008 WD Normal 208500 \n", "1 2007 WD Normal 181500 \n", "2 2008 WD Normal 223500 \n", "3 2006 WD Abnorml 140000 \n", "4 2008 WD Normal 250000 \n", "\n", "[5 rows x 81 columns]" ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPub...0NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPub...0NaNNaNNaN052007WDNormal181500
2360RL68.011250PaveNaNIR1LvlAllPub...0NaNNaNNaN092008WDNormal223500
3470RL60.09550PaveNaNIR1LvlAllPub...0NaNNaNNaN022006WDAbnorml140000
4560RL84.014260PaveNaNIR1LvlAllPub...0NaNNaNNaN0122008WDNormal250000
\n", "

5 rows × 81 columns

\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 117 } ], "source": [ "dataset.head()" ] }, { "cell_type": "markdown", "id": "8c19de74", "metadata": { "id": "8c19de74" }, "source": [ "#### Using Decision Tree (Random Forest) to selected the 10 best features" ] }, { "cell_type": "code", "execution_count": 118, "id": "66b2d593", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 75 }, "id": "66b2d593", "outputId": "4e4f7165-7d3e-4323-a138-27d61c8fd693" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "RandomForestRegressor(max_depth=10, random_state=1)" ], "text/html": [ "
RandomForestRegressor(max_depth=10, random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 118 } ], "source": [ "from sklearn.ensemble import RandomForestRegressor\n", "model = RandomForestRegressor(random_state=1, max_depth=10)\n", "model.fit(X_start,y)" ] }, { "cell_type": "code", "execution_count": 119, "id": "adbbd88b", "metadata": { "scrolled": true, "colab": { "base_uri": "https://localhost:8080/", "height": 472 }, "id": "adbbd88b", "outputId": "63a5ce35-c62a-496d-9a72-8087d3a11bd9" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ], "source": [ "features = X_start.columns\n", "importances = model.feature_importances_\n", "indices = np.argsort(importances)[-10:] # top 10 features\n", "plt.title('Feature Importances')\n", "plt.barh(range(len(indices)), importances[indices], color='b', align='center')\n", "plt.yticks(range(len(indices)), [features[i] for i in indices])\n", "plt.xlabel('Relative Importance')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 120, "id": "4a35608d", "metadata": { "id": "4a35608d" }, "outputs": [], "source": [ "feat = dict(reversed(sorted(zip(model.feature_importances_, X_start.columns.values))))" ] }, { "cell_type": "code", "execution_count": 121, "id": "ef61f8c5", "metadata": { "id": "ef61f8c5" }, "outputs": [], "source": [ "feat10 = [feat[x] for x in list(feat)[:10]]" ] }, { "cell_type": "code", "execution_count": 122, "id": "801cbef5", "metadata": { "id": "801cbef5" }, "outputs": [], "source": [ "t_drop = [feat[x] for x in list(feat)[10:]]" ] }, { "cell_type": "code", "execution_count": 123, "id": "ced0290a", "metadata": { "id": "ced0290a" }, "outputs": [], "source": [ "for i in t_drop:\n", " testset.drop(i, axis = 1, inplace = True)" ] }, { "cell_type": "code", "execution_count": 124, "id": "8e3418e7", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8e3418e7", "outputId": "bdbb2285-717d-497e-ce76-17691782afb5" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(1459, 10)" ] }, "metadata": {}, "execution_count": 124 } ], "source": [ "testset.shape" ] }, { "cell_type": "code", "source": [ "testset.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "i02xm_67CLy5", "outputId": "168fecbc-a1ae-4b41-f7d9-e1295aca186f" }, "id": "i02xm_67CLy5", "execution_count": 125, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Neighborhood OverallQual YearBuilt MasVnrArea BsmtExposure \\\n", "0 0.500000 0.444444 0.625954 0.000000 0.75 \n", "1 0.500000 0.555556 0.603053 0.083721 0.75 \n", "2 0.333333 0.444444 0.900763 0.000000 0.75 \n", "3 0.333333 0.555556 0.908397 0.015504 0.75 \n", "4 0.916667 0.777778 0.862595 0.000000 0.75 \n", "\n", " BsmtFinType1 TotalBsmtSF GrLivArea GarageType SaleCondition \n", "0 0.833333 0.173111 0.104309 0.166667 0.8 \n", "1 0.000000 0.260844 0.196672 0.166667 0.8 \n", "2 0.333333 0.182139 0.260666 0.166667 0.8 \n", "3 0.333333 0.181747 0.255333 0.166667 0.8 \n", "4 0.000000 0.251227 0.186220 0.166667 0.8 " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NeighborhoodOverallQualYearBuiltMasVnrAreaBsmtExposureBsmtFinType1TotalBsmtSFGrLivAreaGarageTypeSaleCondition
00.5000000.4444440.6259540.0000000.750.8333330.1731110.1043090.1666670.8
10.5000000.5555560.6030530.0837210.750.0000000.2608440.1966720.1666670.8
20.3333330.4444440.9007630.0000000.750.3333330.1821390.2606660.1666670.8
30.3333330.5555560.9083970.0155040.750.3333330.1817470.2553330.1666670.8
40.9166670.7777780.8625950.0000000.750.0000000.2512270.1862200.1666670.8
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 125 } ] }, { "cell_type": "code", "execution_count": 126, "id": "6a091d90", "metadata": { "id": "6a091d90" }, "outputs": [], "source": [ "X = X_start[feat10].copy()" ] }, { "cell_type": "code", "execution_count": 127, "id": "7c9dcad9", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "7c9dcad9", "outputId": "741b7fc7-aff1-46a8-fb96-a1cb3668cb35" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " OverallQual GrLivArea TotalBsmtSF YearBuilt MasVnrArea Neighborhood \\\n", "0 0.666667 0.259231 0.140098 0.949275 0.12250 0.208333 \n", "1 0.555556 0.174830 0.206547 0.753623 0.00000 1.000000 \n", "2 0.666667 0.273549 0.150573 0.934783 0.10125 0.208333 \n", "3 0.666667 0.260550 0.123732 0.311594 0.00000 0.250000 \n", "4 0.777778 0.351168 0.187398 0.927536 0.21875 0.625000 \n", "\n", " BsmtFinType1 GarageType BsmtExposure SaleCondition \n", "0 0.333333 0.166667 0.75 0.8 \n", "1 0.000000 0.166667 0.25 0.8 \n", "2 0.333333 0.166667 0.50 0.8 \n", "3 0.000000 0.833333 0.75 0.0 \n", "4 0.333333 0.166667 0.00 0.8 " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OverallQualGrLivAreaTotalBsmtSFYearBuiltMasVnrAreaNeighborhoodBsmtFinType1GarageTypeBsmtExposureSaleCondition
00.6666670.2592310.1400980.9492750.122500.2083330.3333330.1666670.750.8
10.5555560.1748300.2065470.7536230.000001.0000000.0000000.1666670.250.8
20.6666670.2735490.1505730.9347830.101250.2083330.3333330.1666670.500.8
30.6666670.2605500.1237320.3115940.000000.2500000.0000000.8333330.750.0
40.7777780.3511680.1873980.9275360.218750.6250000.3333330.1666670.000.8
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 127 } ], "source": [ "X.head()" ] }, { "cell_type": "code", "execution_count": 128, "id": "6a79fe62", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6a79fe62", "outputId": "5e046db8-5fb9-4307-9b2b-92fcea21f6ef" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([208500, 181500, 223500, ..., 266500, 142125, 147500])" ] }, "metadata": {}, "execution_count": 128 } ], "source": [ "y" ] }, { "cell_type": "markdown", "id": "514a6436", "metadata": { "id": "514a6436" }, "source": [ "#### Spiliting the training data for testing purposes" ] }, { "cell_type": "code", "execution_count": 129, "id": "c2f3a67e", "metadata": { "id": "c2f3a67e" }, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)" ] }, { "cell_type": "code", "source": [ "X_train.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "_GSLonqaM1Qu", "outputId": "5e3c8d2c-dcc9-4e95-9136-7a07ee890a07" }, "id": "_GSLonqaM1Qu", "execution_count": 134, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " OverallQual GrLivArea TotalBsmtSF YearBuilt MasVnrArea Neighborhood \\\n", "618 0.888889 0.281462 0.298200 0.978261 0.28250 0.666667 \n", "870 0.444444 0.105501 0.146318 0.652174 0.00000 0.500000 \n", "92 0.444444 0.118689 0.143372 0.355072 0.00000 0.250000 \n", "817 0.777778 0.255275 0.256628 0.942029 0.09250 0.458333 \n", "302 0.666667 0.227393 0.252209 0.934783 0.09375 0.208333 \n", "\n", " BsmtFinType1 GarageType BsmtExposure SaleCondition \n", "618 0.333333 0.166667 0.00 1.0 \n", "870 1.000000 0.833333 0.75 0.8 \n", "92 0.000000 0.833333 0.75 0.8 \n", "817 0.333333 0.166667 0.75 0.8 \n", "302 1.000000 0.166667 0.75 0.8 " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OverallQualGrLivAreaTotalBsmtSFYearBuiltMasVnrAreaNeighborhoodBsmtFinType1GarageTypeBsmtExposureSaleCondition
6180.8888890.2814620.2982000.9782610.282500.6666670.3333330.1666670.001.0
8700.4444440.1055010.1463180.6521740.000000.5000001.0000000.8333330.750.8
920.4444440.1186890.1433720.3550720.000000.2500000.0000000.8333330.750.8
8170.7777780.2552750.2566280.9420290.092500.4583330.3333330.1666670.750.8
3020.6666670.2273930.2522090.9347830.093750.2083331.0000000.1666670.750.8
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 134 } ] }, { "cell_type": "markdown", "source": [ "##Loading the picked Models" ], "metadata": { "id": "daUHUmz_MMUm" }, "id": "daUHUmz_MMUm" }, { "cell_type": "code", "source": [ "# load trained models\n", "lgbm_base = load(open('lgbm_base.pkl', 'rb'))\n", "lgbm_opt = load(open('lgbm_optimized.pkl', 'rb'))\n", "lgbm_base_1 = load(open('lgbm_base_1.pkl', 'rb'))" ], "metadata": { "id": "XmNDC-oDMPHG" }, "id": "XmNDC-oDMPHG", "execution_count": 147, "outputs": [] }, { "cell_type": "markdown", "source": [ "##Testing the pickled Models" ], "metadata": { "id": "WZt3ONduMGKf" }, "id": "WZt3ONduMGKf" }, { "cell_type": "code", "source": [ "y_pred_base = lgbm_base.predict(testset)" ], "metadata": { "id": "rr5lBv99MFxc" }, "id": "rr5lBv99MFxc", "execution_count": 135, "outputs": [] }, { "cell_type": "code", "source": [ "y_pred_base" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zw-gobEgM5DG", "outputId": "7335bb77-a1e0-48ae-cbd3-eb2d6ad59ffa" }, "id": "Zw-gobEgM5DG", "execution_count": 136, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([255262.38181554, 260859.24936829, 204412.97596693, ...,\n", " 199498.24177768, 188655.7072259 , 234728.50900671])" ] }, "metadata": {}, "execution_count": 136 } ] }, { "cell_type": "code", "source": [ "y_pred_opt = lgbm_opt.predict(testset)" ], "metadata": { "id": "SPYHY4-DODwm" }, "id": "SPYHY4-DODwm", "execution_count": 142, "outputs": [] }, { "cell_type": "code", "source": [ "y_pred_opt" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "fW90b8UVNjyR", "outputId": "a9a15276-9311-4ed9-a59d-48c6cc67fca0" }, "id": "fW90b8UVNjyR", "execution_count": 143, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([269827.78722226, 286761.38304113, 262921.97161172, ...,\n", " 248499.95411655, 215731.19961257, 232258.18233653])" ] }, "metadata": {}, "execution_count": 143 } ] }, { "cell_type": "markdown", "source": [ "Testing for the Hugging Face App" ], "metadata": { "id": "0o2FFOsWONHL" }, "id": "0o2FFOsWONHL" }, { "cell_type": "code", "source": [ "type(testset.head(1))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "sYs300iRPMRS", "outputId": "1f190056-2800-467c-ce03-185af502e380" }, "id": "sYs300iRPMRS", "execution_count": 146, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "pandas.core.frame.DataFrame" ] }, "metadata": {}, "execution_count": 146 } ] }, { "cell_type": "code", "source": [ "lgbm_base_1.predict(testset.head(1))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3vXTtZDLNuI4", "outputId": "38ef4168-157f-4680-ad9b-c4c2f54514a5" }, "id": "3vXTtZDLNuI4", "execution_count": 148, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([255262.38181554])" ] }, "metadata": {}, "execution_count": 148 } ] }, { "cell_type": "code", "source": [ "from sklearn.preprocessing import MinMaxScaler\n", "\n", "scaler = MinMaxScaler()" ], "metadata": { "id": "YOkvZeIYQbKx" }, "id": "YOkvZeIYQbKx", "execution_count": 149, "outputs": [] }, { "cell_type": "code", "source": [ "name_list = [\n", " 'OverallQual',\n", " 'YearBuilt',\n", " 'TotalBsmtSF',\n", " 'GrLivArea',\n", " 'MasVnrArea',\n", " 'BsmtFinType1',\n", " 'Neighborhood',\n", " 'GarageType',\n", " 'SaleCondition',\n", " 'BsmtExposure']" ], "metadata": { "id": "ANSLYjz5Q40M" }, "id": "ANSLYjz5Q40M", "execution_count": 150, "outputs": [] }, { "cell_type": "code", "source": [ "min_list = [\n", " 1.0,\n", " 1950.0,\n", " 0.0,\n", " 0.0,\n", " 334.0,\n", " 1.0,\n", " 1.0,\n", " 1.0,\n", " 1.0,\n", " 0.0\n", "]\n", "\n", "max_list = [\n", " 10.0,\n", " 2010.0,\n", " 2336.0,\n", " 6110.0,\n", " 4692.0,\n", " 7.0,\n", " 25.0,\n", " 7.0,\n", " 6.0,\n", " 5.0,\n", "]" ], "metadata": { "id": "DBt2DkIrQ86k" }, "id": "DBt2DkIrQ86k", "execution_count": 151, "outputs": [] }, { "cell_type": "code", "source": [ "data_df = {\n", "\n", " 'OverallQual': [max_list[0]],\n", " 'YearBuilt': [max_list[1]],\n", " 'TotalBsmtSF': [max_list[2]],\n", " 'GrLivArea':[max_list[3]],\n", " 'MasVnrArea': [max_list[4]],\n", " 'BsmtFinType1': [max_list[5]],\n", " 'Neighborhood': [max_list[6]],\n", " 'GarageType': [max_list[7]],\n", " 'SaleCondition': [max_list[8]],\n", " 'BsmtExposure': [max_list[9]]\n", "}\n", "\n", "data_df = pd.DataFrame.from_dict(data_df)" ], "metadata": { "id": "8S5dIViJQ-EO" }, "id": "8S5dIViJQ-EO", "execution_count": 187, "outputs": [] }, { "cell_type": "code", "source": [ "data_df" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 81 }, "id": "5viM2IM-RahN", "outputId": "9d84fe53-d614-408a-ec57-832ed13ce65c" }, "id": "5viM2IM-RahN", "execution_count": 188, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " OverallQual YearBuilt TotalBsmtSF GrLivArea MasVnrArea BsmtFinType1 \\\n", "0 10.0 2010.0 2336.0 6110.0 4692.0 7.0 \n", "\n", " Neighborhood GarageType SaleCondition BsmtExposure \n", "0 25.0 7.0 6.0 5.0 " ], "text/html": [ "\n", "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OverallQualYearBuiltTotalBsmtSFGrLivAreaMasVnrAreaBsmtFinType1NeighborhoodGarageTypeSaleConditionBsmtExposure
010.02010.02336.06110.04692.07.025.07.06.05.0
\n", "
\n", " \n", "\n", "\n", "\n", "
\n", " \n", "
\n", "\n", "\n", "\n", " \n", "\n", " \n", " \n", "\n", " \n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 188 } ] }, { "cell_type": "code", "source": [ "data_df.info()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0l_5ZozXSa0H", "outputId": "8f69cac1-428d-444e-c21b-f2ee2336c995" }, "id": "0l_5ZozXSa0H", "execution_count": 189, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "RangeIndex: 1 entries, 0 to 0\n", "Data columns (total 10 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 OverallQual 1 non-null float64\n", " 1 YearBuilt 1 non-null float64\n", " 2 TotalBsmtSF 1 non-null float64\n", " 3 GrLivArea 1 non-null float64\n", " 4 MasVnrArea 1 non-null float64\n", " 5 BsmtFinType1 1 non-null float64\n", " 6 Neighborhood 1 non-null float64\n", " 7 GarageType 1 non-null float64\n", " 8 SaleCondition 1 non-null float64\n", " 9 BsmtExposure 1 non-null float64\n", "dtypes: float64(10)\n", "memory usage: 208.0 bytes\n" ] } ] }, { "cell_type": "code", "source": [ "d = data_df.head(1).values" ], "metadata": { "id": "aWOrIXssUFIB" }, "id": "aWOrIXssUFIB", "execution_count": 190, "outputs": [] }, { "cell_type": "markdown", "source": [ "Normalizing the data" ], "metadata": { "id": "XbRs-qwhSWlw" }, "id": "XbRs-qwhSWlw" }, { "cell_type": "code", "source": [ "diff = np.array(max_list)-np.array(min_list)\n", "diff" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bkpfncqYXary", "outputId": "b62bdc10-e0d7-4519-9657-bb2098d5e460" }, "id": "bkpfncqYXary", "execution_count": 191, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([9.000e+00, 6.000e+01, 2.336e+03, 6.110e+03, 4.358e+03, 6.000e+00,\n", " 2.400e+01, 6.000e+00, 5.000e+00, 5.000e+00])" ] }, "metadata": {}, "execution_count": 191 } ] }, { "cell_type": "code", "source": [ "data_df = (data_df.values - np.array(min_list)) / diff" ], "metadata": { "id": "PLkzas6uR170" }, "id": "PLkzas6uR170", "execution_count": 192, "outputs": [] }, { "cell_type": "code", "source": [ "data_df" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "m9UHje7uSVmP", "outputId": "1c224c94-79da-4819-b8af-36ae268e9950" }, "id": "m9UHje7uSVmP", "execution_count": 193, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])" ] }, "metadata": {}, "execution_count": 193 } ] }, { "cell_type": "code", "source": [ "lgbm_base_1.predict(data_df)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4fV7XrO-RhZK", "outputId": "56b14175-e06a-4936-9877-ceaefeff8efd" }, "id": "4fV7XrO-RhZK", "execution_count": 194, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([489729.86565528])" ] }, "metadata": {}, "execution_count": 194 } ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" }, "colab": { "provenance": [], "collapsed_sections": [ "8Vjfbt-sDp13", "ce7d0a75", "43ab061c", "42da68a9", "4d3cd6a1", "f1827825", "58ba1209", "d02aa749", "08eb4efb", "e572f249", "8c19de74", "5KMnVh6V-UZw" ] } }, "nbformat": 4, "nbformat_minor": 5 }