{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "authorship_tag": "ABX9TyPmt2lmbu+FwSqN/2ioK1mu" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "id": "I6tnizR9KmGN" }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n" ] }, { "cell_type": "code", "source": [ "car = pd.read_csv('https://raw.githubusercontent.com/rajtilakls2510/car_price_predictor/master/quikr_car.csv')\n", "car.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "_QB98RY-L_DI", "outputId": "85adff85-d29f-46d3-9b5c-c721a0dcc7f8" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " name company year Price \\\n", "0 Hyundai Santro Xing XO eRLX Euro III Hyundai 2007 80,000 \n", "1 Mahindra Jeep CL550 MDI Mahindra 2006 4,25,000 \n", "2 Maruti Suzuki Alto 800 Vxi Maruti 2018 Ask For Price \n", "3 Hyundai Grand i10 Magna 1.2 Kappa VTVT Hyundai 2014 3,25,000 \n", "4 Ford EcoSport Titanium 1.5L TDCi Ford 2014 5,75,000 \n", "\n", " kms_driven fuel_type \n", "0 45,000 kms Petrol \n", "1 40 kms Diesel \n", "2 22,000 kms Petrol \n", "3 28,000 kms Petrol \n", "4 36,000 kms Diesel " ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecompanyyearPricekms_drivenfuel_type
0Hyundai Santro Xing XO eRLX Euro IIIHyundai200780,00045,000 kmsPetrol
1Mahindra Jeep CL550 MDIMahindra20064,25,00040 kmsDiesel
2Maruti Suzuki Alto 800 VxiMaruti2018Ask For Price22,000 kmsPetrol
3Hyundai Grand i10 Magna 1.2 Kappa VTVTHyundai20143,25,00028,000 kmsPetrol
4Ford EcoSport Titanium 1.5L TDCiFord20145,75,00036,000 kmsDiesel
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 2 } ] }, { "cell_type": "code", "source": [ "car.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Wwy0Ve_wNG9M", "outputId": "43958f8c-46a1-4274-9c6b-7fb0f4e9c1d3" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(892, 6)" ] }, "metadata": {}, "execution_count": 3 } ] }, { "cell_type": "code", "source": [ "car.info()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Xh6mrkM0NLeU", "outputId": "e2b7a481-1583-46aa-e547-7da1f3d6ae82" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "RangeIndex: 892 entries, 0 to 891\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 name 892 non-null object\n", " 1 company 892 non-null object\n", " 2 year 892 non-null object\n", " 3 Price 892 non-null object\n", " 4 kms_driven 840 non-null object\n", " 5 fuel_type 837 non-null object\n", "dtypes: object(6)\n", "memory usage: 41.9+ KB\n" ] } ] }, { "cell_type": "code", "source": [ "car['year'].unique()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1U_wrin0NO0F", "outputId": "7ec4499e-c071-4ae0-9ea6-7a854185852e" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['2007', '2006', '2018', '2014', '2015', '2012', '2013', '2016',\n", " '2010', '2017', '2008', '2011', '2019', '2009', '2005', '2000',\n", " '...', '150k', 'TOUR', '2003', 'r 15', '2004', 'Zest', '/-Rs',\n", " 'sale', '1995', 'ara)', '2002', 'SELL', '2001', 'tion', 'odel',\n", " '2 bs', 'arry', 'Eon', 'o...', 'ture', 'emi', 'car', 'able', 'no.',\n", " 'd...', 'SALE', 'digo', 'sell', 'd Ex', 'n...', 'e...', 'D...',\n", " ', Ac', 'go .', 'k...', 'o c4', 'zire', 'cent', 'Sumo', 'cab',\n", " 't xe', 'EV2', 'r...', 'zest'], dtype=object)" ] }, "metadata": {}, 
"execution_count": 5 } ] }, { "cell_type": "markdown", "source": [ "In year column has alphbet values (Non-year value)" ], "metadata": { "id": "y93Rrz_YNhKo" } }, { "cell_type": "code", "source": [ "car['Price'].unique()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "H_n8jcMMNbfk", "outputId": "10cd6ade-1ba5-4998-d3ef-b00c00e4cc39" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['80,000', '4,25,000', 'Ask For Price', '3,25,000', '5,75,000',\n", " '1,75,000', '1,90,000', '8,30,000', '2,50,000', '1,82,000',\n", " '3,15,000', '4,15,000', '3,20,000', '10,00,000', '5,00,000',\n", " '3,50,000', '1,60,000', '3,10,000', '75,000', '1,00,000',\n", " '2,90,000', '95,000', '1,80,000', '3,85,000', '1,05,000',\n", " '6,50,000', '6,89,999', '4,48,000', '5,49,000', '5,01,000',\n", " '4,89,999', '2,80,000', '3,49,999', '2,84,999', '3,45,000',\n", " '4,99,999', '2,35,000', '2,49,999', '14,75,000', '3,95,000',\n", " '2,20,000', '1,70,000', '85,000', '2,00,000', '5,70,000',\n", " '1,10,000', '4,48,999', '18,91,111', '1,59,500', '3,44,999',\n", " '4,49,999', '8,65,000', '6,99,000', '3,75,000', '2,24,999',\n", " '12,00,000', '1,95,000', '3,51,000', '2,40,000', '90,000',\n", " '1,55,000', '6,00,000', '1,89,500', '2,10,000', '3,90,000',\n", " '1,35,000', '16,00,000', '7,01,000', '2,65,000', '5,25,000',\n", " '3,72,000', '6,35,000', '5,50,000', '4,85,000', '3,29,500',\n", " '2,51,111', '5,69,999', '69,999', '2,99,999', '3,99,999',\n", " '4,50,000', '2,70,000', '1,58,400', '1,79,000', '1,25,000',\n", " '2,99,000', '1,50,000', '2,75,000', '2,85,000', '3,40,000',\n", " '70,000', '2,89,999', '8,49,999', '7,49,999', '2,74,999',\n", " '9,84,999', '5,99,999', '2,44,999', '4,74,999', '2,45,000',\n", " '1,69,500', '3,70,000', '1,68,000', '1,45,000', '98,500',\n", " '2,09,000', '1,85,000', '9,00,000', '6,99,999', '1,99,999',\n", " '5,44,999', '1,99,000', '5,40,000', '49,000', '7,00,000', '55,000',\n", " 
'8,95,000', '3,55,000', '5,65,000', '3,65,000', '40,000',\n", " '4,00,000', '3,30,000', '5,80,000', '3,79,000', '2,19,000',\n", " '5,19,000', '7,30,000', '20,00,000', '21,00,000', '14,00,000',\n", " '3,11,000', '8,55,000', '5,35,000', '1,78,000', '3,00,000',\n", " '2,55,000', '5,49,999', '3,80,000', '57,000', '4,10,000',\n", " '2,25,000', '1,20,000', '59,000', '5,99,000', '6,75,000', '72,500',\n", " '6,10,000', '2,30,000', '5,20,000', '5,24,999', '4,24,999',\n", " '6,44,999', '5,84,999', '7,99,999', '4,44,999', '6,49,999',\n", " '9,44,999', '5,74,999', '3,74,999', '1,30,000', '4,01,000',\n", " '13,50,000', '1,74,999', '2,39,999', '99,999', '3,24,999',\n", " '10,74,999', '11,30,000', '1,49,000', '7,70,000', '30,000',\n", " '3,35,000', '3,99,000', '65,000', '1,69,999', '1,65,000',\n", " '5,60,000', '9,50,000', '7,15,000', '45,000', '9,40,000',\n", " '1,55,555', '15,00,000', '4,95,000', '8,00,000', '12,99,000',\n", " '5,30,000', '14,99,000', '32,000', '4,05,000', '7,60,000',\n", " '7,50,000', '4,19,000', '1,40,000', '15,40,000', '1,23,000',\n", " '4,98,000', '4,80,000', '4,88,000', '15,25,000', '5,48,900',\n", " '7,25,000', '99,000', '52,000', '28,00,000', '4,99,000',\n", " '3,81,000', '2,78,000', '6,90,000', '2,60,000', '90,001',\n", " '1,15,000', '15,99,000', '1,59,000', '51,999', '2,15,000',\n", " '35,000', '11,50,000', '2,69,000', '60,000', '4,30,000',\n", " '85,00,003', '4,01,919', '4,90,000', '4,24,000', '2,05,000',\n", " '5,49,900', '3,71,500', '4,35,000', '1,89,700', '3,89,700',\n", " '3,60,000', '2,95,000', '1,14,990', '10,65,000', '4,70,000',\n", " '48,000', '1,88,000', '4,65,000', '1,79,999', '21,90,000',\n", " '23,90,000', '10,75,000', '4,75,000', '10,25,000', '6,15,000',\n", " '19,00,000', '14,90,000', '15,10,000', '18,50,000', '7,90,000',\n", " '17,25,000', '12,25,000', '68,000', '9,70,000', '31,00,000',\n", " '8,99,000', '88,000', '53,000', '5,68,500', '71,000', '5,90,000',\n", " '7,95,000', '42,000', '1,89,000', '1,62,000', '35,999',\n", " '29,00,000', 
'39,999', '50,500', '5,10,000', '8,60,000',\n", " '5,00,001'], dtype=object)" ] }, "metadata": {}, "execution_count": 6 } ] }, { "cell_type": "markdown", "source": [ "Price column has 'Ask for price' string" ], "metadata": { "id": "VYVx8iFOOG5K" } }, { "cell_type": "code", "source": [ "car['kms_driven'].unique()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "UUVyLDZ2ODAc", "outputId": "82a219be-00b2-46b3-c279-324ecca81c06" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['45,000 kms', '40 kms', '22,000 kms', '28,000 kms', '36,000 kms',\n", " '59,000 kms', '41,000 kms', '25,000 kms', '24,530 kms',\n", " '60,000 kms', '30,000 kms', '32,000 kms', '48,660 kms',\n", " '4,000 kms', '16,934 kms', '43,000 kms', '35,550 kms',\n", " '39,522 kms', '39,000 kms', '55,000 kms', '72,000 kms',\n", " '15,975 kms', '70,000 kms', '23,452 kms', '35,522 kms',\n", " '48,508 kms', '15,487 kms', '82,000 kms', '20,000 kms',\n", " '68,000 kms', '38,000 kms', '27,000 kms', '33,000 kms',\n", " '46,000 kms', '16,000 kms', '47,000 kms', '35,000 kms',\n", " '30,874 kms', '15,000 kms', '29,685 kms', '1,30,000 kms',\n", " '19,000 kms', nan, '54,000 kms', '13,000 kms', '38,200 kms',\n", " '50,000 kms', '13,500 kms', '3,600 kms', '45,863 kms',\n", " '60,500 kms', '12,500 kms', '18,000 kms', '13,349 kms',\n", " '29,000 kms', '44,000 kms', '42,000 kms', '14,000 kms',\n", " '49,000 kms', '36,200 kms', '51,000 kms', '1,04,000 kms',\n", " '33,333 kms', '33,600 kms', '5,600 kms', '7,500 kms', '26,000 kms',\n", " '24,330 kms', '65,480 kms', '28,028 kms', '2,00,000 kms',\n", " '99,000 kms', '2,800 kms', '21,000 kms', '11,000 kms',\n", " '66,000 kms', '3,000 kms', '7,000 kms', '38,500 kms', '37,200 kms',\n", " '43,200 kms', '24,800 kms', '45,872 kms', '40,000 kms',\n", " '11,400 kms', '97,200 kms', '52,000 kms', '31,000 kms',\n", " '1,75,430 kms', '37,000 kms', '65,000 kms', '3,350 kms',\n", " '75,000 kms', '62,000 
kms', '73,000 kms', '2,200 kms',\n", " '54,870 kms', '34,580 kms', '97,000 kms', '60 kms', '80,200 kms',\n", " '3,200 kms', '0,000 kms', '5,000 kms', '588 kms', '71,200 kms',\n", " '1,75,400 kms', '9,300 kms', '56,758 kms', '10,000 kms',\n", " '56,450 kms', '56,000 kms', '32,700 kms', '9,000 kms', '73 kms',\n", " '1,60,000 kms', '84,000 kms', '58,559 kms', '57,000 kms',\n", " '1,70,000 kms', '80,000 kms', '6,821 kms', '23,000 kms',\n", " '34,000 kms', '1,800 kms', '4,00,000 kms', '48,000 kms',\n", " '90,000 kms', '12,000 kms', '69,900 kms', '1,66,000 kms',\n", " '122 kms', '0 kms', '24,000 kms', '36,469 kms', '7,800 kms',\n", " '24,695 kms', '15,141 kms', '59,910 kms', '1,00,000 kms',\n", " '4,500 kms', '1,29,000 kms', '300 kms', '1,31,000 kms',\n", " '1,11,111 kms', '59,466 kms', '25,500 kms', '44,005 kms',\n", " '2,110 kms', '43,222 kms', '1,00,200 kms', '65 kms',\n", " '1,40,000 kms', '1,03,553 kms', '58,000 kms', '1,20,000 kms',\n", " '49,800 kms', '100 kms', '81,876 kms', '6,020 kms', '55,700 kms',\n", " '18,500 kms', '1,80,000 kms', '53,000 kms', '35,500 kms',\n", " '22,134 kms', '1,000 kms', '8,500 kms', '87,000 kms', '6,000 kms',\n", " '15,574 kms', '8,000 kms', '55,800 kms', '56,400 kms',\n", " '72,160 kms', '11,500 kms', '1,33,000 kms', '2,000 kms',\n", " '88,000 kms', '65,422 kms', '1,17,000 kms', '1,50,000 kms',\n", " '10,750 kms', '6,800 kms', '5 kms', '9,800 kms', '57,923 kms',\n", " '30,201 kms', '6,200 kms', '37,518 kms', '24,652 kms', '383 kms',\n", " '95,000 kms', '3,528 kms', '52,500 kms', '47,900 kms',\n", " '52,800 kms', '1,95,000 kms', '48,008 kms', '48,247 kms',\n", " '9,400 kms', '64,000 kms', '2,137 kms', '10,544 kms', '49,500 kms',\n", " '1,47,000 kms', '90,001 kms', '48,006 kms', '74,000 kms',\n", " '85,000 kms', '29,500 kms', '39,700 kms', '67,000 kms',\n", " '19,336 kms', '60,105 kms', '45,933 kms', '1,02,563 kms',\n", " '28,600 kms', '41,800 kms', '1,16,000 kms', '42,590 kms',\n", " '7,400 kms', '54,500 kms', '76,000 kms', '00 kms', 
'11,523 kms',\n", " '38,600 kms', '95,500 kms', '37,458 kms', '85,960 kms',\n", " '12,516 kms', '30,600 kms', '2,550 kms', '62,500 kms',\n", " '69,000 kms', '28,400 kms', '68,485 kms', '3,500 kms',\n", " '85,455 kms', '63,000 kms', '1,600 kms', '77,000 kms',\n", " '26,500 kms', '2,875 kms', '13,900 kms', '1,500 kms', '2,450 kms',\n", " '1,625 kms', '33,400 kms', '60,123 kms', '38,900 kms',\n", " '1,37,495 kms', '91,200 kms', '1,46,000 kms', '1,00,800 kms',\n", " '2,100 kms', '2,500 kms', '1,32,000 kms', 'Petrol'], dtype=object)" ] }, "metadata": {}, "execution_count": 7 } ] }, { "cell_type": "markdown", "source": [ "All numeric data has camma, and kms.\n", "also column has 'Petrol' string" ], "metadata": { "id": "OW2xuYDdOeTa" } }, { "cell_type": "code", "source": [ "car['fuel_type'].unique()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8G0rfwtgOaeJ", "outputId": "562ebc56-52af-46da-9325-1bc708d0aa01" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['Petrol', 'Diesel', nan, 'LPG'], dtype=object)" ] }, "metadata": {}, "execution_count": 8 } ] }, { "cell_type": "markdown", "source": [ "## Quality of data\n", "\n", "1. year has many non-year values\n", "2. Year object to int\n", "3. price ask for price\n", "4. price object to int\n", "5. kms_driven has kms with intergers\n", "6. kms_driven object to int\n", "7. kms_driven has nan values\n", "8. fuel_type has nan value\n", "9. 
keep first 3 word of name" ], "metadata": { "id": "b6QNQf33PFZ0" } }, { "cell_type": "markdown", "source": [ "## Cleaning" ], "metadata": { "id": "GWwGOVJ_QGmQ" } }, { "cell_type": "code", "source": [ "backup = car.copy()" ], "metadata": { "id": "iXGXeDukPC2J" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Keep only rows whose year is all digits. .copy() makes the filtered frame\n", "# independent, so the column assignment in the next cell does not raise\n", "# SettingWithCopyWarning.\n", "car=car[car['year'].str.isnumeric()].copy()" ], "metadata": { "id": "_MNBGYqUQMvM" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "car['year']=car['year'].astype(int)" ], "metadata": { "id": "fYHyyMyNQnJj" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "car.info()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9Al6QL_oQ9e3", "outputId": "8d95c8a8-92a0-4fbd-c847-c1bd056fedda" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "Int64Index: 842 entries, 0 to 891\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 name 842 non-null object\n", " 1 company 842 non-null object\n", " 2 year 842 non-null int64 \n", " 3 Price 842 non-null object\n", " 4 kms_driven 840 non-null object\n", " 5 fuel_type 837 non-null object\n", "dtypes: int64(1), object(5)\n", "memory usage: 46.0+ KB\n" ] } ] }, { "cell_type": "code", "source": [ "car.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": 
"O7ivSz7URJIR", "outputId": "1d3282b9-118a-4173-9bc2-3acc5fd54a57" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(842, 6)" ] }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "code", "source": [ "# Drop listings without a numeric price; .copy() keeps later assignments warning-free.\n", "car=car[car['Price'] != 'Ask For Price'].copy()" ], "metadata": { "id": "3JQg_JyRRNL7" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "car['Price']=car['Price'].str.replace(',','').astype(int)" ], "metadata": { "id": "mA4qN2hvRiXQ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "car['kms_driven']=car['kms_driven'].str.split(' ').str.get(0).str.replace(',','')" ], "metadata": { "id": "uZ1Dc2d5R6i8" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Drop rows whose kms_driven is not purely digits (e.g. the stray 'Petrol' value).\n", "car=car[car['kms_driven'].str.isnumeric()].copy()" ], "metadata": { "id": "4quYafuESjHB" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "car['kms_driven']=car['kms_driven'].astype(int)" ], "metadata": { "id": "5uYORIqaWXWy" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Drop rows with a missing fuel_type; .copy() keeps the name assignment below warning-free.\n", "car=car[~car['fuel_type'].isna()].copy()" ], "metadata": { "id": "RhHcSCgsTVur" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "car.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "VTzBHi3DT7lS", "outputId": "a8c30b5c-bba9-48e9-f5c7-0da650f8a001" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(816, 6)" ] }, "metadata": {}, "execution_count": 20 } ] }, { "cell_type": "code", "source": [ "car['name']=car['name'].str.split(' ').str.slice(0,3).str.join(' ')" ], "metadata": { "id": "1qStv-CNVVvB" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "car.reset_index(drop=True)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "wUzYTdroVjuh", "outputId": 
"d20835c7-583c-4e75-f9fd-2c64734e5de1" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " name company year Price kms_driven fuel_type\n", "0 Hyundai Santro Xing Hyundai 2007 80000 45000 Petrol\n", "1 Mahindra Jeep CL550 Mahindra 2006 425000 40 Diesel\n", "2 Hyundai Grand i10 Hyundai 2014 325000 28000 Petrol\n", "3 Ford EcoSport Titanium Ford 2014 575000 36000 Diesel\n", "4 Ford Figo Ford 2012 175000 41000 Diesel\n", ".. ... ... ... ... ... ...\n", "811 Maruti Suzuki Ritz Maruti 2011 270000 50000 Petrol\n", "812 Tata Indica V2 Tata 2009 110000 30000 Diesel\n", "813 Toyota Corolla Altis Toyota 2009 300000 132000 Petrol\n", "814 Tata Zest XM Tata 2018 260000 27000 Diesel\n", "815 Mahindra Quanto C8 Mahindra 2013 390000 40000 Diesel\n", "\n", "[816 rows x 6 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecompanyyearPricekms_drivenfuel_type
0Hyundai Santro XingHyundai20078000045000Petrol
1Mahindra Jeep CL550Mahindra200642500040Diesel
2Hyundai Grand i10Hyundai201432500028000Petrol
3Ford EcoSport TitaniumFord201457500036000Diesel
4Ford FigoFord201217500041000Diesel
.....................
811Maruti Suzuki RitzMaruti201127000050000Petrol
812Tata Indica V2Tata200911000030000Diesel
813Toyota Corolla AltisToyota2009300000132000Petrol
814Tata Zest XMTata201826000027000Diesel
815Mahindra Quanto C8Mahindra201339000040000Diesel
\n", "

816 rows × 6 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 22 } ] }, { "cell_type": "code", "source": [ "car.info()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "zCqFEY-_V6Ki", "outputId": "ce435213-a276-4d46-d644-d4f0f23597b3" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "Int64Index: 816 entries, 0 to 889\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 name 816 non-null object\n", " 1 company 816 non-null object\n", " 2 year 816 non-null int64 \n", " 3 Price 816 non-null int64 \n", " 4 kms_driven 816 non-null int64 \n", " 5 fuel_type 816 non-null object\n", "dtypes: int64(3), object(3)\n", "memory usage: 44.6+ KB\n" ] } ] }, { "cell_type": "code", "source": [ "car.describe()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 300 }, "id": "97ilwJBOWEWj", "outputId": "8841325e-30cb-47f2-82da-16a2bf9210e4" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " year Price kms_driven\n", "count 816.000000 8.160000e+02 816.000000\n", "mean 2012.444853 4.117176e+05 46275.531863\n", "std 4.002992 4.751844e+05 34297.428044\n", "min 1995.000000 3.000000e+04 0.000000\n", "25% 2010.000000 1.750000e+05 27000.000000\n", "50% 2013.000000 2.999990e+05 41000.000000\n", "75% 2015.000000 4.912500e+05 56818.500000\n", "max 2019.000000 8.500003e+06 400000.000000" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearPricekms_driven
count816.0000008.160000e+02816.000000
mean2012.4448534.117176e+0546275.531863
std4.0029924.751844e+0534297.428044
min1995.0000003.000000e+040.000000
25%2010.0000001.750000e+0527000.000000
50%2013.0000002.999990e+0541000.000000
75%2015.0000004.912500e+0556818.500000
max2019.0000008.500003e+06400000.000000
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 24 } ] }, { "cell_type": "code", "source": [ "car=car[car['Price']<6e6]" ], "metadata": { "id": "I6ZlcuaYWn_4" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "car.to_csv('cleaned car.csv')" ], "metadata": { "id": "OSATadwKngdW" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Model" ], "metadata": { "id": "cY-nLpwXoz09" } }, { "cell_type": "code", "source": [ "x=car.drop(columns='Price')\n", "y=car.Price" ], "metadata": { "id": "KGCSWc8HoeGZ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "x" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "as2r7WYxpFs5", "outputId": "ead73d06-997f-4c07-998a-6e6dd74f1761" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " name company year kms_driven fuel_type\n", "0 Hyundai Santro Xing Hyundai 2007 45000 Petrol\n", "1 Mahindra Jeep CL550 Mahindra 2006 40 Diesel\n", "3 Hyundai Grand i10 Hyundai 2014 28000 Petrol\n", "4 Ford EcoSport Titanium Ford 2014 36000 Diesel\n", "6 Ford Figo Ford 2012 41000 Diesel\n", ".. ... ... ... ... ...\n", "883 Maruti Suzuki Ritz Maruti 2011 50000 Petrol\n", "885 Tata Indica V2 Tata 2009 30000 Diesel\n", "886 Toyota Corolla Altis Toyota 2009 132000 Petrol\n", "888 Tata Zest XM Tata 2018 27000 Diesel\n", "889 Mahindra Quanto C8 Mahindra 2013 40000 Diesel\n", "\n", "[815 rows x 5 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecompanyyearkms_drivenfuel_type
0Hyundai Santro XingHyundai200745000Petrol
1Mahindra Jeep CL550Mahindra200640Diesel
3Hyundai Grand i10Hyundai201428000Petrol
4Ford EcoSport TitaniumFord201436000Diesel
6Ford FigoFord201241000Diesel
..................
883Maruti Suzuki RitzMaruti201150000Petrol
885Tata Indica V2Tata200930000Diesel
886Toyota Corolla AltisToyota2009132000Petrol
888Tata Zest XMTata201827000Diesel
889Mahindra Quanto C8Mahindra201340000Diesel
\n", "

815 rows × 5 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 28 } ] }, { "cell_type": "code", "source": [ "y" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "EYo6tsKQpIPz", "outputId": "8a5237c2-a990-4df9-bfb6-c86013be1092" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0 80000\n", "1 425000\n", "3 325000\n", "4 575000\n", "6 175000\n", " ... \n", "883 270000\n", "885 110000\n", "886 300000\n", "888 260000\n", "889 390000\n", "Name: Price, Length: 815, dtype: int64" ] }, "metadata": {}, "execution_count": 29 } ] }, { "cell_type": "code", "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "x_train, x_test,y_train,y_test = train_test_split(x,y, test_size=0.2, random_state=1)" ], "metadata": { "id": "GU4fNOrzpLnE" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from sklearn.linear_model import LinearRegression\n", "from sklearn.metrics import r2_score\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.compose import make_column_transformer\n", "from sklearn.pipeline import make_pipeline" ], "metadata": { "id": "nZB719Azps7t" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "ohe = OneHotEncoder()\n", "\n", "ohe.fit(x[['name','company','fuel_type']])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 75 }, "id": "-3CjyNTrqDq0", "outputId": "13aa2a5a-5430-46bf-c9bc-80bc5b173237" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "OneHotEncoder()" ], "text/html": [ "
OneHotEncoder()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 32 } ] }, { "cell_type": "code", "source": [ "column_trans = make_column_transformer((OneHotEncoder(categories=ohe.categories_),['name','company','fuel_type']), remainder='passthrough')" ], "metadata": { "id": "Lz5Oa5CVryOD" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "lr=LinearRegression()" ], "metadata": { "id": "h2-fyidUsBgq" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "pipe=make_pipeline(column_trans,lr)" ], "metadata": { "id": "ItK-P-f_uPL4" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "pipe.fit(x_train,y_train)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 192 }, "id": "lFvb7w-hulX3", "outputId": "d755c75f-4aec-4f90-cd4e-c096b17e002a" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Pipeline(steps=[('columntransformer',\n", " ColumnTransformer(remainder='passthrough',\n", " transformers=[('onehotencoder',\n", " OneHotEncoder(categories=[array(['Audi A3 Cabriolet', 'Audi A4 1.8', 'Audi A4 2.0', 'Audi A6 2.0',\n", " 'Audi A8', 'Audi Q3 2.0', 'Audi Q5 2.0', 'Audi Q7', 'BMW 3 Series',\n", " 'BMW 5 Series', 'BMW 7 Series', 'BMW X1', 'BMW X1 sDrive20d',\n", " 'BMW X1 xDrive20d', 'Chevrolet Beat', 'Chevrolet Beat...\n", " array(['Audi', 'BMW', 'Chevrolet', 'Datsun', 'Fiat', 'Force', 'Ford',\n", " 'Hindustan', 'Honda', 'Hyundai', 'Jaguar', 'Jeep', 'Land',\n", " 'Mahindra', 'Maruti', 'Mercedes', 'Mini', 'Mitsubishi', 'Nissan',\n", " 'Renault', 'Skoda', 'Tata', 'Toyota', 'Volkswagen', 'Volvo'],\n", " dtype=object),\n", " array(['Diesel', 'LPG', 'Petrol'], dtype=object)]),\n", " ['name', 'company',\n", " 'fuel_type'])])),\n", " ('linearregression', LinearRegression())])" ], "text/html": [ "
Pipeline(steps=[('columntransformer',\n",
              "                 ColumnTransformer(remainder='passthrough',\n",
              "                                   transformers=[('onehotencoder',\n",
              "                                                  OneHotEncoder(categories=[array(['Audi A3 Cabriolet', 'Audi A4 1.8', 'Audi A4 2.0', 'Audi A6 2.0',\n",
              "       'Audi A8', 'Audi Q3 2.0', 'Audi Q5 2.0', 'Audi Q7', 'BMW 3 Series',\n",
              "       'BMW 5 Series', 'BMW 7 Series', 'BMW X1', 'BMW X1 sDrive20d',\n",
              "       'BMW X1 xDrive20d', 'Chevrolet Beat', 'Chevrolet Beat...\n",
              "                                                                            array(['Audi', 'BMW', 'Chevrolet', 'Datsun', 'Fiat', 'Force', 'Ford',\n",
              "       'Hindustan', 'Honda', 'Hyundai', 'Jaguar', 'Jeep', 'Land',\n",
              "       'Mahindra', 'Maruti', 'Mercedes', 'Mini', 'Mitsubishi', 'Nissan',\n",
              "       'Renault', 'Skoda', 'Tata', 'Toyota', 'Volkswagen', 'Volvo'],\n",
              "      dtype=object),\n",
              "                                                                            array(['Diesel', 'LPG', 'Petrol'], dtype=object)]),\n",
              "                                                  ['name', 'company',\n",
              "                                                   'fuel_type'])])),\n",
              "                ('linearregression', LinearRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 41 } ] }, { "cell_type": "code", "source": [ "y_pred=pipe.predict(x_test)" ], "metadata": { "id": "sGdoVcQ7uyL7" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "r2_score(y_test,y_pred)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "w8Tj31iru6v-", "outputId": "6f411965-f819-45f4-f97d-28fcabaf1f2f" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.4786019553698676" ] }, "metadata": {}, "execution_count": 44 } ] }, { "cell_type": "code", "source": [ "r2_scores= []\n", "random_i=[]\n", "for i in range(500):\n", " x_train, x_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=i)\n", " lr=LinearRegression()\n", " pipe=make_pipeline(column_trans,lr)\n", " pipe.fit(x_train,y_train)\n", " y_pred=pipe.predict(x_test)\n", " r2_scores.append(r2_score(y_test,y_pred))\n", " random_i.append(i)" ], "metadata": { "id": "Kdo3Pd9Wv2EE" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "random_data= pd.DataFrame({'Score':r2_scores,'Random_value':random_i})" ], "metadata": { "id": "IO3IvSfVxsBj" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "np.argmax(r2_scores)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "fChp9sltzzpj", "outputId": "3077a482-b60c-43da-a3f9-3b1f0fe1998c" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "433" ] }, "metadata": {}, "execution_count": 54 } ] }, { "cell_type": "code", "source": [ "r2_scores[433]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bWaARfLmz6Yh", "outputId": "93925ab2-dffc-40d2-d1e2-87866b8b51bf" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.8456515104452564" ] }, "metadata": {}, "execution_count": 55 } ] }, { "cell_type": "code", 
"source": [ "x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=np.argmax(r2_scores))\n", "lr=LinearRegression()\n", "pipe=make_pipeline(column_trans,lr)\n", "pipe.fit(x_train,y_train)\n", "y_pred=pipe.predict(x_test)\n", "r2_score(y_test,y_pred)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "JUb6oA4x0dTR", "outputId": "759bdbd3-ccc8-43c6-85f3-5d8ea4047482" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.8456515104452564" ] }, "metadata": {}, "execution_count": 56 } ] }, { "cell_type": "code", "source": [ "import pickle" ], "metadata": { "id": "R4j2B3Xx7Bm_" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "pickle.dump(pipe,open('LinearRegressionModel.pkl','wb'))" ], "metadata": { "id": "JOTFnco_7H65" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "pipe.predict(pd.DataFrame([['Maruti Suzuki Swift','Maruti',2019,100,'Petrol']],columns=['name','company','year','kms_driven','fuel_type']))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "L6PM14NG7ZWI", "outputId": "bc568fdb-bb40-436c-c6ce-caf4e9e19026" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([459113.49353657])" ] }, "metadata": {}, "execution_count": 66 } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "UKYqrC2n8ZzO" }, "execution_count": null, "outputs": [] } ] }