{ "cells": [ { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt \n", "import seaborn as sns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Customer segmentation for targeted marketing campaign\n", "\n", "https://www.kaggle.com/datasets/imakash3011/customer-personality-analysis\n", "\n", "**People**\n", "- ID: Customer's unique identifier\n", "- Year_Birth: Customer's birth year\n", "- Education: Customer's education level\n", "- Marital_Status: Customer's marital status\n", "- Income: Customer's yearly household income\n", "- Kidhome: Number of children in customer's household\n", "- Teenhome: Number of teenagers in customer's household\n", "- Dt_Customer: Date of customer's enrollment with the company\n", "- Recency: Number of days since customer's last purchase\n", "- Complain: 1 if the customer complained in the last 2 years, 0 otherwise\n", "\n", "**Products**\n", "- MntWines: Amount spent on wine in last 2 years\n", "- MntFruits: Amount spent on fruits in last 2 years\n", "- MntMeatProducts: Amount spent on meat in last 2 years\n", "- MntFishProducts: Amount spent on fish in last 2 years\n", "- MntSweetProducts: Amount spent on sweets in last 2 years\n", "- MntGoldProds: Amount spent on gold in last 2 years\n", "\n", "**Promotion**\n", "- NumDealsPurchases: Number of purchases made with a discount\n", "- AcceptedCmp1: 1 if customer accepted the offer in the 1st campaign, 0 otherwise\n", "- AcceptedCmp2: 1 if customer accepted the offer in the 2nd campaign, 0 otherwise\n", "- AcceptedCmp3: 1 if customer accepted the offer in the 3rd campaign, 0 otherwise\n", "- AcceptedCmp4: 1 if customer accepted the offer in the 4th campaign, 0 otherwise\n", "- AcceptedCmp5: 1 if customer accepted the offer in the 5th campaign, 0 otherwise\n", "- Response: 1 if customer accepted the offer in the last campaign, 0 otherwise\n", "\n", "**Place**\n", "- NumWebPurchases: Number of purchases made through the company’s website\n", "- NumCatalogPurchases: Number of purchases made using a catalogue\n", "- NumStorePurchases: Number of purchases made directly in stores\n", "- NumWebVisitsMonth: Number of visits to company’s website in the last month" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data Cleaning" ] }, { "cell_type": "code", "execution_count": 1363, "metadata": {}, "outputs": [], "source": [ "# Load dataset\n", "path_data_marketing = r\"C:\\Users\\LaurèneDAVID\\Documents\\Teaching\\Educational_apps\\app-hec-AI-DS\\data\\clustering\\marketing_campaign.csv\"\n", "marketing_data = pd.read_csv(path_data_marketing, sep=\";\")" ] }, { "cell_type": "code", "execution_count": 1364, "metadata": {}, "outputs": [], "source": [ "# Delete columns\n", "marketing_data.drop(columns=['ID','MntGoldProds','Response','Complain','AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1','AcceptedCmp2',\n", " 'Z_CostContact', 'Z_Revenue'], inplace=True)\n", "\n", "#marketing_data = marketing_data.loc[marketing_data[\"Marital_Status\"].isin([\"Single\",\"Married\",\"Divorced\"])]\n", "marketing_data.drop(columns=[\"Marital_Status\"], inplace=True)\n", "\n", "# marketing_data = marketing_data.loc[marketing_data[\"Education\"].isin([\"2n Cycle\",\"Graduation\",\"Master\",\"PhD\"])]\n", "marketing_data.drop(columns=[\"Education\"],inplace=True)\n", "\n", "marketing_data = marketing_data[marketing_data[\"Income\"]>5000]" ] }, { "cell_type": "code", "execution_count": 1365, "metadata": {}, "outputs": [], "source": [ "# Change column names\n", "new_columns = [col.replace(\"Mnt\",\"\").replace(\"Num\",\"\") for col in list(marketing_data.columns)]\n", "new_columns = [col + \"Products\" if col in [\"Wines\",\"Fruits\"] else col for col in new_columns]\n", "marketing_data.columns = new_columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data Preprocessing" ] }, { "cell_type": "code", "execution_count": 1366, "metadata": {}, "outputs": [], "source": [ "# Proportion of a customer's income spent on wines, fruits, ...\n", "products_col = [\"WinesProducts\",\"FruitsProducts\", \"MeatProducts\",\"FishProducts\",\"SweetProducts\"]\n", "total_amount_spent = marketing_data[products_col].sum(axis=1)\n", "\n", "for col in products_col:\n", " marketing_data[col] = (100*marketing_data[col] / total_amount_spent).round(1)" ] }, { "cell_type": "code", "execution_count": 1367, "metadata": {}, "outputs": [], "source": [ "# Proportion of web, catalog and store purchases (based on total number of purchases)\n", "purchases_col = [\"WebPurchases\", \"CatalogPurchases\", \"StorePurchases\"]\n", "total_purchases = marketing_data[purchases_col].sum(axis=1)\n", "\n", "for col in purchases_col:\n", " marketing_data[col] = (100*marketing_data[col] / total_purchases).round(1)" ] }, { "cell_type": "code", "execution_count": 1368, "metadata": {}, "outputs": [], "source": [ "from datetime import datetime, date\n", "\n", "def get_number_days(input_date):\n", " date1 = datetime.strptime(input_date, '%d/%m/%Y').date()\n", " date2 = date(2022, 2, 13)\n", " return (date2 - date1).days" ] }, { "cell_type": "code", "execution_count": 1369, "metadata": {}, "outputs": [], "source": [ "# Compute a customer's age, based on year of birth\n", "marketing_data.insert(0, \"Age\", marketing_data[\"Year_Birth\"].apply(lambda x: 2023-x))\n", "\n", "# Compute the number of days a customer has been subscribed \n", "marketing_data.insert(1, \"Days_subscription\", marketing_data[\"Dt_Customer\"].apply(get_number_days))\n", "\n", "# Compute total number of kids (kids + teens)\n", "marketing_data[\"Kids\"] = marketing_data[\"Kidhome\"] + marketing_data[\"Teenhome\"]\n", "marketing_data.drop(columns=[\"Kidhome\",\"Teenhome\"], inplace=True)\n", "\n", "marketing_data.drop(columns=[\"Year_Birth\", \"Dt_Customer\"], inplace=True)\n", "marketing_data.dropna(inplace=True)" ] }, { "cell_type": "code", "execution_count": 1370, "metadata": {}, "outputs": [], "source": [ "path_cleandata = r\"C:\\Users\\LaurèneDAVID\\Documents\\Teaching\\Educational_apps\\app-hec-AI-DS\\data\\clustering\"\n", "marketing_data.to_pickle(os.path.join(path_cleandata,\"clean_marketing.pkl\"))" ] }, { "cell_type": "code", "execution_count": 1371, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Age | \n", "Days_subscription | \n", "Income | \n", "Recency | \n", "WinesProducts | \n", "FruitsProducts | \n", "MeatProducts | \n", "FishProducts | \n", "SweetProducts | \n", "DealsPurchases | \n", "WebPurchases | \n", "CatalogPurchases | \n", "StorePurchases | \n", "WebVisitsMonth | \n", "Kids | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "66 | \n", "3449 | \n", "58138.0 | \n", "58 | \n", "41.5 | \n", "5.8 | \n", "35.7 | \n", "11.2 | \n", "5.8 | \n", "3 | \n", "36.4 | \n", "45.5 | \n", "18.2 | \n", "7 | \n", "0 | \n", "
1 | \n", "69 | \n", "2899 | \n", "46344.0 | \n", "38 | \n", "52.4 | \n", "4.8 | \n", "28.6 | \n", "9.5 | \n", "4.8 | \n", "2 | \n", "25.0 | \n", "25.0 | \n", "50.0 | \n", "5 | \n", "2 | \n", "
2 | \n", "58 | \n", "3098 | \n", "71613.0 | \n", "26 | \n", "58.0 | \n", "6.7 | \n", "17.3 | \n", "15.1 | \n", "2.9 | \n", "1 | \n", "40.0 | \n", "10.0 | \n", "50.0 | \n", "4 | \n", "0 | \n", "
3 | \n", "39 | \n", "2925 | \n", "26646.0 | \n", "26 | \n", "22.9 | \n", "8.3 | \n", "41.7 | \n", "20.8 | \n", "6.2 | \n", "2 | \n", "33.3 | \n", "0.0 | \n", "66.7 | \n", "6 | \n", "1 | \n", "
4 | \n", "42 | \n", "2947 | \n", "58293.0 | \n", "94 | \n", "42.5 | \n", "10.6 | \n", "29.0 | \n", "11.3 | \n", "6.6 | \n", "5 | \n", "35.7 | \n", "21.4 | \n", "42.9 | \n", "5 | \n", "1 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
2235 | \n", "56 | \n", "3167 | \n", "61223.0 | \n", "46 | \n", "64.8 | \n", "3.9 | \n", "16.6 | \n", "3.8 | \n", "10.8 | \n", "2 | \n", "56.2 | \n", "18.8 | \n", "25.0 | \n", "5 | \n", "1 | \n", "
2236 | \n", "77 | \n", "2805 | \n", "64014.0 | \n", "56 | \n", "93.1 | \n", "0.0 | \n", "6.9 | \n", "0.0 | \n", "0.0 | \n", "7 | \n", "53.3 | \n", "13.3 | \n", "33.3 | \n", "7 | \n", "3 | \n", "
2237 | \n", "42 | \n", "2941 | \n", "56981.0 | \n", "91 | \n", "74.6 | \n", "3.9 | \n", "17.8 | \n", "2.6 | \n", "1.0 | \n", "1 | \n", "11.1 | \n", "16.7 | \n", "72.2 | \n", "6 | \n", "0 | \n", "
2238 | \n", "67 | \n", "2942 | \n", "69245.0 | \n", "8 | \n", "54.7 | \n", "3.8 | \n", "27.4 | \n", "10.2 | \n", "3.8 | \n", "2 | \n", "28.6 | \n", "23.8 | \n", "47.6 | \n", "3 | \n", "1 | \n", "
2239 | \n", "69 | \n", "3408 | \n", "52869.0 | \n", "40 | \n", "55.6 | \n", "2.0 | \n", "40.4 | \n", "1.3 | \n", "0.7 | \n", "3 | \n", "37.5 | \n", "12.5 | \n", "50.0 | \n", "7 | \n", "2 | \n", "
2208 rows × 15 columns
\n", "