{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "bc9fba0f",
   "metadata": {},
   "source": [
    "# DataFrame for EU regional data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e470150",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os, sys\n",
    "sys.path.insert(1, os.path.abspath('..'))\n",
    "\n",
    "from stat_mod import *\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b380d28d-7e6e-4f5b-82e3-e1d9d2ec214a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create dicts with EU NUTS 2 regions\n",
    "eu_regions = { 'AT': 'Western Europe',\n",
    "'BE': 'Western Europe', 'FR': 'Western Europe', 'DE': 'Western Europe',\n",
    "'IE': 'Western Europe', 'LU': 'Western Europe', 'NL': 'Western Europe',\n",
    "'CY': 'Southern Europe', 'EL': 'Southern Europe', 'IT': 'Southern Europe',\n",
    "'MT': 'Southern Europe', 'PT': 'Southern Europe', 'ES': 'Southern Europe',\n",
    "'DK': 'Northern Europe', 'EE': 'Northern Europe', 'FI': 'Northern Europe',\n",
    "'LV': 'Northern Europe', 'LT': 'Northern Europe', 'SE': 'Northern Europe',\n",
    "'BG': 'Central and Eastern Europe', 'HR': 'Central and Eastern Europe',\n",
    "'CZ': 'Central and Eastern Europe', 'RO': 'Central and Eastern Europe',\n",
    "'SK': 'Central and Eastern Europe', 'SI': 'Central and Eastern Europe',\n",
    "'PL': 'Central and Eastern Europe', 'HU': 'Central and Eastern Europe' }\n",
    "\n",
    "regions = {}\n",
    "for item in countries.values():\n",
    "    regions.update(nuts_codes[item])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "dd5d63ab-9317-4aa0-a366-7f412cfd4cf4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regional \"GDP\"\n",
    "def get_gdp_region():\n",
    "    params = {'unit': 'MIO_EUR', 'geo': list(regions.keys()), 'time': 2020}\n",
    "    df = client.get_dataset('nama_10r_2gdp', params).to_dataframe()\n",
    "    \n",
    "    # Remove NAs\n",
    "    df.dropna(inplace = True)\n",
    "    df['region_name'] = df['geo'].apply(lambda x: regions[x])\n",
    "    df['Country'] = df['geo'].str[:2]\n",
    "    df['EU Region'] = df['Country'].apply(lambda x: eu_regions[x])\n",
    "    df.set_index('region_name', inplace=True) \n",
    "    df.rename(columns = {'values': 'GDP'}, inplace = True)\n",
    "    df['GDP'] = df['GDP'] / 1000\n",
    "    cols = ['Country', 'EU Region', 'GDP',]\n",
    "    \n",
    "    return df[cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ff71efcc-e4a6-4dce-9881-f431ddda628f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regional \"GDP\" per capita\n",
    "def get_gdp_capita_region():\n",
    "    params = {'unit': 'EUR_HAB', 'time': 2020,\n",
    "              'geo': list(regions.keys())}\n",
    "    df = client.get_dataset('nama_10r_2gdp', params).to_dataframe()\n",
    "\n",
    "    # Remove NAs\n",
    "    df.dropna(inplace = True)\n",
    "    df['region_name'] = df['geo'].apply(lambda x: regions[x])\n",
    "    df.set_index('region_name', inplace=True) \n",
    "    df.rename(columns = {'values': 'GDP per Capita'}, inplace = True)\n",
    "    df = df[['GDP per Capita']]\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7c9a96ad-052a-49b0-b364-cea1adb39312",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regional unemployment\n",
    "def get_unemployment_region():\n",
    "    params = {'sex': 'T', 'geo': list(regions.keys()), 'time': 2021,\n",
    "              'age': 'Y15-74', 'isced11': 'TOTAL'} \n",
    "    df = client.get_dataset('lfst_r_lfu3rt', params).to_dataframe()\n",
    "\n",
    "    # Remove NAs\n",
    "    df.dropna(inplace = True)\n",
    "    df['region_name'] = df['geo'].apply(lambda x: regions[x])\n",
    "    df.rename(columns = {'values': 'Unemployment %'}, inplace = True)\n",
    "    df.set_index('region_name', inplace=True) \n",
    "    df = df[['Unemployment %']]\n",
    "    \n",
    "    return df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d88a8821-1ff9-4081-90a5-710b11b03e1a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Life expectancy\n",
    "def get_life_expectancy():\n",
    "    params = {'sex': 'T', 'geo': list(regions.keys()), 'time': 2020}\n",
    "    df = client.get_dataset('tgs00101', params).to_dataframe()\n",
    "\n",
    "    # Remove NAs\n",
    "    df.dropna(inplace = True)\n",
    "    df['region_name'] = df['geo'].apply(lambda x: regions[x])\n",
    "    df.rename(columns = {'values': 'Life Expectancy'}, inplace = True)\n",
    "    df.set_index('region_name', inplace=True) \n",
    "    df = df[['Life Expectancy']]\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "8d9569f3-7ce7-429e-b22a-4abf7a5032bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get tertiary Educational attainment\n",
    "def get_tertiary_education():\n",
    "    params = {'sex': 'T', 'geo': list(regions.keys()), 'time': 2020}\n",
    "    df = client.get_dataset('tgs00109', params).to_dataframe()\n",
    "    \n",
    "    # Remove NAs\n",
    "    df.dropna(inplace = True)\n",
    "    df['region_name'] = df['geo'].apply(lambda x: regions[x])\n",
    "    df.rename(columns = {'values': 'Tertiary Educational Attainment %'}, inplace = True)\n",
    "    df.set_index('region_name', inplace=True) \n",
    "    df = df[['Tertiary Educational Attainment %']]\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "2206a2e3-4e77-4725-8bc5-b84978e52dcc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Population density\n",
    "def get_population_density():\n",
    "    params = {'geo': list(regions.keys()), 'time': 2019}\n",
    "    df = client.get_dataset('tgs00024', params).to_dataframe()\n",
    "\n",
    "    # Remove NAs\n",
    "    df.dropna(inplace = True)\n",
    "    df['region_name'] = df['geo'].apply(lambda x: regions[x])\n",
    "    df.rename(columns = {'values': 'Population Density'}, inplace = True)\n",
    "    df.set_index('region_name', inplace=True) \n",
    "    df = df[['Population Density']]\n",
    "    \n",
    "    return df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2eb66c5f-229b-41c4-adf3-db0ed0cb9589",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Poverty Risk\n",
    "def get_poverty_risk():\n",
    "    params = {'geo': list(regions.keys()), 'time': 2019}\n",
    "    df = client.get_dataset('ilc_peps11', params).to_dataframe()\n",
    "    \n",
    "    # Remove NAs\n",
    "    df.dropna(inplace = True)\n",
    "    df['region_name'] = df['geo'].apply(lambda x: regions[x])\n",
    "    df.rename(columns = {'values': 'People at Risk of Poverty %'}, inplace = True)\n",
    "    df.set_index('region_name', inplace=True) \n",
    "    df = df[['People at Risk of Poverty %']]\n",
    "    \n",
    "    return df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "757f2be5-2e42-4075-bda1-294d238f5aed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regional availability of doctors\n",
    "def get_doctors():\n",
    "    params = {'geo': list(regions.keys()), 'time': 2019,'unit': 'P_HTHAB',\n",
    "              'isco08': 'OC221' }\n",
    "    df = client.get_dataset('hlth_rs_prsrg', params).to_dataframe()\n",
    "\n",
    "    # Remove NAs\n",
    "    df.dropna(inplace = True)\n",
    "    df['region_name'] = df['geo'].apply(lambda x: regions[x])\n",
    "    df.rename(columns = {'values': 'Doctors per 100000'}, inplace = True)\n",
    "    df.set_index('region_name', inplace=True) \n",
    "    df = df[['Doctors per 100000']]\n",
    "    \n",
    "    return df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "f8c52f1a-a7ca-4a0c-8b29-fa22ecc32f47",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get deaths in road accidents\n",
    "def get_fatal_road_accidents():\n",
    "    params = {'victim': 'KIL', 'geo': list(regions.keys()), 'time': 2020,\n",
    "              'unit': 'P_MHAB'}\n",
    "    df = client.get_dataset('tran_r_acci', params).to_dataframe()\n",
    "\n",
    "    # Remove NAs\n",
    "    df.dropna(inplace = True)\n",
    "    df['region_name'] = df['geo'].apply(lambda x: regions[x])\n",
    "    df.rename(columns = {'values': 'Fatal Road Accidents per Million'}, inplace = True)\n",
    "    df.set_index('region_name', inplace=True) \n",
    "    df = df[['Fatal Road Accidents per Million']]\n",
    "    \n",
    "    return df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "efc48ba6-062d-4e0a-86c1-0df22c6238ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regular Internet Users\n",
    "\n",
    "def get_regular_internet_users():\n",
    "    params = {'indic_is': 'I_IDAY', 'geo': list(regions.keys()), 'time': 2021,\n",
    "              'unit': 'PC_IND'}\n",
    "    df = client.get_dataset('isoc_r_iuse_i', params).to_dataframe()\n",
    "\n",
    "    # Remove NAs\n",
    "    df.dropna(inplace = True)\n",
    "    df['region_name'] = df['geo'].apply(lambda x: regions[x])\n",
    "    df.rename(columns = {'values': 'Regular Internet Users %'}, inplace = True)\n",
    "    df.set_index('region_name', inplace=True) \n",
    "    df = df[['Regular Internet Users %']]\n",
    "    \n",
    "    return df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "d7b173c5-aa6c-4b21-b1a1-c642b18184ee",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Get all data\n",
    "df = get_gdp_region()\n",
    "df = df.join(get_gdp_capita_region())\n",
    "df = df.join(get_unemployment_region())\n",
    "df = df.join(get_life_expectancy())\n",
    "df = df.join(get_doctors())\n",
    "df = df.join(get_fatal_road_accidents())\n",
    "df = df.join(get_tertiary_education())\n",
    "df = df.join(get_population_density())\n",
    "df = df.join(get_poverty_risk())\n",
    "df = df.join(get_regular_internet_users())\n",
    "\n",
    "# Remove NAs\n",
    "df.dropna(thresh = 4, inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "478f421c-49e8-4334-ad2d-65cafd5b380f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 242 entries, Abruzzo to Южен централен (Yuzhen tsentralen)\n",
      "Data columns (total 12 columns):\n",
      " #   Column                             Non-Null Count  Dtype  \n",
      "---  ------                             --------------  -----  \n",
      " 0   Country                            242 non-null    object \n",
      " 1   EU Region                          242 non-null    object \n",
      " 2   GDP                                242 non-null    float64\n",
      " 3   GDP per Capita                     242 non-null    float64\n",
      " 4   Unemployment %                     238 non-null    float64\n",
      " 5   Life Expectancy                    238 non-null    float64\n",
      " 6   Doctors per 100000                 173 non-null    float64\n",
      " 7   Fatal Road Accidents per Million   239 non-null    float64\n",
      " 8   Tertiary Educational Attainment %  238 non-null    float64\n",
      " 9   Population Density                 239 non-null    float64\n",
      " 10  People at Risk of Poverty %        183 non-null    float64\n",
      " 11  Regular Internet Users %           169 non-null    float64\n",
      "dtypes: float64(10), object(2)\n",
      "memory usage: 24.6+ KB\n"
     ]
    }
   ],
   "source": [
    "# Check cols\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "9a335ba4-330e-49a4-bec4-bf37dc4c2e50",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Save to CSV\n",
    "df.to_csv('../data/eu_regional_data.csv',\n",
    "          float_format = '%.2f', encoding = 'utf-8')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "vscode": {
   "interpreter": {
    "hash": "9737f18efb1b9d105cd953add47348769fd82af91ada1884766527cfb8bc009e"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}