{ "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "bc9fba0f", "metadata": {}, "source": [ "# DataFrame for EU regional data" ] }, { "cell_type": "code", "execution_count": null, "id": "0e470150", "metadata": {}, "outputs": [], "source": [ "import os, sys\n", "sys.path.insert(1, os.path.abspath('..'))\n", "\n", "from stat_mod import *\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 2, "id": "b380d28d-7e6e-4f5b-82e3-e1d9d2ec214a", "metadata": {}, "outputs": [], "source": [ "# Create dicts with EU NUTS 2 regions\n", "eu_regions = { 'AT': 'Western Europe',\n", "'BE': 'Western Europe', 'FR': 'Western Europe', 'DE': 'Western Europe',\n", "'IE': 'Western Europe', 'LU': 'Western Europe', 'NL': 'Western Europe',\n", "'CY': 'Southern Europe', 'EL': 'Southern Europe', 'IT': 'Southern Europe',\n", "'MT': 'Southern Europe', 'PT': 'Southern Europe', 'ES': 'Southern Europe',\n", "'DK': 'Northern Europe', 'EE': 'Northern Europe', 'FI': 'Northern Europe',\n", "'LV': 'Northern Europe', 'LT': 'Northern Europe', 'SE': 'Northern Europe',\n", "'BG': 'Central and Eastern Europe', 'HR': 'Central and Eastern Europe',\n", "'CZ': 'Central and Eastern Europe', 'RO': 'Central and Eastern Europe',\n", "'SK': 'Central and Eastern Europe', 'SI': 'Central and Eastern Europe',\n", "'PL': 'Central and Eastern Europe', 'HU': 'Central and Eastern Europe' }\n", "\n", "regions = {}\n", "for item in countries.values():\n", " regions.update(nuts_codes[item])" ] }, { "cell_type": "code", "execution_count": 3, "id": "dd5d63ab-9317-4aa0-a366-7f412cfd4cf4", "metadata": {}, "outputs": [], "source": [ "# Regional \"GDP\"\n", "def get_gdp_region():\n", " params = {'unit': 'MIO_EUR', 'geo': list(regions.keys()), 'time': 2020}\n", " df = client.get_dataset('nama_10r_2gdp', params).to_dataframe()\n", " \n", " # Remove NAs\n", " df.dropna(inplace = True)\n", " df['region_name'] = df['geo'].apply(lambda x: regions[x])\n", " df['Country'] = df['geo'].str[:2]\n", " df['EU Region'] = df['Country'].apply(lambda x: eu_regions[x])\n", " df.set_index('region_name', inplace=True) \n", " df.rename(columns = {'values': 'GDP'}, inplace = True)\n", " df['GDP'] = df['GDP'] / 1000\n", " cols = ['Country', 'EU Region', 'GDP',]\n", " \n", " return df[cols]" ] }, { "cell_type": "code", "execution_count": 4, "id": "ff71efcc-e4a6-4dce-9881-f431ddda628f", "metadata": {}, "outputs": [], "source": [ "# Regional \"GDP\" per capita\n", "def get_gdp_capita_region():\n", " params = {'unit': 'EUR_HAB', 'time': 2020,\n", " 'geo': list(regions.keys())}\n", " df = client.get_dataset('nama_10r_2gdp', params).to_dataframe()\n", "\n", " # Remove NAs\n", " df.dropna(inplace = True)\n", " df['region_name'] = df['geo'].apply(lambda x: regions[x])\n", " df.set_index('region_name', inplace=True) \n", " df.rename(columns = {'values': 'GDP per Capita'}, inplace = True)\n", " df = df[['GDP per Capita']]\n", " \n", " return df" ] }, { "cell_type": "code", "execution_count": 5, "id": "7c9a96ad-052a-49b0-b364-cea1adb39312", "metadata": {}, "outputs": [], "source": [ "# Regional unemployment\n", "def get_unemployment_region():\n", " params = {'sex': 'T', 'geo': list(regions.keys()), 'time': 2021,\n", " 'age': 'Y15-74', 'isced11': 'TOTAL'} \n", " df = client.get_dataset('lfst_r_lfu3rt', params).to_dataframe()\n", "\n", " # Remove NAs\n", " df.dropna(inplace = True)\n", " df['region_name'] = df['geo'].apply(lambda x: regions[x])\n", " df.rename(columns = {'values': 'Unemployment %'}, inplace = True)\n", " df.set_index('region_name', inplace=True) \n", " df = df[['Unemployment %']]\n", " \n", " return df\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "d88a8821-1ff9-4081-90a5-710b11b03e1a", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Life expectancy\n", "def get_life_expectancy():\n", " params = {'sex': 'T', 'geo': list(regions.keys()), 'time': 2020}\n", " df = client.get_dataset('tgs00101', params).to_dataframe()\n", "\n", " # Remove NAs\n", " df.dropna(inplace = True)\n", " df['region_name'] = df['geo'].apply(lambda x: regions[x])\n", " df.rename(columns = {'values': 'Life Expectancy'}, inplace = True)\n", " df.set_index('region_name', inplace=True) \n", " df = df[['Life Expectancy']]\n", " \n", " return df" ] }, { "cell_type": "code", "execution_count": 7, "id": "8d9569f3-7ce7-429e-b22a-4abf7a5032bb", "metadata": {}, "outputs": [], "source": [ "# Get tertiary Educational attainment\n", "def get_tertiary_education():\n", " params = {'sex': 'T', 'geo': list(regions.keys()), 'time': 2020}\n", " df = client.get_dataset('tgs00109', params).to_dataframe()\n", " \n", " # Remove NAs\n", " df.dropna(inplace = True)\n", " df['region_name'] = df['geo'].apply(lambda x: regions[x])\n", " df.rename(columns = {'values': 'Tertiary Educational Attainment %'}, inplace = True)\n", " df.set_index('region_name', inplace=True) \n", " df = df[['Tertiary Educational Attainment %']]\n", " \n", " return df" ] }, { "cell_type": "code", "execution_count": 8, "id": "2206a2e3-4e77-4725-8bc5-b84978e52dcc", "metadata": {}, "outputs": [], "source": [ "# Population density\n", "def get_population_density():\n", " params = {'geo': list(regions.keys()), 'time': 2019}\n", " df = client.get_dataset('tgs00024', params).to_dataframe()\n", "\n", " # Remove NAs\n", " df.dropna(inplace = True)\n", " df['region_name'] = df['geo'].apply(lambda x: regions[x])\n", " df.rename(columns = {'values': 'Population Density'}, inplace = True)\n", " df.set_index('region_name', inplace=True) \n", " df = df[['Population Density']]\n", " \n", " return df\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "2eb66c5f-229b-41c4-adf3-db0ed0cb9589", "metadata": {}, "outputs": [], "source": [ "# Poverty Risk\n", "def get_poverty_risk():\n", " params = {'geo': list(regions.keys()), 'time': 2019}\n", " df = client.get_dataset('ilc_peps11', params).to_dataframe()\n", " \n", " # Remove NAs\n", " df.dropna(inplace = True)\n", " df['region_name'] = df['geo'].apply(lambda x: regions[x])\n", " df.rename(columns = {'values': 'People at Risk of Poverty %'}, inplace = True)\n", " df.set_index('region_name', inplace=True) \n", " df = df[['People at Risk of Poverty %']]\n", " \n", " return df\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "757f2be5-2e42-4075-bda1-294d238f5aed", "metadata": {}, "outputs": [], "source": [ "# Regional availability of doctors\n", "def get_doctors():\n", " params = {'geo': list(regions.keys()), 'time': 2019,'unit': 'P_HTHAB',\n", " 'isco08': 'OC221' }\n", " df = client.get_dataset('hlth_rs_prsrg', params).to_dataframe()\n", "\n", " # Remove NAs\n", " df.dropna(inplace = True)\n", " df['region_name'] = df['geo'].apply(lambda x: regions[x])\n", " df.rename(columns = {'values': 'Doctors per 100000'}, inplace = True)\n", " df.set_index('region_name', inplace=True) \n", " df = df[['Doctors per 100000']]\n", " \n", " return df\n" ] }, { "cell_type": "code", "execution_count": 11, "id": "f8c52f1a-a7ca-4a0c-8b29-fa22ecc32f47", "metadata": {}, "outputs": [], "source": [ "# Get deaths in road accidents\n", "def get_fatal_road_accidents():\n", " params = {'victim': 'KIL', 'geo': list(regions.keys()), 'time': 2020,\n", " 'unit': 'P_MHAB'}\n", " df = client.get_dataset('tran_r_acci', params).to_dataframe()\n", "\n", " # Remove NAs\n", " df.dropna(inplace = True)\n", " df['region_name'] = df['geo'].apply(lambda x: regions[x])\n", " df.rename(columns = {'values': 'Fatal Road Accidents per Million'}, inplace = True)\n", " df.set_index('region_name', inplace=True) \n", " df = df[['Fatal Road Accidents per Million']]\n", " \n", " return df\n" ] }, { "cell_type": "code", "execution_count": 12, "id": "efc48ba6-062d-4e0a-86c1-0df22c6238ff", "metadata": {}, "outputs": [], "source": [ "# Regular Internet Users\n", "\n", "def get_regular_internet_users():\n", " params = {'indic_is': 'I_IDAY', 'geo': list(regions.keys()), 'time': 2021,\n", " 'unit': 'PC_IND'}\n", " df = client.get_dataset('isoc_r_iuse_i', params).to_dataframe()\n", "\n", " # Remove NAs\n", " df.dropna(inplace = True)\n", " df['region_name'] = df['geo'].apply(lambda x: regions[x])\n", " df.rename(columns = {'values': 'Regular Internet Users %'}, inplace = True)\n", " df.set_index('region_name', inplace=True) \n", " df = df[['Regular Internet Users %']]\n", " \n", " return df\n" ] }, { "cell_type": "code", "execution_count": 13, "id": "d7b173c5-aa6c-4b21-b1a1-c642b18184ee", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Get all data\n", "df = get_gdp_region()\n", "df = df.join(get_gdp_capita_region())\n", "df = df.join(get_unemployment_region())\n", "df = df.join(get_life_expectancy())\n", "df = df.join(get_doctors())\n", "df = df.join(get_fatal_road_accidents())\n", "df = df.join(get_tertiary_education())\n", "df = df.join(get_population_density())\n", "df = df.join(get_poverty_risk())\n", "df = df.join(get_regular_internet_users())\n", "\n", "# Remove NAs\n", "df.dropna(thresh = 4, inplace = True)" ] }, { "cell_type": "code", "execution_count": 14, "id": "478f421c-49e8-4334-ad2d-65cafd5b380f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 242 entries, Abruzzo to Южен централен (Yuzhen tsentralen)\n", "Data columns (total 12 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Country 242 non-null object \n", " 1 EU Region 242 non-null object \n", " 2 GDP 242 non-null float64\n", " 3 GDP per Capita 242 non-null float64\n", " 4 Unemployment % 238 non-null float64\n", " 5 Life Expectancy 238 non-null float64\n", " 6 Doctors per 100000 173 non-null float64\n", " 7 Fatal Road Accidents per Million 239 non-null float64\n", " 8 Tertiary Educational Attainment % 238 non-null float64\n", " 9 Population Density 239 non-null float64\n", " 10 People at Risk of Poverty % 183 non-null float64\n", " 11 Regular Internet Users % 169 non-null float64\n", "dtypes: float64(10), object(2)\n", "memory usage: 24.6+ KB\n" ] } ], "source": [ "# Check cols\n", "df.info()" ] }, { "cell_type": "code", "execution_count": 15, "id": "9a335ba4-330e-49a4-bec4-bf37dc4c2e50", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Save to CSV\n", "df.to_csv('../data/eu_regional_data.csv',\n", " float_format = '%.2f', encoding = 'utf-8')" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" }, "vscode": { "interpreter": { "hash": "9737f18efb1b9d105cd953add47348769fd82af91ada1884766527cfb8bc009e" } } }, "nbformat": 4, "nbformat_minor": 5 }