# DataFrame for EU regional data

In [None]:
import os, sys
sys.path.insert(1, os.path.abspath('..'))

from stat_mod import *
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Map each two-letter country code to the broad European area it is grouped
# under. Built group by group; insertion order follows the groups below.
eu_regions = {
    **dict.fromkeys(
        ('AT', 'BE', 'FR', 'DE', 'IE', 'LU', 'NL'),
        'Western Europe',
    ),
    **dict.fromkeys(
        ('CY', 'EL', 'IT', 'MT', 'PT', 'ES'),
        'Southern Europe',
    ),
    **dict.fromkeys(
        ('DK', 'EE', 'FI', 'LV', 'LT', 'SE'),
        'Northern Europe',
    ),
    **dict.fromkeys(
        ('BG', 'HR', 'CZ', 'RO', 'SK', 'SI', 'PL', 'HU'),
        'Central and Eastern Europe',
    ),
}

# Merge the per-country entries of `nuts_codes` into a single lookup table.
# Its keys are later sent as the 'geo' filter and its values are used as
# region names.
regions = {}
for country_key in countries.values():
    regions.update(nuts_codes[country_key])

In [3]:
# Regional GDP, labelled with country and EU area
def get_gdp_region():
    """Fetch 2020 regional GDP and tag every region with its country and area.

    Requests dataset 'nama_10r_2gdp' through the module-level ``client`` with
    unit 'MIO_EUR' for every code in ``regions``. Rows containing any missing
    value are discarded. The 'values' column is renamed to 'GDP' and divided
    by 1000 (presumably turning millions of EUR into billions -- confirm
    against the unit code).

    Returns:
        DataFrame indexed by 'region_name' with the columns 'Country',
        'EU Region' and 'GDP'.
    """
    query = {'unit': 'MIO_EUR', 'geo': list(regions), 'time': 2020}
    gdp = client.get_dataset('nama_10r_2gdp', query).to_dataframe()

    # Discard incomplete rows before deriving the label columns
    gdp.dropna(inplace=True)

    geo_codes = gdp['geo']
    gdp['region_name'] = geo_codes.apply(regions.__getitem__)
    # The first two characters of the geo code are the country code
    gdp['Country'] = geo_codes.str[:2]
    gdp['EU Region'] = gdp['Country'].apply(eu_regions.__getitem__)

    gdp = gdp.set_index('region_name')
    gdp = gdp.rename(columns={'values': 'GDP'})
    gdp['GDP'] = gdp['GDP'].div(1000)

    return gdp[['Country', 'EU Region', 'GDP']]

In [4]:
# Regional GDP per capita
def get_gdp_capita_region():
    """Fetch 2020 regional GDP per inhabitant.

    Requests dataset 'nama_10r_2gdp' through the module-level ``client`` with
    unit 'EUR_HAB' for every code in ``regions`` and discards rows containing
    any missing value.

    Returns:
        Single-column DataFrame 'GDP per Capita' indexed by 'region_name'.
    """
    label = 'GDP per Capita'
    query = {'unit': 'EUR_HAB', 'time': 2020, 'geo': list(regions)}
    table = client.get_dataset('nama_10r_2gdp', query).to_dataframe()

    # Discard incomplete rows
    table.dropna(inplace=True)

    table['region_name'] = table['geo'].apply(regions.__getitem__)
    table = table.set_index('region_name')
    table = table.rename(columns={'values': label})

    return table[[label]]

In [5]:
# Regional unemployment rate
def get_unemployment_region():
    """Fetch the 2021 regional unemployment figures.

    Requests dataset 'lfst_r_lfu3rt' through the module-level ``client``
    filtered to sex 'T', age 'Y15-74' and isced11 'TOTAL' for every code in
    ``regions``, then discards rows containing any missing value.

    Returns:
        Single-column DataFrame 'Unemployment %' indexed by 'region_name'.
    """
    label = 'Unemployment %'
    query = {
        'sex': 'T',
        'geo': list(regions),
        'time': 2021,
        'age': 'Y15-74',
        'isced11': 'TOTAL',
    }
    table = client.get_dataset('lfst_r_lfu3rt', query).to_dataframe()

    # Discard incomplete rows
    table.dropna(inplace=True)

    table['region_name'] = table['geo'].apply(regions.__getitem__)
    table = table.rename(columns={'values': label})
    table = table.set_index('region_name')

    return table[[label]]


In [6]:
# Regional life expectancy
def get_life_expectancy():
    """Fetch the 2020 regional life expectancy figures.

    Requests dataset 'tgs00101' through the module-level ``client`` filtered
    to sex 'T' for every code in ``regions``, then discards rows containing
    any missing value.

    Returns:
        Single-column DataFrame 'Life Expectancy' indexed by 'region_name'.
    """
    label = 'Life Expectancy'
    query = {'sex': 'T', 'geo': list(regions), 'time': 2020}
    table = client.get_dataset('tgs00101', query).to_dataframe()

    # Discard incomplete rows
    table.dropna(inplace=True)

    table['region_name'] = table['geo'].apply(regions.__getitem__)
    table = table.rename(columns={'values': label})
    table = table.set_index('region_name')

    return table[[label]]

In [7]:
# Regional tertiary educational attainment
def get_tertiary_education():
    """Fetch the 2020 regional tertiary educational attainment figures.

    Requests dataset 'tgs00109' through the module-level ``client`` filtered
    to sex 'T' for every code in ``regions``, then discards rows containing
    any missing value.

    Returns:
        Single-column DataFrame 'Tertiary Educational Attainment %' indexed
        by 'region_name'.
    """
    label = 'Tertiary Educational Attainment %'
    query = {'sex': 'T', 'geo': list(regions), 'time': 2020}
    table = client.get_dataset('tgs00109', query).to_dataframe()

    # Discard incomplete rows
    table.dropna(inplace=True)

    table['region_name'] = table['geo'].apply(regions.__getitem__)
    table = table.rename(columns={'values': label})
    table = table.set_index('region_name')

    return table[[label]]

In [8]:
# Regional population density
def get_population_density():
    """Fetch the 2019 regional population density figures.

    Requests dataset 'tgs00024' through the module-level ``client`` for every
    code in ``regions`` and discards rows containing any missing value.

    Returns:
        Single-column DataFrame 'Population Density' indexed by
        'region_name'.
    """
    label = 'Population Density'
    query = {'geo': list(regions), 'time': 2019}
    table = client.get_dataset('tgs00024', query).to_dataframe()

    # Discard incomplete rows
    table.dropna(inplace=True)

    table['region_name'] = table['geo'].apply(regions.__getitem__)
    table = table.rename(columns={'values': label})
    table = table.set_index('region_name')

    return table[[label]]


In [9]:
# Regional poverty risk
def get_poverty_risk():
    """Fetch the 2019 regional at-risk-of-poverty figures.

    Requests dataset 'ilc_peps11' through the module-level ``client`` for
    every code in ``regions`` and discards rows containing any missing value.
    No unit filter is sent with the request.

    Returns:
        Single-column DataFrame 'People at Risk of Poverty %' indexed by
        'region_name'.
    """
    label = 'People at Risk of Poverty %'
    query = {'geo': list(regions), 'time': 2019}
    table = client.get_dataset('ilc_peps11', query).to_dataframe()

    # Discard incomplete rows
    table.dropna(inplace=True)

    table['region_name'] = table['geo'].apply(regions.__getitem__)
    table = table.rename(columns={'values': label})
    table = table.set_index('region_name')

    return table[[label]]


In [10]:
# Regional availability of doctors
def get_doctors():
    """Fetch the 2019 regional figures on availability of doctors.

    Requests dataset 'hlth_rs_prsrg' through the module-level ``client``
    filtered to unit 'P_HTHAB' and isco08 'OC221' for every code in
    ``regions``, then discards rows containing any missing value.

    Returns:
        Single-column DataFrame 'Doctors per 100000' indexed by
        'region_name'.
    """
    label = 'Doctors per 100000'
    query = {
        'geo': list(regions),
        'time': 2019,
        'unit': 'P_HTHAB',
        'isco08': 'OC221',
    }
    table = client.get_dataset('hlth_rs_prsrg', query).to_dataframe()

    # Discard incomplete rows
    table.dropna(inplace=True)

    table['region_name'] = table['geo'].apply(regions.__getitem__)
    table = table.rename(columns={'values': label})
    table = table.set_index('region_name')

    return table[[label]]


In [11]:
# Regional deaths in road accidents
def get_fatal_road_accidents():
    """Fetch the 2020 regional road-accident fatality figures.

    Requests dataset 'tran_r_acci' through the module-level ``client``
    filtered to victim 'KIL' and unit 'P_MHAB' for every code in ``regions``,
    then discards rows containing any missing value.

    Returns:
        Single-column DataFrame 'Fatal Road Accidents per Million' indexed by
        'region_name'.
    """
    label = 'Fatal Road Accidents per Million'
    query = {
        'victim': 'KIL',
        'geo': list(regions),
        'time': 2020,
        'unit': 'P_MHAB',
    }
    table = client.get_dataset('tran_r_acci', query).to_dataframe()

    # Discard incomplete rows
    table.dropna(inplace=True)

    table['region_name'] = table['geo'].apply(regions.__getitem__)
    table = table.rename(columns={'values': label})
    table = table.set_index('region_name')

    return table[[label]]


In [12]:
# Regional share of regular internet users
def get_regular_internet_users():
    """Fetch the 2021 regional figures on regular internet use.

    Requests dataset 'isoc_r_iuse_i' through the module-level ``client``
    filtered to indic_is 'I_IDAY' and unit 'PC_IND' for every code in
    ``regions``, then discards rows containing any missing value.

    Returns:
        Single-column DataFrame 'Regular Internet Users %' indexed by
        'region_name'.
    """
    label = 'Regular Internet Users %'
    query = {
        'indic_is': 'I_IDAY',
        'geo': list(regions),
        'time': 2021,
        'unit': 'PC_IND',
    }
    table = client.get_dataset('isoc_r_iuse_i', query).to_dataframe()

    # Discard incomplete rows
    table.dropna(inplace=True)

    table['region_name'] = table['geo'].apply(regions.__getitem__)
    table = table.rename(columns={'values': label})
    table = table.set_index('region_name')

    return table[[label]]


In [13]:
# Assemble the combined table: start from regional GDP and left-join every
# other indicator onto it by region name, in this fixed order.
df = get_gdp_region()
for fetch_indicator in (
    get_gdp_capita_region,
    get_unemployment_region,
    get_life_expectancy,
    get_doctors,
    get_fatal_road_accidents,
    get_tertiary_education,
    get_population_density,
    get_poverty_risk,
    get_regular_internet_users,
):
    df = df.join(fetch_indicator())

# Keep only rows that have at least four non-missing values
df.dropna(thresh=4, inplace=True)

In [14]:
# Check cols: print each column's name, non-null count and dtype for the
# joined table, to see how complete every indicator is.
df.info()


Index: 242 entries, Abruzzo to Южен централен (Yuzhen tsentralen)
Data columns (total 12 columns):
 # Column Non-Null Count Dtype 
--- ------ -------------- ----- 
 0 Country 242 non-null object 
 1 EU Region 242 non-null object 
 2 GDP 242 non-null float64
 3 GDP per Capita 242 non-null float64
 4 Unemployment % 238 non-null float64
 5 Life Expectancy 238 non-null float64
 6 Doctors per 100000 173 non-null float64
 7 Fatal Road Accidents per Million 239 non-null float64
 8 Tertiary Educational Attainment % 238 non-null float64
 9 Population Density 239 non-null float64
 10 People at Risk of Poverty % 183 non-null float64
 11 Regular Internet Users % 169 non-null float64
dtypes: float64(10), object(2)
memory usage: 24.6+ KB


In [15]:
# Write the combined table to disk as UTF-8 CSV; floats are formatted with
# two decimal places.
output_path = '../data/eu_regional_data.csv'
df.to_csv(output_path, float_format='%.2f', encoding='utf-8')