Spaces:
Sleeping
Sleeping
# %% | |
import polars as pl | |
# Notice that the world health leaves missing as blanks in the csv. We need to explain that blanks aren't strings but missing values. | |
dat = pl.read_csv("../API_Download_DS2_en_csv_v2_5657328.csv", skip_rows=4, null_values = "") | |
# We don't like the World Banks wide format. Let's clean it upt to long format. | |
dat_long = dat.melt(id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"]) | |
# no we need to fix the year column and give it a better name. | |
# we could have fixed the name as an argument in `.melt()` as well. | |
# https://docs.pola.rs/user-guide/expressions/casting/#overflow | |
dat_long = dat_long\ | |
.with_columns( | |
pl.col("variable").cast(pl.Int64, strict=False).alias("variable"), | |
pl.col("value").cast(pl.Float32, strict=False).alias("value"))\ | |
.rename({"variable":"year"})\ | |
.filter(pl.col("value").is_not_null()) | |
# %% | |
# Can we split out the information in the indicator Code | |
indicator_columns = dat_long\ | |
.select( | |
pl.col("Indicator Code"), | |
pl.col("Indicator Code")\ | |
# split string: example VC.IDP.TOCV into list object ["VC", "IDP", "TOCV"] | |
.str.split_exact(".", 6).alias("split")).unnest("split")\ | |
.unique() | |
# What should we call these columns? | |
# https://datahelpdesk.worldbank.org/knowledgebase/articles/201175-how-does-the-world-bank-code-its-indicators | |
new_names = {"field_0":"topic", "field_1":"general_subj", "field_2":"specific_subj", | |
"field_3":"ext_1", "field_4":"ext_2", "field_5":"ext_3", "field_6":"ext_4"} | |
indicator_columns = indicator_columns.rename(new_names) | |
# %% | |
# now we need to finalize our munge and write our data | |
dat_final = dat_long.join(indicator_columns, how="left", on="Indicator Code") | |
# Now I want to reorder the columns | |
names = dat_final.columns | |
new_order = [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0] | |
name_order = [names[i] for i in new_order] | |
dat_final = dat_final.select(name_order) | |
# %% | |
# write data | |
dat_final.write_csv("../dat_munged.csv") | |
# %% | |