Spaces:
Sleeping
Sleeping
File size: 2,184 Bytes
4eea983 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import os
import pandas as pd
import logging
# Configure the root logger once at import time: INFO-and-above records are
# sent both to the "process_data.log" file and to a stream handler.
logging.basicConfig(
    handlers=[
        logging.FileHandler("process_data.log"),
        logging.StreamHandler(),
    ],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
# CSV locations used by this script, all inside the relative "data" directory:
# the previous enrichment, the current enrichment, the merged output, and the
# report of rows that still have no geometry.
(
    CITIES_ENRICHED_OLD,
    CITIES_ENRICHED_NEW,
    CITIES_ENRICHED_FINAL,
    MISSING,
) = (
    os.path.join("data", csv_name)
    for csv_name in (
        "cities_enriched_old.csv",
        "cities_enriched.csv",
        "cities_enriched_final.csv",
        "missing_final.csv",
    )
)
def load_data(path: str) -> pd.DataFrame:
    """Read the CSV file at *path* into a DataFrame.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The parsed table, using pandas' default parsing options.
    """
    return pd.read_csv(path)
def compare_cities(old: pd.DataFrame, new: pd.DataFrame) -> tuple:
    """Return the cities added to and removed from *new* relative to *old*.

    Cities are identified by the values of the ``"Kommune"`` column.

    Args:
        old: Previous table; must contain a ``"Kommune"`` column.
        new: Current table; must contain a ``"Kommune"`` column.

    Returns:
        A ``(new_cities, deleted_cities)`` pair of sets: names present only
        in *new*, and names present only in *old*.
    """
    old_cities = set(old["Kommune"].unique())
    current_cities = set(new["Kommune"].unique())
    # Both differences are taken from the full sets. Reusing the "added" set
    # when computing deletions would report every old city as deleted.
    new_cities = current_cities - old_cities
    deleted_cities = old_cities - current_cities
    return new_cities, deleted_cities
def enrich_new(old, new) -> pd.DataFrame:
    """Backfill missing geometries in *new* from matching rows of *old*.

    A row of *new* counts as missing when its ``"Geometry"`` cell is the
    literal string ``"[]"`` or is null (``None``/NaN, which is what
    ``pd.read_csv`` yields for an empty cell). For each such row the first
    row of *old* with the same ``"Kommune"`` supplies the geometry; if no
    name matches, the first row of *old* with the same ``"Code"`` is used.
    Rows with no match are left unchanged.

    Args:
        old: Previous table with ``"Kommune"``, ``"Code"`` and ``"Geometry"``
            columns.
        new: Current table with the same columns. It is modified in place.

    Returns:
        The same *new* object, after the in-place updates.
    """
    geometry = new["Geometry"]
    missing = new[geometry.isna() | geometry.eq("[]")]
    for row in missing.itertuples():
        # Prefer a match on the city name; fall back to the city code.
        candidates = old.loc[old["Kommune"] == row.Kommune, "Geometry"]
        if candidates.empty:
            candidates = old.loc[old["Code"] == row.Code, "Geometry"]
        if not candidates.empty:
            # Always write to "Geometry"; the code column must stay intact.
            new.at[row.Index, "Geometry"] = candidates.iloc[0]
    return new
def report_missing(new):
    """Log and return the rows of *new* that still have no geometry.

    A row counts as missing when its ``"Geometry"`` cell is the literal
    string ``"[]"`` or is null (``None``/NaN, which is what ``pd.read_csv``
    yields for an empty cell).

    Args:
        new: Table with ``"Kommune"`` and ``"Geometry"`` columns.

    Returns:
        The subset of *new* whose geometry is missing.
    """
    geometry = new["Geometry"]
    missing = new[geometry.isna() | geometry.eq("[]")]
    logging.info(f"Finally missing cities: {missing['Kommune'].unique()}")
    return missing
if __name__ == "__main__":
    # Load the previous and the current enrichment results.
    previous = load_data(CITIES_ENRICHED_OLD)
    current = load_data(CITIES_ENRICHED_NEW)

    # Log which cities appeared or disappeared between the two files.
    new_cities, deleted_cities = compare_cities(previous, current)
    logging.info(f"New cities: {new_cities}")
    logging.info(f"Deleted cities: {deleted_cities}")

    # Backfill geometries from the previous file and write the merged table.
    enriched = enrich_new(previous, current)
    enriched.to_csv(CITIES_ENRICHED_FINAL, index=False)

    # Write out whatever still has no geometry after the backfill.
    report_missing(enriched).to_csv(MISSING, index=False)
|