"""Fill in missing city geometries in a new dataset from an older one.

Loads the old and new enriched-city CSV files, logs which city names were
added or removed, copies geometries from the old data into rows of the new
data that lack one, and writes both the repaired dataset and the rows that
are still missing a geometry.
"""

import os
import pandas as pd
import logging

# Configure the root logger: messages go to a log file and to the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("process_data.log"),
        logging.StreamHandler(),
    ],
)

CITIES_ENRICHED_OLD = os.path.join("data", "cities_enriched_old.csv")
CITIES_ENRICHED_NEW = os.path.join("data", "cities_enriched.csv")
CITIES_ENRICHED_FINAL = os.path.join("data", "cities_enriched_final.csv")
MISSING = os.path.join("data", "missing_final.csv")


def load_data(path: str) -> pd.DataFrame:
    """Read the CSV file at *path* into a DataFrame."""
    return pd.read_csv(path)


def _missing_geometry_mask(df: pd.DataFrame) -> pd.Series:
    """Return a boolean mask of rows whose "Geometry" is absent or "[]"."""
    geometry = df["Geometry"]
    # Empty CSV cells are loaded as NaN (not None), so test with isna().
    return geometry.isna() | geometry.eq("[]")


def compare_cities(old: pd.DataFrame, new: pd.DataFrame) -> tuple:
    """Compare the "Kommune" names of two datasets.

    Args:
        old: Previous dataset; must contain a "Kommune" column.
        new: Current dataset; must contain a "Kommune" column.

    Returns:
        A ``(new_cities, deleted_cities)`` tuple of sets: names present only
        in *new*, and names present only in *old*.
    """
    # Missing names are dropped so NaN is never reported as a city.
    old_names = set(old["Kommune"].dropna().unique())
    new_names = set(new["Kommune"].dropna().unique())
    # Both differences are taken from the full name sets; reusing the
    # "added" set to compute the deletions would report every old city.
    return new_names - old_names, old_names - new_names


def enrich_new(old, new) -> pd.DataFrame:
    """Fill missing geometries in *new* using matching rows from *old*.

    For every row of *new* whose "Geometry" is absent or "[]", the geometry
    of the first *old* row with the same "Kommune" is copied in; if no name
    matches, the first *old* row with the same "Code" is used instead.

    Args:
        old: Previous dataset with "Kommune", "Code" and "Geometry" columns.
        new: Current dataset with the same columns. It is modified in place.

    Returns:
        The same *new* DataFrame object, after the in-place updates.
    """
    missing = new[_missing_geometry_mask(new)]
    for row in missing.itertuples():
        by_name = old[old["Kommune"] == row.Kommune]
        if len(by_name) > 0:
            new.at[row.Index, "Geometry"] = by_name["Geometry"].iloc[0]
            continue
        by_code = old[old["Code"] == row.Code]
        if len(by_code) > 0:
            # The geometry goes into "Geometry"; the row's code is kept.
            new.at[row.Index, "Geometry"] = by_code["Geometry"].iloc[0]
    return new


def report_missing(new):
    """Log and return the rows of *new* that still have no geometry.

    Args:
        new: Dataset with "Kommune" and "Geometry" columns.

    Returns:
        The subset of *new* whose "Geometry" is absent or "[]".
    """
    missing = new[_missing_geometry_mask(new)]
    logging.info("Finally missing cities: %s", missing["Kommune"].unique())
    return missing


def main() -> None:
    """Run the comparison and enrichment, then write the result files."""
    old = load_data(CITIES_ENRICHED_OLD)
    new = load_data(CITIES_ENRICHED_NEW)

    new_cities, deleted_cities = compare_cities(old, new)
    logging.info("New cities: %s", new_cities)
    logging.info("Deleted cities: %s", deleted_cities)

    new = enrich_new(old, new)
    new.to_csv(CITIES_ENRICHED_FINAL, index=False)

    missing = report_missing(new)
    missing.to_csv(MISSING, index=False)


if __name__ == "__main__":
    main()