File size: 2,184 Bytes
4eea983
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import os
import pandas as pd
import logging

# Root logger setup: INFO and above, written both to "process_data.log"
# and to the console stream, using one shared line format.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
_LOG_HANDLERS = [logging.FileHandler("process_data.log"), logging.StreamHandler()]
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, handlers=_LOG_HANDLERS)


# CSV locations used by this script; every file lives in the "data" directory.
_DATA_DIR = "data"
CITIES_ENRICHED_OLD = os.path.join(_DATA_DIR, "cities_enriched_old.csv")
CITIES_ENRICHED_NEW = os.path.join(_DATA_DIR, "cities_enriched.csv")
CITIES_ENRICHED_FINAL = os.path.join(_DATA_DIR, "cities_enriched_final.csv")
MISSING = os.path.join(_DATA_DIR, "missing_final.csv")


def load_data(path: str) -> pd.DataFrame:
    """Read the CSV at *path* with pandas' default settings and return it."""
    return pd.read_csv(path)


def compare_cities(old: pd.DataFrame, new: pd.DataFrame) -> tuple:
    """Compare the "Kommune" values of two tables.

    Args:
        old: Previous table; must contain a "Kommune" column.
        new: Current table; must contain a "Kommune" column.

    Returns:
        A ``(new_cities, deleted_cities)`` tuple of sets: names present only
        in *new*, and names present only in *old*.
    """
    old_names = set(old["Kommune"].unique())
    new_names = set(new["Kommune"].unique())
    # Both differences are taken from the full name sets. Reusing the
    # "added" set as the right-hand side would report every old city as
    # deleted.
    added = new_names - old_names
    deleted = old_names - new_names
    return added, deleted


def enrich_new(old, new) -> pd.DataFrame:
    """Fill missing "Geometry" values in *new* from matching rows of *old*.

    A geometry counts as missing when it is NA/None or the literal string
    "[]". For each such row in *new*, the first row of *old* with the same
    "Kommune" and a usable geometry supplies the value; if there is none, the
    first row of *old* with the same "Code" and a usable geometry is used.
    Rows without any match are left unchanged.

    Args:
        old: Previous table with "Kommune", "Code" and "Geometry" columns.
        new: Current table with the same columns. It is modified in place.

    Returns:
        The same *new* object, after the in-place updates.
    """

    def _is_missing(geometry: pd.Series) -> pd.Series:
        # Empty CSV cells are read as NaN, so check for NA as well as "[]".
        return geometry.isna() | (geometry == "[]")

    # Only old rows that actually carry a geometry can act as donors.
    donors = old[~_is_missing(old["Geometry"])]
    missing = new[_is_missing(new["Geometry"])]

    for row in missing.itertuples():
        by_name = donors[donors["Kommune"] == row.Kommune]
        if not by_name.empty:
            new.at[row.Index, "Geometry"] = by_name["Geometry"].iloc[0]
            continue
        # Fall back to the code when the name did not match. The value must
        # go into "Geometry"; writing it to "Code" would corrupt the code.
        by_code = donors[donors["Code"] == row.Code]
        if not by_code.empty:
            new.at[row.Index, "Geometry"] = by_code["Geometry"].iloc[0]
    return new


def report_missing(new):
    """Log and return the rows of *new* that still have no geometry.

    A geometry counts as missing when it is NA/None or the literal string
    "[]". NA is included because empty CSV cells are read by pandas as NaN,
    which an ``is None`` test does not catch.

    Args:
        new: Table with "Kommune" and "Geometry" columns.

    Returns:
        The subset of *new* whose "Geometry" is missing.
    """
    geometry = new["Geometry"]
    missing = new[geometry.isna() | (geometry == "[]")]
    logging.info("Finally missing cities: %s", missing["Kommune"].unique())
    return missing


if __name__ == "__main__":
    # Script entry point: load both tables, log the name differences, fill
    # in geometries, then write the enriched table and the remaining gaps.
    previous_df = load_data(CITIES_ENRICHED_OLD)
    current_df = load_data(CITIES_ENRICHED_NEW)

    added, removed = compare_cities(previous_df, current_df)
    logging.info(f"New cities: {added}")
    logging.info(f"Deleted cities: {removed}")

    enriched_df = enrich_new(previous_df, current_df)
    enriched_df.to_csv(CITIES_ENRICHED_FINAL, index=False)

    # Rows that are still without a geometry go to their own CSV.
    report_missing(enriched_df).to_csv(MISSING, index=False)