from os.path import join, dirname
import numpy as np
import pandas as pd

# Cities whose rows get their "country" value overwritten with COUNTRY_CODE.
# NOTE(review): these look like Namibian place names; presumably the dataset's
# original country code for them ("NA"?) is parsed as a missing value by
# pandas on read, which is why they are relabelled here — confirm.
CITIES = (
    "Walvis Bay",
    "Keetmanshoop",
    "Warmbad",
    "Rundu",
    "Outapi",
    "Karibib",
    "Otjimbingwe",
    "Ondangwa",
    "Oranjemund",
    "Maltahohe",
    "Otavi",
    "Outjo",
    "Swakopmund",
    "Gobabis",
    "Karasburg",
    "Opuwo",
    "Hentiesbaai",
    "Katima Mulilo",
    "Oshikango",
    "Bethanie",
    "Ongandjera",
    "Mariental",
    "Bagani",
    "Nkurenkuru",
    "Usakos",
    "Rehoboth",
    "Aranos",
    "Omaruru",
    "Arandis",
    "Windhoek",
    "Khorixas",
    "Okahandja",
    "Grootfontein",
    "Tsumeb",
)

# Value written into "country" for every row whose "city" is in CITIES.
COUNTRY_CODE = "NMB"

# Force these columns to be read as strings rather than letting pandas infer.
CSV_DTYPE = {"category": str, "country": str, "city": str}

# A row missing any of these columns is dropped before the file is rewritten.
REQUIRED_COLUMNS = ["id", "latitude", "longitude"]


def relabel_and_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Relabel the country of the listed cities and drop incomplete rows.

    The frame is modified in place and also returned for convenience.

    Steps, in order:
      1. Every row whose "city" is in ``CITIES`` gets "country" set to
         ``COUNTRY_CODE``.
      2. Every distinct value left in "country" must be a ``str``; a missing
         value (NaN) fails this check.
      3. Rows with a missing value in any of ``REQUIRED_COLUMNS`` are dropped.

    Args:
        df: Frame with at least the columns "city", "country", "id",
            "latitude" and "longitude".

    Returns:
        The same ``df`` object after relabelling and row removal.

    Raises:
        ValueError: If "country" still contains a non-string value after the
            relabelling step. The check runs before rows are dropped, so it
            covers rows that would be removed in step 3 as well.
    """
    # Match on the "city" column; only the "country" column is overwritten.
    in_listed_city = df["city"].isin(CITIES)
    df.loc[in_listed_city, "country"] = COUNTRY_CODE

    # Explicit raise instead of `assert`: assertions are stripped under
    # `python -O`, which would let a bad file be written back silently.
    non_string = [
        value
        for value in df["country"].unique().tolist()
        if not isinstance(value, str)
    ]
    if non_string:
        raise ValueError(
            "'country' column still contains non-string values after "
            f"relabelling: {non_string!r}"
        )

    # Drops ROWS (not columns) that lack an id or a coordinate.
    df.dropna(subset=REQUIRED_COLUMNS, inplace=True)
    return df


def process_split(split: str) -> None:
    """Clean ``datasets/osv5m/<split>.csv`` and overwrite it in place.

    The path is resolved relative to the parent of this script's directory.

    Args:
        split: File stem of the CSV to process (e.g. "train" or "test").

    Raises:
        ValueError: Propagated from ``relabel_and_clean``; the CSV is left
            untouched in that case because nothing has been written yet.
    """
    fp = join(dirname(dirname(__file__)), "datasets", "osv5m", f"{split}.csv")
    df = pd.read_csv(fp, dtype=CSV_DTYPE)
    # The source file is overwritten with the cleaned frame, without the index.
    relabel_and_clean(df).to_csv(fp, index=False)


if __name__ == "__main__":
    for split in ["train", "test"]:
        process_split(split)