File size: 4,121 Bytes
94f372a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import json
import joblib
import pandas as pd
import numpy as np
import reverse_geocoder
from os.path import join, dirname


class QuadTree(object):
    """Recursive quad tree over the ``latitude``/``longitude`` columns of a DataFrame.

    Each node covers the axis-aligned box ``[mins, maxs]`` (latitude first,
    longitude second) and keeps a reference to the rows that fall inside it.
    A node is split into up to four children while it still has depth budget
    and holds at least ``do_split`` rows.

    Attributes:
        id: String path of the node; each level appends "0".."3".
        data: The DataFrame rows assigned to this node.
        mins, maxs: Lower / upper corner of the node as numpy arrays.
        sizes: ``maxs - mins``.
        children: Child ``QuadTree`` nodes (empty for a leaf).
    """

    def __init__(
        self, data, mins=None, maxs=None, id="", depth=3, min_split=0, do_split=1000
    ):
        """Build the node and, recursively, its children.

        Args:
            data: DataFrame with ``latitude`` and ``longitude`` columns.
            mins: Lower (lat, lon) corner; defaults to the column-wise minimum.
            maxs: Upper (lat, lon) corner; defaults to the column-wise maximum.
            id: Identifier prefix for this node (name kept for compatibility
                even though it shadows the builtin).
            depth: Remaining number of levels that may still be split.
            min_split: A quadrant only becomes a child when it holds strictly
                more than this many rows; rows of smaller quadrants are not
                part of any child.
            do_split: Minimum number of rows a node needs to be split at all.
        """
        self.id = id
        self.data = data

        coords = ["latitude", "longitude"]
        if mins is None:
            mins = data[coords].to_numpy().min(0)
        if maxs is None:
            maxs = data[coords].to_numpy().max(0)

        self.mins = np.asarray(mins)
        self.maxs = np.asarray(maxs)
        self.sizes = self.maxs - self.mins

        self.children = []

        lat_min, lon_min = self.mins
        lat_max, lon_max = self.maxs
        lat_mid, lon_mid = 0.5 * (self.mins + self.maxs)

        if depth <= 0 or len(self.data) < do_split:
            return

        # Explicit `<` / `>=` masks (rather than negation) so that rows with
        # NaN coordinates match no quadrant, exactly as comparisons dictate.
        lat_low = data["latitude"] < lat_mid
        lat_high = data["latitude"] >= lat_mid
        lon_low = data["longitude"] < lon_mid
        lon_high = data["longitude"] >= lon_mid

        quadrants = (
            ("0", lat_low & lon_low, [lat_min, lon_min], [lat_mid, lon_mid]),
            ("1", lat_low & lon_high, [lat_min, lon_mid], [lat_mid, lon_max]),
            ("2", lat_high & lon_low, [lat_mid, lon_min], [lat_max, lon_mid]),
            ("3", lat_high & lon_high, [lat_mid, lon_mid], [lat_max, lon_max]),
        )

        # Recurse only into quadrants that hold enough rows.
        for suffix, mask, lower, upper in quadrants:
            subset = data[mask]
            if subset.shape[0] > min_split:
                self.children.append(
                    QuadTree(
                        subset,
                        lower,
                        upper,
                        id + suffix,
                        depth - 1,
                        # Forward the thresholds: without this every level
                        # below the root would fall back to the defaults.
                        min_split,
                        do_split,
                    )
                )

    def unwrap(self):
        """Return the leaves as ``{leaf_id: [mins, maxs, copy_of_leaf_data]}``.

        Inner nodes contribute nothing themselves; their result is the union
        of their children's leaves.
        """
        if not self.children:
            return {self.id: [self.mins, self.maxs, self.data.copy()]}

        leaves = dict()
        for child in self.children:
            leaves.update(child.unwrap())
        return leaves


def extract(qt):
    """Flatten a quad tree into cell boundaries plus one labelled DataFrame.

    Args:
        qt: Object whose ``unwrap()`` returns ``{cell_id: [mins, maxs, frame]}``
            where ``mins``/``maxs`` are (lat, lon) pairs.

    Returns:
        A ``(boundaries, data)`` pair. ``boundaries`` maps each cell id to
        ``(min_lat, min_lon, max_lat, max_lon)`` as plain floats; ``data`` is
        the concatenation of all cell frames, each tagged with its cell id in
        a ``category`` column.
    """
    frames = []
    boundaries = {}
    for cell_id, (lower, upper, frame) in qt.unwrap().items():
        lat_lo, lon_lo = lower
        lat_hi, lon_hi = upper

        # Label the rows with the cell they belong to.
        frame["category"] = cell_id
        frames.append(frame)

        # Plain floats keep the boundaries JSON-serialisable.
        boundaries[cell_id] = tuple(map(float, (lat_lo, lon_lo, lat_hi, lon_hi)))

    return boundaries, pd.concat(frames)


if __name__ == "__main__":
    # Annotate the train/test CSVs under <parent-of-this-dir>/datasets/osv5m
    # with a quadtree cell id ("category"), nearest city and country code,
    # then overwrite them in place and write the cell bounds to borders.json.
    data_path = join(dirname(dirname(__file__)), "datasets", "osv5m")
    train_fp = join(data_path, "train.csv")
    test_fp = join(data_path, "test.csv")

    # Merge both splits into one DataFrame so they share the same quadtree;
    # the "split" column remembers where each row came from.
    df_train = pd.read_csv(train_fp)
    df_train["split"] = "train"

    df_test = pd.read_csv(test_fp)
    df_test["split"] = "test"

    df = pd.concat([df_train, df_test])
    size_before = df.shape[0]
    qt = QuadTree(df, depth=15)
    boundaries, df = extract(qt)
    # Explicit check instead of `assert`: it must also run under `python -O`,
    # because the input CSVs are overwritten further down.
    if df.shape[0] != size_before:
        raise RuntimeError(
            f"quadtree lost rows: {size_before} before, {df.shape[0]} after"
        )

    # One batched reverse-geocoding call for all rows.
    location = reverse_geocoder.search(list(zip(df["latitude"], df["longitude"])))
    df["city"] = [place.get("name", "") for place in location]
    df["country"] = [place.get("cc", "") for place in location]
    del location  # free the (large) result list before writing

    df_train = df[df["split"] == "train"].drop(["split"], axis=1)
    df_test = df[df["split"] == "test"].drop(["split"], axis=1)
    if (df_train.shape[0] + df_test.shape[0]) != size_before:
        raise RuntimeError(
            "train/test re-split does not add up to the original row count"
        )

    # Context manager so the JSON file is flushed and closed deterministically.
    with open(join(data_path, "borders.json"), "w") as fh:
        json.dump(boundaries, fh)
    df_train.to_csv(train_fp, index=False)
    df_test.to_csv(test_fp, index=False)