# File size: 19,378 Bytes
# 134cb11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
import pandas as pd 
import re
import json

def qfabric_semiconverted_to_geochat_dataset_format(json_file):
    """Load a semi-converted QFabric JSON file and clean up its prompt text.

    Every ``item["value"]`` under each entry's ``"conversations"`` list is
    rewritten in place:

    * sensor/preamble sentences ("This is a satellite image ...:") are removed;
    * ``What is <identify>`` becomes the ``[identify]`` task token;
    * the words ``this area`` / ``this region`` directly before a ``{<``
      bounding box are dropped;
    * ``<video>`` placeholders are deleted.

    Args:
        json_file: Path to a JSON file holding a list of entries, each with a
            ``"conversations"`` list of dicts that have a ``"value"`` string.

    Returns:
        The loaded list, with the conversation values rewritten.
    """
    # Applied in this order; every match is deleted.
    preamble_patterns = (
        r"This is a satellite image :",
        r"This is a satellite image:",
        r"This is a high resolution, optical satellite image .*?:\s*",
        r"This is a high resolution, optical satellite image.*?:\s*",
        r"This is a satellite image from .*?:\s*",
        r"This is a satellite image from.*?:\s*",
    )

    def _rewrite_identify(match):
        # The previous check looked for 'What is [identify]' inside the match,
        # which can never occur (the pattern matches 'What is <identify>'), so
        # the [identify] token was never produced.
        if match.group() == 'What is <identify>':
            return '[identify]'
        return '{<'

    with open(json_file) as f:
        data = json.load(f)

    for conversation_group in data:
        for item in conversation_group["conversations"]:
            text = item["value"]
            # Remove satellite specifications
            for pattern in preamble_patterns:
                text = re.sub(pattern, "", text)
            # Turn the <identify> question stem into the [identify] token and
            # drop "this area" in front of a bounding box
            text = re.sub(r'What is <identify>|this area {<', _rewrite_identify, text)
            # Delete the <video> placeholder (nothing is inserted in its place)
            text = re.sub(r'<video>', '', text)
            # Get rid of "this region" immediately before the bounding box
            text = re.sub(r'this region {<', '{<', text)
            item["value"] = text
    return data

def fmow_to_geochat_dataset_format(json_file):
    """Convert an fMoW multi-image JSON file into single-image entries.

    Each entry whose ``"video"`` list has more than one path is replaced by one
    entry per path. Every such entry carries ``video`` (a one-element list),
    ``image``, ``linked_id`` (the source entry's ``id``) and
    ``img_idx_from_video_lst_id`` (position in the source list). Entries with
    exactly one path are kept and get ``image`` set to that path; entries with
    no ``"video"`` are kept unchanged. Prompt text in every ``"conversations"``
    item is then cleaned of multi-image wording.

    Fixes over the previous version:

    * no NameError / stale value from ``original_videos`` when an entry has a
      single image;
    * single-image entries actually receive their ``image`` key;
    * the list is no longer appended to or popped from while being iterated,
      which used to leave some multi-image entries in the result;
    * expanded entries get their own conversation dicts instead of sharing
      the source entry's.

    Args:
        json_file: Path to a JSON file holding a list of entry dicts.

    Returns:
        A list with the kept entries first (original order), followed by the
        per-image entries (source order, then image order).
    """
    with open(json_file) as f:
        data = json.load(f)

    kept_entries = []
    expanded_entries = []
    for entry in data:
        videos = entry.get("video", [])
        if len(videos) > 1:
            for idx, frame in enumerate(videos):
                new_entry = entry.copy()
                # entry.copy() is shallow: give each expansion its own turns
                # so the in-place text cleanup below cannot leak between them.
                new_entry['conversations'] = [dict(turn) for turn in entry["conversations"]]
                new_entry['video'] = [frame]
                new_entry['image'] = frame
                new_entry['linked_id'] = entry['id']
                new_entry['img_idx_from_video_lst_id'] = idx
                expanded_entries.append(new_entry)
        else:
            if len(videos) == 1:
                entry['image'] = videos[0]
            kept_entries.append(entry)
    data = kept_entries + expanded_entries

    # Applied in this order; every match is deleted.
    preamble_patterns = (
        r"This is a satellite image :",
        r"This is a satellite image:",
        r"This is a high resolution, optical satellite image .*?:\s*",
        r"This is a high resolution, optical satellite image.*?:\s*",
        r"This is a satellite image from .*?:\s*",
        r"This is a satellite image from.*?:\s*",
        r"This is a sequence of low-resolution, optical satellite images capturing the same location at different times: ",
        r"This is a sequence of high-resolution, optical satellite images capturing the same location at different times:",
        r"This is a sequence of satellite images capturing the same location at different times:",
        r"This is a sequence of satellite images from .*? the same location at different times:",
    )

    for conversation_group in data:
        for item in conversation_group["conversations"]:
            text = item["value"]
            for pattern in preamble_patterns:
                text = re.sub(pattern, "", text)
            # These two collapse a remaining preamble plus its <image> tag to a newline.
            text = re.sub(r'This is a high resolution,? optical satellite image .*:\s*<image>\n', '\n', text)
            text = re.sub(r'^This is a high[- ]resolution,? .*?image:\s*<image>\n', '\n', text, flags=re.IGNORECASE | re.DOTALL)

            # Delete the <video> placeholder (nothing is inserted in its place)
            text = re.sub(r'<video>', '', text)
            # Get rid of "this region" immediately before the bounding box
            text = re.sub(r'this region {<', '{<', text)
            # Which class
            text = re.sub(r'Which of the following classes does this sequence of images belong to', 'Which of the following classes does this image belong to', text)
            # Please answer using one of the following classes:
            text = re.sub(r'Please answer using only one of the following classes:', 'Please use one of the following classes:', text)
            item["value"] = text
    return data

def xbd_to_geochat_dataset_format(json_file):
    """Convert an xBD two-image JSON file into single-image entries.

    Each input entry yields exactly one output entry (a shallow copy) with an
    added ``image`` key: ``video[0]`` when ``task`` starts with
    ``"localization"``, otherwise ``video[1]`` (classification and auxiliary
    tasks). The previous ``if`` / ``if`` / ``else`` chain let localization
    entries fall into the trailing ``else`` as well, so they were emitted
    twice, the second time with ``video[1]``.

    Prompt text is then rewritten in place: two-image preambles are removed,
    ``<video> \\n`` becomes ``<image> \\n`` followed by ``[identify]``
    (classification) or ``[refer]`` (localization, or any task whose name
    contains "identify"), temporal wording is replaced with single-image
    wording, and ``[a, b, c, d]`` boxes become ``{<a><b><c><d>|<0>}``.

    Args:
        json_file: Path to a JSON file holding a list of entries, each with
            ``"task"``, ``"video"`` (at least two paths) and
            ``"conversations"``.

    Returns:
        A new list of converted entries, in input order.
    """
    with open(json_file) as f:
        data = json.load(f)

    new_data = []
    for entry in data:
        new_entry = entry.copy()
        if entry["task"].startswith("localization"):
            new_entry['image'] = entry['video'][0]
        else:
            # Classification and auxiliary tasks all look at the second image
            new_entry['image'] = entry['video'][1]
        new_data.append(new_entry)

    # Applied in this order; every match is deleted.
    preamble_patterns = (
        r"This is a satellite image :",
        r"This is a satellite image:",
        r"This is a high resolution, optical satellite image .*?:\s*",
        r"This is a high resolution, optical satellite image.*?:\s*",
        r"This is a satellite image from .*?:\s*",
        r"These are two satellite images from .*? capturing the same location at different times: ",
        r"These are two low-resolution, optical satellite images capturing the same location at different times:",
        r"These are two high-resolution, optical satellite images capturing the same location at different times:",
        r"These are two high-resolution, optical satellite images from .*? capturing the same location at different times:",
        r"These are two satellite images capturing the same location at different times:",
        r"These are two satellite images from .*? capturing the same location at different times:",
    )

    # Replace temporal/multi-image wording for auxiliary tasks (literal text).
    replacements = {
        'Are there any buildings in the first image which have been damaged in the second image? Answer with one word.': 'Are there any damaged buildings in the image? Answer with one word.',
        'Have any buildings in the first image been damaged in the second image? Answer with one word.': 'Have any buildings been damaged in the area? Answer with one word.',
        'What disaster has occurred between the first and second image?': 'What disaster has occurred here?',
        'Identify the buildings in the first image which were severely damaged or destroyed in the second image. Include a bounding box of the form [x_min, y_min, x_max, y_max] for each identified building in your response. If there are no such buildings, do not output a bounding box.': 'Identify the severely damaged or destroyed buildings in the image. Include a bounding box of the form [x_min, y_min, x_max, y_max] for each identified building in your response. If there are no such buildings, do not output a bounding box.'
    }

    for conversation_group in new_data:
        task = conversation_group["task"]
        # Localization-style tasks get a [refer] token
        localization = task.startswith("localization") or "identify" in task.lower()
        # Classification tasks get an [identify] token (and take precedence)
        classification = task.startswith("classification")

        for item in conversation_group["conversations"]:
            text = item["value"]
            for pattern in preamble_patterns:
                text = re.sub(pattern, "", text)
            text = re.sub(r'These are two high-resolution,? optical satellite images .*:\s*<image>\n', '<image>\n', text)
            text = re.sub(r'These are two high resolution,? optical satellite images .*:\s*<image>\n', '<image>\n', text)
            text = re.sub(r'^This is a high[- ]resolution,? .*?image:\s*<image>\n', '<image>\n', text, flags=re.IGNORECASE | re.DOTALL)

            # Switch out <video> for <image>, adding the task token
            if classification:
                text = re.sub(r'<video> \n', '<image> \n [identify] ', text)
                # NOTE(review): the trailing '.' is an unescaped regex wildcard,
                # so e.g. " in the second image?" also collapses to "." -
                # confirm whether a literal full stop was intended.
                text = re.sub(r' in the second image.', '.', text)
            elif localization:
                text = re.sub(r'<video> \n', '<image> \n [refer] ', text)
                text = re.sub(r'Image 1', 'the image', text)
            else:
                text = re.sub(r'<video> \n', '<image> \n ', text)

            for old, new in replacements.items():
                text = text.replace(old, new)

            # Get rid of "this region" immediately before the bounding box
            text = re.sub(r'this region {<', '{<', text)
            # Which class
            text = re.sub(r'Which of the following classes does this sequence of images belong to', 'Which of the following classes does this image belong to', text)
            # Please answer using one of the following classes:
            text = re.sub(r'Please answer using only one of the following classes:', 'Please use one of the following classes:', text)
            # Replace bounding box format [79, 27, 85, 81] with {<79><27><85><81>|<0>}
            text = re.sub(r'\[(\d+), (\d+), (\d+), (\d+)\]', r'{<\1><\2><\3><\4>|<0>}', text)
            # Replace bounding box format [x_min, y_min, x_max, y_max] with {<x_min><y_min><x_max><y_max>|<0>}
            text = re.sub(r'\[(x_min), (y_min), (x_max), (y_max)\]', r'{<\1><\2><\3><\4>|<0>}', text)
            item["value"] = text
    return new_data

def s2looking_to_geochat_dataset_format(json_file):
    """Split each S2Looking image pair into two single-image [refer] entries.

    For every input entry, two output entries are built (index 0 and 1 of the
    entry's ``"video"`` and ``"metadata"`` lists). Each one gets a fixed
    "identify all buildings" human prompt and an empty ``gpt`` answer; the
    source entry's conversation is kept untouched under
    ``original_conversation``.

    The previous version followed this with a long chain of regex clean-ups, a
    bounding-box rewrite and a pop-while-iterating loop keyed on ``"video"``.
    None of it could change anything: the only conversation text present is
    the constant prompt below (which matches none of those patterns) and the
    new entries have no ``"video"`` key. That dead code has been removed; the
    returned data is unchanged.

    Args:
        json_file: Path to a JSON file holding a list of entries with ``id``
            (string), ``metadata``, ``original_input_polygon``, ``task``,
            ``video`` and ``conversations``.

    Returns:
        A new list with two entries per input entry, in input order.
    """
    with open(json_file) as f:
        data = json.load(f)

    question = "<image>\n [refer] Identify all buildings in the image."

    new_dataset = []
    for elem in data:
        for i in range(2):
            new_dataset.append({
                'id': elem['id'] + '_' + str(i),
                'metadata': elem['metadata'][i],
                'original_input_polygon': elem['original_input_polygon'],
                'task': elem['task'],
                'image': elem['video'][i],
                'geovlm_id': i,
                'original_conversation': elem['conversations'],
                # Fresh dicts per entry so later edits never alias each other.
                'conversations': [
                    {"from": "human", "value": question},
                    {"from": "gpt", "value": ""},
                ],
            })
    return new_dataset

def check_file(file_path):
    """Print warnings about suspicious prompts in a converted dataset file.

    Two checks run on every conversation item of every entry:

    * a non-``gpt`` item whose text lacks ``<image>`` is reported;
    * an item with any ``.``-separated sentence that begins with "This is" or
      "These are" is reported once.

    Nothing is returned; findings are written to stdout.
    """
    with open(file_path, 'r') as handle:
        records = json.load(handle)

    leftover_openers = ('This is', 'These are')
    for record in records:
        for turn in record["conversations"]:
            text = turn["value"]
            if '<image>' not in text and turn["from"] != 'gpt':
                print(f"Missing <image> in: {turn}")
            for sentence in text.split('.'):
                if sentence.strip().startswith(leftover_openers):
                    print(f"Starts with 'This is' or 'These are' in: {turn}")
                    # One report per item is enough.
                    break
if __name__ == "__main__":
    # Script entry point: convert the two fMoW validation files, write the
    # results next to the inputs, then run check_file on each output.

    # Paths to datasets
    fmow_0 = "/scr/geovlm/fmow_low_res_val.json"
    fmow_1 = "/scr/geovlm/fmow_high_res_val.json"

    # Not converted by this run; kept for reference.
    qfabric_0 = '/scr/geovlm/QFabric/test_geochat_seqlen_5_256.json'
    qfabric_1 = '/scr/geovlm/QFabric/test_geochat_seqlen_2_256.json'

    xbd_0 = '/scr/geovlm/xbd_test_auxiliary_multi_image.json'
    xbd_1 = '/scr/geovlm/xbd_test_canon_classification.json'
    xbd_2 = '/scr/geovlm/xbd_test_canon_localization.json'

    print("Running conversion on all datasets, storing updated datasets in variables")

    from tqdm import tqdm

    dataset_formats = [
        (fmow_to_geochat_dataset_format, fmow_0),
        (fmow_to_geochat_dataset_format, fmow_1),
    ]
    # Every listed dataset is converted. A leftover "xbd_test_auxiliary"
    # filter used to skip both fMoW paths, leaving this list empty and making
    # the two-name unpacking below raise ValueError.
    formatted_datasets = [
        format_func(dataset)
        for format_func, dataset in tqdm(dataset_formats, desc="Converting datasets")
    ]

    fmow_0_formatted, fmow_1_formatted = formatted_datasets

    # Write the formatted data for fmow_0 into geochat_fmow_RECENT_format_low_res.json
    with open('/scr/geovlm/geochat_fmow_RECENT_format_low_res.json', 'w') as file:
        json.dump(fmow_0_formatted, file)

    # Write the formatted data for fmow_1 into geochat_fmow_RECENT_format_high_res.json
    with open('/scr/geovlm/geochat_fmow_RECENT_format_high_res.json', 'w') as file:
        json.dump(fmow_1_formatted, file)

    check_file('/scr/geovlm/geochat_fmow_RECENT_format_low_res.json')
    check_file('/scr/geovlm/geochat_fmow_RECENT_format_high_res.json')