# Source: TEOChat/videollava/eval/datasets_into_geochat_format.py
# Uploaded by jirvin16
# Commit message: Initial commit
# Commit: 134cb11
import pandas as pd
import re
import json
def qfabric_semiconverted_to_geochat_dataset_format(json_file):
    """Load a semi-converted QFabric JSON file and clean its prompts.

    Every conversation turn's ``"value"`` string is rewritten in place:
    satellite-description prefixes are stripped, ``What is <identify>`` is
    turned into the ``[identify]`` token, ``<video>`` tokens are dropped, and
    the phrases ``this area`` / ``this region`` directly before a ``{<`` box
    are removed.

    Args:
        json_file: Path to a JSON file holding a list of entries, each with a
            ``"conversations"`` list of ``{"from": ..., "value": ...}`` dicts.

    Returns:
        The loaded list with the conversation values cleaned.
    """
    with open(json_file, encoding="utf-8") as f:
        data = json.load(f)

    # Sensor/description prefixes, removed in this exact order.
    prefix_patterns = (
        r"This is a satellite image :",
        r"This is a satellite image:",
        r"This is a high resolution, optical satellite image .*?:\s*",
        r"This is a high resolution, optical satellite image.*?:\s*",
        r"This is a satellite image from .*?:\s*",
        r"This is a satellite image from.*?:\s*",
    )

    def _identify_or_box(match):
        # The previous check looked for the *replacement* text
        # ('What is [identify]') inside the match, which can never be there,
        # so 'What is <identify>' was wrongly rewritten to '{<'.
        if match.group() == 'What is <identify>':
            return '[identify]'
        return '{<'

    for conversation_group in data:
        for item in conversation_group["conversations"]:
            text = item["value"]
            for pattern in prefix_patterns:
                text = re.sub(pattern, "", text)
            # Remove strings around <identify> that are redundant
            text = re.sub(r'What is <identify>|this area {<', _identify_or_box, text)
            # Drop the <video> token (it is removed, not replaced)
            text = re.sub(r'<video>', '', text)
            # Get rid of "this region" immediately before the bounding box
            text = re.sub(r'this region {<', '{<', text)
            item["value"] = text
    return data
def fmow_to_geochat_dataset_format(json_file):
    """Convert an fMoW sequence dataset into single-image entries.

    Each entry whose ``"video"`` list has more than one frame is replaced by
    one entry per frame carrying ``video`` (a one-element list), ``image``,
    ``linked_id`` (the source entry's ``id``) and
    ``img_idx_from_video_lst_id`` (the frame index). Entries with at most one
    frame are kept and, when a frame exists, get ``image`` set to it. Prompt
    text is cleaned of sequence/sensor prefixes and ``<video>`` tokens.

    The result lists the kept single-frame entries first, followed by the
    per-frame entries, both in input order.

    Args:
        json_file: Path to a JSON file holding a list of entries with
            ``"conversations"`` and (optionally) ``"video"`` keys.

    Returns:
        A new list of converted entries.

    Raises:
        KeyError: If a multi-frame entry has no ``"id"`` or an entry has no
            ``"conversations"``.
    """
    with open(json_file, encoding="utf-8") as f:
        data = json.load(f)

    # (pattern, replacement, flags), applied in this exact order.
    substitutions = (
        (r"This is a satellite image :", "", 0),
        (r"This is a satellite image:", "", 0),
        (r"This is a high resolution, optical satellite image .*?:\s*", "", 0),
        (r"This is a high resolution, optical satellite image.*?:\s*", "", 0),
        (r"This is a satellite image from .*?:\s*", "", 0),
        (r"This is a satellite image from.*?:\s*", "", 0),
        (r"This is a sequence of low-resolution, optical satellite images capturing the same location at different times: ", "", 0),
        (r"This is a sequence of high-resolution, optical satellite images capturing the same location at different times:", "", 0),
        (r"This is a sequence of satellite images capturing the same location at different times:", "", 0),
        (r"This is a sequence of satellite images from .*? the same location at different times:", "", 0),
        (r'This is a high resolution,? optical satellite image .*:\s*<image>\n', '\n', 0),
        (r'^This is a high[- ]resolution,? .*?image:\s*<image>\n', '\n', re.IGNORECASE | re.DOTALL),
        # Drop the <video> token (it is removed, not replaced)
        (r'<video>', '', 0),
        # Get rid of "this region" immediately before the bounding box
        (r'this region {<', '{<', 0),
        # Which class
        (r'Which of the following classes does this sequence of images belong to', 'Which of the following classes does this image belong to', 0),
        # Please answer using one of the following classes:
        (r'Please answer using only one of the following classes:', 'Please use one of the following classes:', 0),
    )

    single_frame_entries = []
    per_frame_entries = []
    for entry in data:
        # Clean each source turn once, before any copies are made.
        for item in entry["conversations"]:
            text = item["value"]
            for pattern, repl, flags in substitutions:
                text = re.sub(pattern, repl, text, flags=flags)
            item["value"] = text

        videos = entry.get("video", [])
        if len(videos) > 1:
            for idx, frame in enumerate(videos):
                new_entry = entry.copy()
                # Give every per-frame entry its own turn dicts so later edits
                # to one entry cannot leak into its siblings.
                new_entry["conversations"] = [dict(item) for item in entry["conversations"]]
                new_entry['video'] = [frame]
                new_entry['image'] = frame
                new_entry['linked_id'] = entry['id']
                new_entry['img_idx_from_video_lst_id'] = idx
                per_frame_entries.append(new_entry)
        else:
            # Previously the 'image' key was written to a throw-away copy (and
            # read from a possibly unbound variable), so it never reached the
            # output.
            if videos:
                entry['image'] = videos[0]
            single_frame_entries.append(entry)

    # Building a fresh list replaces the old pop-while-enumerating removal,
    # which skipped every entry that followed a removed one.
    return single_frame_entries + per_frame_entries
def xbd_to_geochat_dataset_format(json_file):
    """Convert an xBD two-image dataset into single-image entries.

    Exactly one output entry is produced per input entry. Entries whose
    ``"task"`` starts with ``"localization"`` use the first frame of
    ``"video"`` as ``"image"``; classification and all other (auxiliary)
    tasks use the second frame. Prompt text is stripped of two-image
    prefixes, ``<video> \\n`` is swapped for ``<image> \\n`` plus a task token
    (``[identify]`` for classification, ``[refer]`` for localization or tasks
    containing "identify"), multi-image wording is rewritten, and
    ``[a, b, c, d]`` boxes become ``{<a><b><c><d>|<0>}``.

    Args:
        json_file: Path to a JSON file holding a list of entries with
            ``"task"``, ``"video"`` and ``"conversations"`` keys.

    Returns:
        A new list of converted entries (shallow copies of the inputs, so the
        conversation dicts are the ones loaded from the file).
    """
    with open(json_file, encoding="utf-8") as f:
        data = json.load(f)

    new_data = []
    for entry in data:
        # Localization looks at the first image; classification and the
        # auxiliary tasks all look at the second image. This used to be
        # `if ... if ... else`, which sent localization entries through the
        # trailing `else` as well and emitted them twice.
        frame_index = 0 if entry["task"].startswith("localization") else 1
        new_entry = entry.copy()
        new_entry['image'] = entry['video'][frame_index]
        new_data.append(new_entry)

    # (pattern, replacement, flags), applied in this exact order.
    prefix_substitutions = (
        (r"This is a satellite image :", "", 0),
        (r"This is a satellite image:", "", 0),
        (r"This is a high resolution, optical satellite image .*?:\s*", "", 0),
        (r"This is a high resolution, optical satellite image.*?:\s*", "", 0),
        (r"This is a satellite image from .*?:\s*", "", 0),
        (r"These are two satellite images from .*? capturing the same location at different times: ", "", 0),
        (r"These are two low-resolution, optical satellite images capturing the same location at different times:", "", 0),
        (r"These are two high-resolution, optical satellite images capturing the same location at different times:", "", 0),
        (r"These are two high-resolution, optical satellite images from .*? capturing the same location at different times:", "", 0),
        (r"These are two satellite images capturing the same location at different times:", "", 0),
        (r"These are two satellite images from .*? capturing the same location at different times:", "", 0),
        (r'These are two high-resolution,? optical satellite images .*:\s*<image>\n', '<image>\n', 0),
        (r'These are two high resolution,? optical satellite images .*:\s*<image>\n', '<image>\n', 0),
        (r'^This is a high[- ]resolution,? .*?image:\s*<image>\n', '<image>\n', re.IGNORECASE | re.DOTALL),
    )

    # Literal rewrites of temporal/multi-image wording for auxiliary tasks.
    # Built once here rather than once per conversation turn.
    replacements = {
        'Are there any buildings in the first image which have been damaged in the second image? Answer with one word.': 'Are there any damaged buildings in the image? Answer with one word.',
        'Have any buildings in the first image been damaged in the second image? Answer with one word.': 'Have any buildings been damaged in the area? Answer with one word.',
        'What disaster has occurred between the first and second image?': 'What disaster has occurred here?',
        'Identify the buildings in the first image which were severely damaged or destroyed in the second image. Include a bounding box of the form [x_min, y_min, x_max, y_max] for each identified building in your response. If there are no such buildings, do not output a bounding box.': 'Identify the severely damaged or destroyed buildings in the image. Include a bounding box of the form [x_min, y_min, x_max, y_max] for each identified building in your response. If there are no such buildings, do not output a bounding box.',
    }

    for conversation_group in new_data:
        task = conversation_group["task"]
        # Localization tasks get a [refer] token
        localization = task.startswith("localization") or "identify" in task.lower()
        # Classification tasks get an [identify] token
        classification = task.startswith("classification")

        for item in conversation_group["conversations"]:
            text = item["value"]
            for pattern, repl, flags in prefix_substitutions:
                text = re.sub(pattern, repl, text, flags=flags)

            # Switch out <video> for <image>, adding the task token if any
            if classification:
                text = re.sub(r'<video> \n', '<image> \n [identify] ', text)
                text = re.sub(r' in the second image.', '.', text)
            elif localization:
                text = re.sub(r'<video> \n', '<image> \n [refer] ', text)
                text = re.sub(r'Image 1', 'the image', text)
            else:
                text = re.sub(r'<video> \n', '<image> \n ', text)

            # Plain substring replacement; the keys are literals, not regexes.
            for old, new in replacements.items():
                text = text.replace(old, new)

            # Get rid of "this region" immediately before the bounding box
            text = re.sub(r'this region {<', '{<', text)
            # Which class
            text = re.sub(r'Which of the following classes does this sequence of images belong to', 'Which of the following classes does this image belong to', text)
            # Please answer using one of the following classes:
            text = re.sub(r'Please answer using only one of the following classes:', 'Please use one of the following classes:', text)
            # Replace bounding box format [79, 27, 85, 81] with {<79><27><85><81>|<0>}
            text = re.sub(r'\[(\d+), (\d+), (\d+), (\d+)\]', r'{<\1><\2><\3><\4>|<0>}', text)
            # Replace bounding box format [x_min, y_min, x_max, y_max] with {<x_min><y_min><x_max><y_max>|<0>}
            text = re.sub(r'\[(x_min), (y_min), (x_max), (y_max)\]', r'{<\1><\2><\3><\4>|<0>}', text)
            item["value"] = text
    return new_data
def s2looking_to_geochat_dataset_format(json_file):
    """Convert an S2Looking dataset into per-image building-referral entries.

    Every input element yields two entries (frame index 0 and 1). Each new
    entry gets the id ``"<id>_<index>"``, the matching ``metadata`` and
    ``video`` element (stored as ``image``), the frame index as
    ``geovlm_id``, the source conversation under ``original_conversation``,
    and a fresh two-turn conversation asking to identify all buildings with
    an empty answer.

    Args:
        json_file: Path to a JSON file holding a list of elements with
            ``id``, ``metadata``, ``original_input_polygon``, ``task``,
            ``video`` and ``conversations`` keys.

    Returns:
        The list of generated entries.
    """
    with open(json_file, encoding="utf-8") as f:
        data = json.load(f)

    question = "<image>\n [refer] Identify all buildings in the image."

    new_dataset = []
    for elem in data:
        for i in range(2):
            new_dataset.append({
                'id': elem['id'] + '_' + str(i),
                'metadata': elem['metadata'][i],
                'original_input_polygon': elem['original_input_polygon'],
                'task': elem['task'],
                'image': elem['video'][i],
                'geovlm_id': i,
                'original_conversation': elem['conversations'],
                'conversations': [
                    {"from": "human", "value": question},
                    {"from": "gpt", "value": ""},
                ],
            })

    # Description prefixes, removed in this exact order.
    prefix_patterns = (
        r"This is a sequence of high-resolution, optical satellite images from Maxar's GeoEye-1, QuickBird-2, WorldView-2, or WorldView-3 capturing the same location at different times:",
        r"This is a sequence of low-resolution, optical satellite images from Sentinel-2 capturing the same location at different times:",
        r"This is a satellite image :",
        r"This is a satellite image:",
        r"This is a high resolution, optical satellite image .*?:\s*",
        r"This is a high resolution, optical satellite image.*?:\s*",
        r"This is a satellite image from .*?:\s*",
        r"This is a satellite image from.*?:\s*",
        r'^This is a sequence of.*times:$',
        r"This is a sequence of high-resolution, optical satellite images from .*? capturing the same location at different times:",
        r"This is a sequence of low-resolution, optical satellite images capturing the same location at different times: ",
        r"This is a sequence of high-resolution, optical satellite images capturing the same location at different times:",
        r"This is a sequence of satellite images capturing the same location at different times:",
        r"This is a sequence of satellite images from .*? the same location at different times:",
        r"These are two high-resolution, optical satellite images capturing the same location at different times:",
        r"This is a sequence of images from the satellites GaoFen, SuperView and BeiJing-2, capturing the same location at different times:",
    )

    for conversation_group in new_dataset:
        for item in conversation_group["conversations"]:
            text = item["value"]
            # If the text opens with "This is"/"These are" and the first colon
            # is directly followed by "<image>", keep only what follows it.
            if text.startswith(("This is", "These are")) and "<image>" in text:
                colon_index = text.find(":")
                if colon_index != -1 and text[colon_index + 1:].strip().startswith("<image>"):
                    text = text[colon_index + 1:].strip()
            for pattern in prefix_patterns:
                text = re.sub(pattern, "", text)
            # Drop the <video> token (it is removed, not replaced)
            text = re.sub(r'<video>', '', text)
            # Get rid of "this region" immediately before the bounding box
            text = re.sub(r'this region {<', '{<', text)
            # Which class
            text = re.sub(r'Which of the following classes does this sequence of images belong to', 'Which of the following classes does this image belong to', text)
            # Please answer using one of the following classes:
            text = re.sub(r'Please answer using only one of the following classes:', 'Please use one of the following classes:', text)
            # Replace bounding box format [79, 27, 85, 81] with {<79><27><85><81>|<0>}
            text = re.sub(r'\[(\d+), (\d+), (\d+), (\d+)\]', r'{<\1><\2><\3><\4>|<0>}', text)
            # Replace bounding box format [x_min, y_min, x_max, y_max] with {<x_min><y_min><x_max><y_max>|<0>}
            text = re.sub(r'\[(x_min), (y_min), (x_max), (y_max)\]', r'{<\1><\2><\3><\4>|<0>}', text)
            item["value"] = text

    # Drop anything still carrying more than one video frame. Filtering into a
    # new list avoids popping from a list while enumerating it, which skips
    # the element after each removal.
    return [entry for entry in new_dataset if len(entry.get("video", [])) <= 1]
def check_file(file_path):
    """Print a warning for every suspicious turn in a converted dataset.

    Two independent checks run on each conversation turn of the JSON file at
    ``file_path``: a turn without ``<image>`` whose ``"from"`` is not
    ``'gpt'`` is reported as missing the token, and a turn in which any
    ``.``-separated piece starts with ``This is`` or ``These are`` is
    reported as still carrying a description prefix. Nothing is returned.
    """
    with open(file_path, 'r') as handle:
        groups = json.load(handle)

    flagged_openers = ('This is', 'These are')
    for group in groups:
        for turn in group["conversations"]:
            text = turn["value"]
            if '<image>' not in text and turn["from"] != 'gpt':
                print(f"Missing <image> in: {turn}")
            # Report each turn at most once, on the first offending piece.
            for piece in text.split('.'):
                if piece.strip().startswith(flagged_openers):
                    print(f"Starts with 'This is' or 'These are' in: {turn}")
                    break
if __name__ == "__main__":
    # Script entry point: convert the two fMoW validation sets, write the
    # results next to the inputs, then run check_file on both outputs.

    # Paths to datasets
    fmow_0 = "/scr/geovlm/fmow_low_res_val.json"
    fmow_1 = "/scr/geovlm/fmow_high_res_val.json"
    # The QFabric and xBD paths below are not used by this run; they are kept
    # for reference.
    qfabric_0 = '/scr/geovlm/QFabric/test_geochat_seqlen_5_256.json'
    qfabric_1 = '/scr/geovlm/QFabric/test_geochat_seqlen_2_256.json'
    xbd_0 = '/scr/geovlm/xbd_test_auxiliary_multi_image.json'
    xbd_1 = '/scr/geovlm/xbd_test_canon_classification.json'
    xbd_2 = '/scr/geovlm/xbd_test_canon_localization.json'

    print("Running conversion on all datasets, storing updated datasets in variables")
    from tqdm import tqdm

    dataset_formats = [
        (fmow_to_geochat_dataset_format, fmow_0),
        (fmow_to_geochat_dataset_format, fmow_1),
    ]

    # Convert every listed dataset. A leftover filter on
    # "xbd_test_auxiliary" used to skip both fMoW paths, leaving this list
    # empty and making the two-name unpacking below raise ValueError.
    formatted_datasets = [
        format_func(dataset)
        for format_func, dataset in tqdm(dataset_formats, desc="Converting datasets")
    ]
    fmow_0_formatted, fmow_1_formatted = formatted_datasets

    # Write the formatted data for fmow_0 into a JSON file named geochat_fmow_RECENT_format_low_res.json
    with open('/scr/geovlm/geochat_fmow_RECENT_format_low_res.json', 'w') as file:
        json.dump(fmow_0_formatted, file)
    # Write the formatted data for fmow_1 into a JSON file named geochat_fmow_RECENT_format_high_res.json
    with open('/scr/geovlm/geochat_fmow_RECENT_format_high_res.json', 'w') as file:
        json.dump(fmow_1_formatted, file)

    check_file('/scr/geovlm/geochat_fmow_RECENT_format_low_res.json')
    check_file('/scr/geovlm/geochat_fmow_RECENT_format_high_res.json')