import pandas as pd import re import json def qfabric_semiconverted_to_geochat_dataset_format(json_file): with open(json_file) as f: data = json.load(f) for conversation_group in data: for item in conversation_group["conversations"]: # Remove satellite specifications item["value"] = re.sub(r"This is a satellite image :", "", item["value"]) item["value"] = re.sub(r"This is a satellite image:", "", item["value"]) item["value"] = re.sub(r"This is a high resolution, optical satellite image .*?:\s*", "", item["value"]) item["value"] = re.sub(r"This is a high resolution, optical satellite image.*?:\s*", "", item["value"]) item["value"] = re.sub(r"This is a satellite image from .*?:\s*", "", item["value"]) item["value"] = re.sub(r"This is a satellite image from.*?:\s*", "", item["value"]) # Remove strings around that are redundant item["value"] = re.sub(r'What is |this area {<', lambda x: '[identify]' if 'What is [identify]' in x.group() else '{<', item["value"]) # Switch out