import json import hashlib import io import os import base64 from PIL import Image from tqdm import tqdm def calculate_md5(image): md5_hash = hashlib.md5() with io.BytesIO() as output: image.save(output, format='JPEG') image_data = output.getvalue() md5_hash.update(image_data) return md5_hash.hexdigest() def process_files(directory): tsv_data = [] for file in tqdm(os.listdir(directory)): if file.endswith('.json'): json_path = os.path.join(directory, file) jpg_path = os.path.join(directory, file.replace('.json', '.jpg')) with open(json_path, 'r') as json_file: data = json.load(json_file) image = Image.open(jpg_path) md5 = calculate_md5(image) caption = data['caption'] width = data['width'] height = data['height'] with io.BytesIO() as buffer: image.save(buffer, format='JPEG') image_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8") combined_data_str = {'phrase': data['noun_chunks'], 'expression_v1': data['ref_exps']} tsv_row = [md5, caption, image_base64, width, height, combined_data_str] tsv_data.append('\t'.join(map(str, tsv_row))) return tsv_data def write_tsv(tsv_data, output_file): with open(output_file, 'w') as file: file.write('\n'.join(tsv_data)) if __name__ == '__main__': directory = '/tmp/grit' output_file = '/tmp/output.tsv' tsv_data = process_files(directory) write_tsv(tsv_data, output_file)