# Tzktz's picture
# Upload 7664 files
# 6fc683c verified
# raw
# history blame
# 1.73 kB
import json
import hashlib
import io
import os
import base64
from PIL import Image
from tqdm import tqdm
def calculate_md5(image):
    """Return the hexadecimal MD5 digest of ``image`` serialized as JPEG.

    The image is written through its ``save`` method (``format='JPEG'``)
    into an in-memory buffer, and the digest is computed over those bytes
    rather than over any file on disk.

    Args:
        image: Object exposing ``save(fp, format=...)``; the script passes
            the result of ``PIL.Image.open``.

    Returns:
        The MD5 digest as a lowercase hex string.
    """
    with io.BytesIO() as jpeg_buffer:
        image.save(jpeg_buffer, format='JPEG')
        return hashlib.md5(jpeg_buffer.getvalue()).hexdigest()
def process_files(directory):
    """Build one TSV row for every ``*.json`` file in ``directory``.

    Each JSON file is paired with the image that has the same base name and
    a ``.jpg`` extension. A row holds six tab-separated columns:

        md5, caption, base64(JPEG bytes), width, height, str(annotations)

    where ``annotations`` is
    ``{'phrase': data['noun_chunks'], 'expression_v1': data['ref_exps']}``.
    ``width`` and ``height`` are copied from the JSON metadata; they are not
    read back from the image.

    Args:
        directory: Path of the folder containing the JSON/JPEG pairs.

    Returns:
        A list of row strings (no trailing newline), ordered by file name.

    Raises:
        FileNotFoundError: If a JSON file has no matching ``.jpg``.
        KeyError: If a JSON file lacks one of the keys ``caption``,
            ``width``, ``height``, ``noun_chunks`` or ``ref_exps``.
    """
    # A raw tab or line break inside the caption would split the row into
    # extra columns/lines. Each such character is swapped for one space, so
    # the caption keeps its length.
    caption_cleanup = str.maketrans({'\t': ' ', '\n': ' ', '\r': ' '})

    tsv_data = []
    # os.listdir() returns names in arbitrary order; sorting makes the
    # output reproducible between runs and machines.
    for file in tqdm(sorted(os.listdir(directory))):
        if not file.endswith('.json'):
            continue

        json_path = os.path.join(directory, file)
        # Swap only the extension; str.replace() would also rewrite any
        # '.json' occurring earlier in the file name.
        jpg_path = os.path.splitext(json_path)[0] + '.jpg'

        # JSON text is UTF-8; do not depend on the platform default.
        with open(json_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        # Encode the image once and reuse the bytes for both the digest and
        # the base64 payload. The digest is taken over the same JPEG
        # serialization that calculate_md5() hashes. The context manager
        # releases the underlying file handle.
        with Image.open(jpg_path) as image, io.BytesIO() as buffer:
            image.save(buffer, format='JPEG')
            jpeg_bytes = buffer.getvalue()

        md5 = hashlib.md5(jpeg_bytes).hexdigest()
        image_base64 = base64.b64encode(jpeg_bytes).decode('utf-8')
        caption = str(data['caption']).translate(caption_cleanup)

        # NOTE: this column is written with str(), i.e. as a Python dict
        # repr rather than JSON. Kept as-is so existing readers of the TSV
        # continue to work.
        combined_data_str = {
            'phrase': data['noun_chunks'],
            'expression_v1': data['ref_exps'],
        }

        tsv_row = [
            md5,
            caption,
            image_base64,
            data['width'],
            data['height'],
            combined_data_str,
        ]
        tsv_data.append('\t'.join(map(str, tsv_row)))
    return tsv_data
def write_tsv(tsv_data, output_file):
    """Write the rows in ``tsv_data`` to ``output_file``, one per line.

    Rows are joined with ``'\\n'``; no newline is added after the last row.
    An existing file at ``output_file`` is overwritten.

    Args:
        tsv_data: Iterable of already tab-joined row strings.
        output_file: Destination path.
    """
    # Explicit UTF-8: with the locale default, non-ASCII captions can raise
    # UnicodeEncodeError or yield different bytes on different machines.
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write('\n'.join(tsv_data))
if __name__ == '__main__':
    # Script entry point: convert a folder of JSON/JPEG pairs into one TSV.
    # Both paths can be given on the command line; without arguments the
    # original hard-coded locations are used.
    import argparse

    parser = argparse.ArgumentParser(
        description='Pack JSON/JPEG pairs from a directory into a TSV file.'
    )
    parser.add_argument(
        'directory', nargs='?', default='/tmp/grit',
        help='folder containing the *.json / *.jpg pairs (default: %(default)s)',
    )
    parser.add_argument(
        'output_file', nargs='?', default='/tmp/output.tsv',
        help='path of the TSV file to write (default: %(default)s)',
    )
    args = parser.parse_args()

    directory = args.directory
    output_file = args.output_file
    tsv_data = process_files(directory)
    write_tsv(tsv_data, output_file)