# Spaces: Sleeping (page-status header captured along with this file; not part of the script)
import argparse
import base64
import hashlib
import io
import json
import os

from PIL import Image
from tqdm import tqdm
def calculate_md5(image):
    """Return the hexadecimal MD5 digest of *image* re-encoded as JPEG.

    The image is serialized into an in-memory buffer through its ``save``
    method (``format='JPEG'``), so the digest covers those re-encoded bytes
    rather than the contents of any file on disk.

    Args:
        image: Object exposing ``save(fp, format=...)``; callers in this
            file pass an image returned by ``Image.open``.

    Returns:
        The digest as a lowercase hex string.
    """
    buffer = io.BytesIO()
    try:
        image.save(buffer, format='JPEG')
        encoded = buffer.getvalue()
    finally:
        # Release the in-memory buffer even if encoding fails.
        buffer.close()
    return hashlib.md5(encoded).hexdigest()
def process_files(directory):
    """Build one TSV row for every ``<stem>.json`` / ``<stem>.jpg`` pair.

    Each ``.json`` file in *directory* is read together with the ``.jpg``
    file that shares its stem. The image is re-encoded as JPEG once, and
    those bytes feed both the MD5 digest and the base64 payload.

    Row columns, tab-separated: md5, caption, base64 JPEG, width, height,
    and the ``str()`` of a dict with keys ``phrase`` (from ``noun_chunks``)
    and ``expression_v1`` (from ``ref_exps``).

    Args:
        directory: Folder to scan; sub-folders are not visited.

    Returns:
        A list of row strings without trailing newlines, ordered by file
        name.

    Raises:
        FileNotFoundError: A ``.json`` file has no matching ``.jpg``.
        KeyError: A JSON document lacks one of the required keys.
    """
    tsv_data = []
    # os.listdir order is arbitrary; sorting keeps the output reproducible.
    for file in tqdm(sorted(os.listdir(directory))):
        if not file.endswith('.json'):
            continue

        json_path = os.path.join(directory, file)
        # Swap only the extension; str.replace would also rewrite any
        # '.json' that happens to appear in the middle of the name.
        jpg_path = os.path.join(directory, os.path.splitext(file)[0] + '.jpg')

        # Read as UTF-8 explicitly so captions do not depend on the locale.
        with open(json_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        # Encode once and reuse the bytes. This yields the same digest as
        # hashing a separate JPEG re-encoding of the same image, at half
        # the encoding cost, and the context manager closes the file handle.
        with Image.open(jpg_path) as image, io.BytesIO() as buffer:
            image.save(buffer, format='JPEG')
            jpeg_bytes = buffer.getvalue()

        md5 = hashlib.md5(jpeg_bytes).hexdigest()
        image_base64 = base64.b64encode(jpeg_bytes).decode("utf-8")

        # NOTE(review): this dict is written via str(), i.e. as a Python
        # repr rather than JSON, and a tab or newline inside the caption
        # would break the row. Both are kept as-is to preserve the existing
        # output format; confirm against whatever consumes the TSV.
        combined_data_str = {
            'phrase': data['noun_chunks'],
            'expression_v1': data['ref_exps'],
        }
        tsv_row = [
            md5,
            data['caption'],
            image_base64,
            data['width'],
            data['height'],
            combined_data_str,
        ]
        tsv_data.append('\t'.join(map(str, tsv_row)))
    return tsv_data
def write_tsv(tsv_data, output_file):
    """Write the rows in *tsv_data* to *output_file* as UTF-8 text.

    Rows are separated by a single newline and no trailing newline is
    appended, which is the same layout as joining the rows with a newline.
    Rows are streamed one at a time, so the whole file content is never
    built as one string in memory.

    Args:
        tsv_data: Iterable of already-formatted row strings.
        output_file: Path of the file to create or overwrite.
    """
    # Explicit UTF-8: the platform default may be unable to encode
    # non-ASCII caption text.
    with open(output_file, 'w', encoding='utf-8') as file:
        for index, row in enumerate(tsv_data):
            if index:
                file.write('\n')
            file.write(row)
if __name__ == '__main__':
    # Both paths are optional positionals, so running the script with no
    # arguments still uses the original hard-coded locations.
    parser = argparse.ArgumentParser(
        description='Pack JSON/JPEG pairs from a directory into a TSV file.',
    )
    parser.add_argument(
        'directory',
        nargs='?',
        default='/tmp/grit',
        help='folder containing the .json and .jpg files (default: %(default)s)',
    )
    parser.add_argument(
        'output_file',
        nargs='?',
        default='/tmp/output.tsv',
        help='path of the TSV file to write (default: %(default)s)',
    )
    args = parser.parse_args()

    tsv_data = process_files(args.directory)
    write_tsv(tsv_data, args.output_file)