File size: 982 Bytes
5282eae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import webdataset as wds
import os
from tqdm import tqdm
from PIL import Image
from io import BytesIO
import base64
# Directory that receives the generated WebDataset shards (000000.tar, 000001.tar, ...).
OUT_DIR = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw/vqav2_train_wds"
# Row count passed to tqdm so the progress bar can show a percentage/ETA;
# it does not limit how many rows are actually processed.
TOTAL = 1828467

if __name__ == "__main__":
    # Convert the tab-separated VQA training file into tar shards, one sample
    # per input row: an RGB image ("jpg") plus a "Question: ... Answer: ..."
    # caption ("txt").

    # Create the output directory up front so the shard writer does not fail
    # on a fresh machine / clean scratch space.
    os.makedirs(OUT_DIR, exist_ok=True)

    with wds.ShardWriter(os.path.join(OUT_DIR, "%06d.tar"), maxcount=10000) as sink:
        sink.verbose = False
        # Open the input inside a context manager so the handle is closed even
        # if a row fails to parse; the explicit encoding keeps decoding
        # independent of the process locale.
        with open(
            "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw/vqav2_ofa/vqa_train.tsv",
            encoding="utf-8",
        ) as f:
            for line in tqdm(f, total=TOTAL):
                fields = line.rstrip().split("\t")
                id1 = fields[0]
                id2 = fields[1]
                question = fields[2]
                # Keep only the text after the last "|!+" separator.
                # NOTE(review): presumably the part before it is a score/weight
                # for the answer -- confirm against the TSV producer.
                answer = fields[3].split("|!+")[-1]
                # Column 4 is not used by this conversion.
                image_b64 = fields[5]
                id3 = fields[6]
                # The image column holds URL-safe base64 of an encoded image file.
                image = Image.open(BytesIO(base64.urlsafe_b64decode(image_b64))).convert("RGB")
                caption = f"Question: {question.strip()} Answer: {answer.strip()}"
                sink.write({"__key__": f"vqav2_{id1}_{id2}_{id3}", "jpg": image, "txt": caption})