File size: 1,014 Bytes
5282eae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import webdataset as wds
import glob
import os
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map
import pickle as pkl


def single_thread(filename):
    """Build an image-id lookup table for a single WebDataset tar shard.

    Every sample's decoded ``json`` entry is read; the part of its
    ``"caption"`` string before the first ``.`` is used as the image id
    (presumably the caption holds an image file name -- confirm against the
    dataset), and the id is mapped to where the sample lives.

    Args:
        filename: Path to the ``.tar`` shard to scan.

    Returns:
        A dict mapping image id to ``[tar_basename, sample_key]``. When an id
        occurs more than once in the shard, the first occurrence is kept.

    Raises:
        KeyError: If a sample's json lacks the ``"caption"`` or ``"key"`` field.
    """
    # The shard name is identical for every sample, so compute it once
    # instead of once per sample.
    tar_name = os.path.basename(filename)
    id_table = {}
    dataset = wds.WebDataset(filename).decode().to_tuple("json")
    # to_tuple("json") yields 1-tuples holding the decoded json payload.
    for (meta,) in dataset:
        image_id = meta["caption"].split(".")[0]
        # setdefault leaves an existing entry untouched: first occurrence wins.
        id_table.setdefault(image_id, [tar_name, meta["key"]])
    return id_table

if __name__ == "__main__":
    # Index at most the first 16000 shards, in sorted filename order.
    filenames = sorted(glob.glob("/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/mmc4/images/*.tar"))[:16000]
    if not filenames:
        # Without this guard the prints below fail with a bare IndexError.
        raise SystemExit("no .tar shards matched the glob pattern; nothing to index")
    print("start from", filenames[0])
    print("to", filenames[-1])
    # One per-shard table per input file, returned in the order of `filenames`.
    id_tables = process_map(single_thread, filenames, max_workers=64)
    id_table = {}
    # NOTE(review): dict.update lets a later shard overwrite an earlier one
    # when the same image id appears in both, whereas single_thread keeps the
    # first occurrence within a shard -- confirm this asymmetry is intended.
    for table in tqdm(id_tables):
        id_table.update(table)
    print("total unique image:", len(id_table))
    # Use a context manager so the pickle is flushed and the handle closed
    # deterministically rather than whenever the file object is collected.
    with open("mmc4_id_table.pkl", "wb") as out_file:
        pkl.dump(id_table, out_file)
    print("DONE")