"""Gather the per-part LAION tar shards into one flat directory of symlinks.

Every ``*.tar`` found under ``<DIR>/laion_synthetic_filtered_large/part{i}``
is symlinked into ``OUT_DIR`` as ``000000.tar``, ``000001.tar``, ... in
part-major order.
"""
import os
import shutil
import glob
import random
from pprint import pprint

DIR_COCO_VG = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw"
DIR = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw/blip2_pretraining/"
OUT_DIR = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw/blip2_pretraining/laion_synthetic_filtered_large/all"


def collect_part_tars(base_dir, num_parts=10):
    """Return the tar shards under ``base_dir``, part by part.

    Args:
        base_dir: Directory containing ``laion_synthetic_filtered_large``.
        num_parts: Number of ``part{i}`` sub-directories to scan, starting
            at ``part0``.

    Returns:
        A list of tar paths ordered by part index, then by path within
        each part. Missing part directories simply contribute nothing.
    """
    tars = []
    for part_idx in range(num_parts):
        pattern = os.path.join(
            base_dir, "laion_synthetic_filtered_large", f"part{part_idx}", "*.tar"
        )
        # glob returns matches in arbitrary order; sort them so the numbering
        # assigned by link_tars is reproducible across runs.
        tars.extend(sorted(glob.glob(pattern)))
    return tars


def link_tars(tars, out_dir):
    """Symlink each tar into ``out_dir`` under a zero-padded sequential name.

    The i-th entry of ``tars`` becomes ``<out_dir>/<i as 6 digits>.tar``.
    Re-running is safe: a link that already points at the right target is
    left alone, and a symlink pointing elsewhere is replaced.

    Args:
        tars: Paths the symlinks should point to.
        out_dir: Existing directory in which the symlinks are created.

    Returns:
        The list of symlink paths, in the same order as ``tars``.

    Raises:
        FileExistsError: If a destination exists and is not a symlink.
    """
    destinations = []
    for index, tar in enumerate(tars):
        dst = os.path.join(out_dir, f"{index:06d}.tar")
        destinations.append(dst)
        if os.path.islink(dst):
            if os.readlink(dst) == tar:
                continue
            # Stale link left over from an earlier run with different inputs.
            os.remove(dst)
        # A regular file at dst is never removed; os.symlink raises instead.
        os.symlink(tar, dst)
    return destinations


def main():
    """Create OUT_DIR and populate it with symlinks to every part's shards."""
    os.makedirs(OUT_DIR, exist_ok=True)
    tars = collect_part_tars(DIR)
    print(len(tars))
    pprint(tars[:20])
    link_tars(tars, OUT_DIR)


if __name__ == "__main__":
    main()