"""Build a shuffled, numbered symlink farm over several ``.tar`` shard sets.

Shards are globbed from four source directories, the COCO and VG shards are
repeated several times, the combined list is shuffled, and every entry is
exposed in ``OUT_DIR`` as a symlink named ``000000.tar``, ``000001.tar``, ...
"""
import os
import shutil
import glob
import random
from pprint import pprint

DIR_COCO_VG = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw"
DIR = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw/blip2_pretraining"
OUT_DIR = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw/blip2_all_data_ground"


def build_shard_list(dir_blip=DIR, dir_coco_vg=DIR_COCO_VG, repeat=5):
    """Return the shuffled list of source ``.tar`` paths.

    Args:
        dir_blip: Directory holding the ``ccs_synthetic_filtered_large_ground``
            and ``laion_synthetic_filtered_large/all_ground`` shard folders.
        dir_coco_vg: Directory holding the ``karpathy_coco_wds_full_ground``
            and ``vg_wds_full_ground`` shard folders.
        repeat: How many times the COCO and VG shards are added to the mix.

    Returns:
        A list of paths in random order; COCO/VG paths occur ``repeat`` times,
        all other paths occur once.
    """
    ccs_tars = glob.glob(os.path.join(dir_blip, "ccs_synthetic_filtered_large_ground", "*.tar"))
    coco_tars = glob.glob(os.path.join(dir_coco_vg, "karpathy_coco_wds_full_ground", "*.tar"))
    vg_tars = glob.glob(os.path.join(dir_coco_vg, "vg_wds_full_ground", "*.tar"))
    laion_part_tars = glob.glob(
        os.path.join(dir_blip, "laion_synthetic_filtered_large", "all_ground", "*.tar")
    )

    tars = list(ccs_tars)
    # NOTE(review): the original line breaks were lost; the repeat loop is
    # assumed to cover the COCO and VG shards only - confirm against history.
    for _ in range(repeat):
        tars.extend(coco_tars)
        tars.extend(vg_tars)
    tars.extend(laion_part_tars)

    random.shuffle(tars)
    return tars


def link_shards(tars, out_dir=OUT_DIR):
    """Expose ``tars`` in ``out_dir`` as ``000000.tar``, ``000001.tar``, ...

    Numbered symlinks left behind by an earlier run are removed first, so the
    function can be re-run without ``FileExistsError`` and without stale
    higher-numbered links surviving a smaller re-run. Only symlinks named
    ``<digits>.tar`` are removed; regular files are never touched.

    Args:
        tars: Source paths, in the order they should be numbered.
        out_dir: Existing directory that receives the symlinks.

    Returns:
        The number of symlinks created.

    Raises:
        FileExistsError: If a target name is occupied by a non-symlink.
    """
    if not tars:
        # Nothing to link (e.g. the globs matched nothing): keep whatever a
        # previous successful run produced instead of wiping it.
        return 0

    for name in os.listdir(out_dir):
        stem, ext = os.path.splitext(name)
        path = os.path.join(out_dir, name)
        if ext == ".tar" and len(stem) >= 6 and stem.isdigit() and os.path.islink(path):
            os.unlink(path)

    for i, tar in enumerate(tars):
        os.symlink(tar, os.path.join(out_dir, f"{i:06d}.tar"))
    return len(tars)


def main():
    """Create ``OUT_DIR``, report the shard mix, and write the symlinks."""
    os.makedirs(OUT_DIR, exist_ok=True)
    tars = build_shard_list()
    print(len(tars))
    pprint(tars[:20])
    link_shards(tars, OUT_DIR)


if __name__ == "__main__":
    main()