"""Launch the YFCC preprocessing job array on Jean Zay.

One Slurm array task is requested per ``*.csv`` file found in
``--src_csv_dir``. Every task runs ``data/to_webdataset/yfcc_to_wds.py`` and
receives its array index through ``--job_offset``. The command is always
built; it is only submitted when ``--launch`` is passed.
"""

import argparse
import os
import sys
from pathlib import Path

# Make the repository root importable so that ``jean_zay`` resolves when this
# script is executed directly from its own directory.
sys.path.append(str(Path(__file__).resolve().parent.parent.parent))

from jean_zay.launch import JeanZayExperiment


def parse_mode():
    """Parse the command-line options of the launcher.

    Returns:
        argparse.Namespace: with attributes ``launch``, ``src_csv_dir``,
        ``src_images_dir``, ``dest``, ``num_samples_per_tar`` and
        ``batch_size``.
    """
    parser = argparse.ArgumentParser(
        description="Extract embeddings from YFCC dataset using DINOv2"
    )
    parser.add_argument(
        "--launch",
        action="store_true",
        help="Launch the experiment",
    )
    # Required: the CSV directory determines the size of the job array, so a
    # missing value must be reported by argparse rather than surfacing later
    # as a TypeError from ``Path(None)``.
    parser.add_argument(
        "--src_csv_dir",
        required=True,
        help="path to source csv directory",
    )
    parser.add_argument("--src_images_dir", help="path to source images directory")
    parser.add_argument("--dest", help="path to destination")
    parser.add_argument(
        "--num_samples_per_tar",
        help="number of samples per tar",
        type=int,
        default=10000,
    )
    parser.add_argument("--batch_size", help="batch size", type=int, default=256)
    return parser.parse_args()


args = parse_mode()

# One array task per CSV file; count lazily instead of building a list.
number_of_jobs = sum(1 for _ in Path(args.src_csv_dir).glob("*.csv"))
if number_of_jobs == 0:
    # An empty job array cannot do any work and the command below always
    # expects a SLURM_ARRAY_TASK_ID, so fail fast with an actionable message.
    raise SystemExit(
        f"No *.csv files found in {args.src_csv_dir!r}; nothing to preprocess."
    )

cmd_modifiers = []
exps = []

exp_name = "yfcc_preprocessing"
job_name = "yfcc_preprocessing"
jz_exp = JeanZayExperiment(
    exp_name,
    job_name,
    slurm_array_nb_jobs=number_of_jobs,
    cmd_path="data/to_webdataset/yfcc_to_wds.py",
    num_nodes=1,
    num_gpus_per_node=1,
    qos="t3",
    account="syq",
    gpu_type="a100",
    time="1:30:00",
)
exps.append(jz_exp)

trainer_modifiers = {}

exp_modifier = {
    "--src_csv_dir": args.src_csv_dir,
    "--src_images_dir": args.src_images_dir,
    "--dest": args.dest,
    "--num_samples_per_tar": args.num_samples_per_tar,
    # Left unexpanded on purpose: the literal text is placed in the command
    # so that each array task sees its own index.
    "--job_offset": "${SLURM_ARRAY_TASK_ID}",
    "--batch_size": args.batch_size,
}

# Experiment-specific options take precedence over the trainer defaults.
cmd_modifiers.append({**trainer_modifiers, **exp_modifier})


if __name__ == "__main__":
    for exp, cmd_modifier in zip(exps, cmd_modifiers):
        exp.build_cmd(cmd_modifier)
        if args.launch:
            exp.launch()