Commit c1f3687 • devforfu committed • 1 parent: 4334bbd

Movie stills binary classifier

Files changed:
- metadata/movies_plus.jsonl +3 -0
- realfake/bin/download_s3.py +43 -18
- realfake/utils.py +5 -2
- submit_movie.sh +24 -0
metadata/movies_plus.jsonl ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:750d4828d1d5051b390519fa1e964b45305a5ffae3d1ef50b783568452bc13fa
+size 5992428
realfake/bin/download_s3.py CHANGED

@@ -1,23 +1,34 @@
-from __future__ import annotations
 import tarfile
 from dataclasses import dataclass
 from pathlib import Path
+from typing import List

 import boto3
 from joblib import Parallel, delayed

-from realfake.utils import get_user_name
+from realfake.utils import get_user_name, inject_args, Args


+class DownloadArgs(Args):
+    start_idx: int = 0
+    end_idx: int = 5247
+    metadata_only: bool = False
+
+
+@inject_args
+def main(args: DownloadArgs) -> None:
+    print(args)
     bucket, prefix = "s-datasets", "laion-aesthetic/data/laion2B-en-aesthetic/"
-    start_idx, end_idx =
+    start_idx, end_idx = args.start_idx, args.end_idx
     keys_range = list(range(start_idx, end_idx))

     output_dir = Path(f"/fsx/{get_user_name()}/data/real_aes_{start_idx}_{end_idx}")
+    if not args.metadata_only:
+        output_dir.mkdir(parents=True, exist_ok=True)
+    metadata_dir = output_dir.parent/f"{output_dir.name}.metadata"
+    metadata_dir.mkdir(parents=True, exist_ok=True)

-    jobs = get_jobs(keys_range, bucket, prefix, output_dir)
+    jobs = get_jobs(keys_range, bucket, prefix, output_dir, metadata_dir, args.metadata_only)

     Parallel(n_jobs=-1, backend="multiprocessing", verbose=100)(delayed(download_and_extract)(job) for job in jobs)

@@ -29,7 +40,14 @@ class Job:
     output_dir: Path


-def get_jobs(
+def get_jobs(
+    keys_range: list,
+    bucket: str,
+    prefix: str,
+    output_dir: Path,
+    metadata_dir: Path,
+    metadata_only: bool,
+) -> List[Job]:
     client = boto3.client("s3")

     token, jobs = None, []

@@ -41,8 +59,10 @@ def get_jobs(keys_range: list, bucket: str, prefix: str, output_dir: Path) -> li

         for item in response.get("Contents"):
             key = Path(item["Key"])
-            if key.suffix == ".tar" and int(key.stem) in keys_range:
+            if key.suffix == ".tar" and int(key.stem) in keys_range and not metadata_only:
                 jobs.append(Job(bucket, key, output_dir))
+            elif key.suffix == ".parquet" and int(key.stem) in keys_range:
+                jobs.append(Job(bucket, key, metadata_dir))

         if not response["IsTruncated"]: break
         token = response["NextContinuationToken"]

@@ -52,19 +72,24 @@ def get_jobs(keys_range: list, bucket: str, prefix: str, output_dir: Path) -> li

 def download_and_extract(job: Job) -> None:
     client = boto3.client("s3")
+    filename = job.output_dir / job.key.name

     print(f"{job.key}: downloading...")
-    client.download_file(job.bucket, str(job.key),
+    client.download_file(job.bucket, str(job.key), filename)
+
+    if filename.suffix == ".tar":
+        print(f"{job.key}: extracting...")
+        with tarfile.open(filename) as tar:
+            for name in tar.getnames():
+                extracted_path = job.output_dir/name
+                if extracted_path.exists():
+                    continue
+                if name.endswith(".jpg"):
+                    tar.extract(name, job.output_dir)
+        filename.unlink()
+
     print(f"{job.key}: done!")


 if __name__ == "__main__":
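The new entrypoint relies on Args and inject_args from realfake/utils.py, neither of which appears in this diff. As a rough orientation only, here is a minimal sketch of how such a pair could work, assuming Args subclasses declare typed class attributes with defaults (as DownloadArgs does) and inject_args derives a CLI parser from the annotated parameter of the wrapped function; every name and flag spelling below is an assumption, not the project's actual interface.

# Hypothetical sketch -- NOT the realfake.utils implementation, which this diff does not show.
import argparse
import functools
import typing


class Args:
    """Assumed base class: fields are typed class attributes with default values."""

    def __init__(self, **kwargs):
        for name, default in self._fields().items():
            setattr(self, name, kwargs.get(name, default))

    @classmethod
    def _fields(cls) -> dict:
        # Collect annotated fields (e.g. start_idx, end_idx, metadata_only) with their defaults.
        return {name: getattr(cls, name) for name in typing.get_type_hints(cls)}

    def __repr__(self) -> str:
        body = ", ".join(f"{k}={getattr(self, k)!r}" for k in self._fields())
        return f"{type(self).__name__}({body})"


def inject_args(fn):
    """Assumed decorator: parse CLI flags for the function's Args parameter and inject it."""
    hints = typing.get_type_hints(fn)
    hints.pop("return", None)
    args_cls = next(iter(hints.values()))  # e.g. DownloadArgs

    @functools.wraps(fn)
    def wrapper():
        parser = argparse.ArgumentParser()
        for name, default in args_cls._fields().items():
            if isinstance(default, bool):
                parser.add_argument(f"--{name}", action="store_true", default=default)
            else:
                parser.add_argument(f"--{name}", type=type(default), default=default)
        parsed = parser.parse_args()
        return fn(args_cls(**vars(parsed)))

    return wrapper

Under those assumptions the script would be invoked roughly as python realfake/bin/download_s3.py --end_idx 100 --metadata_only; the real flag handling depends on the actual realfake.utils implementation.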
realfake/utils.py CHANGED

@@ -122,5 +122,8 @@ def find_latest_checkpoint(dirname: Path) -> Path:
     return latest


-def list_files(dirname: Path, exts: list[str]) -> list:
+def list_files(dirname: Path, exts: list[str] | None = None) -> list:
+    files = Path(dirname).iterdir()
+    if not exts:
+        return list(files)
+    return [fn for fn in files for ext in exts if fn.match(f"*.{ext}")]
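For reference, a short usage sketch of the updated list_files helper; the directory path below is a placeholder.

from pathlib import Path

from realfake.utils import list_files

data_dir = Path("/fsx/<user>/data/real_aes_0_5247")  # placeholder path

# Without exts, every entry from iterdir() is returned (files and subdirectories, non-recursive).
everything = list_files(data_dir)

# With exts, only entries matching *.jpg or *.parquet are kept.
images_and_metadata = list_files(data_dir, exts=["jpg", "parquet"])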
submit_movie.sh ADDED

@@ -0,0 +1,24 @@
+#!/bin/bash -l
+
+# SLURM SUBMIT SCRIPT
+#SBATCH --partition=g40
+#SBATCH --nodes=1
+#SBATCH --gpus=8
+#SBATCH --cpus-per-gpu=6
+#SBATCH --job-name=realfake
+#SBATCH --comment=laion
+#SBATCH --signal=SIGUSR1@90
+
+source "${HOME}/venv/bin/activate"
+
+export NCCL_DEBUG=INFO
+export PYTHONFAULTHANDLER=1
+export PYTHONPATH="${HOME}/realfake"
+
+echo "Working directory: `pwd`"
+
+srun python3 realfake/train_cluster.py \
+    -jf "${HOME}/realfake/metadata/movies_plus.jsonl" \
+    -mn convnext_small -e=40 -fe=40 -bs=128 -wl=1 -fw=0.08 \
+    --acceleratorparams.devices=8 \
+    --acceleratorparams.strategy=ddp_find_unused_parameters_false
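The script follows the usual SLURM batch pattern, so it would be submitted with sbatch submit_movie.sh; srun then launches realfake/train_cluster.py across the node's 8 GPUs. The --signal=SIGUSR1@90 directive asks SLURM to deliver SIGUSR1 90 seconds before the job's time limit, which Lightning-style trainers commonly use to checkpoint and requeue.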