Commit 8ee7393 · Parent: 76ccb95
update code

Changed files:
- .gitignore +2 -0
- ConsistentID/app.py +2 -2
- ConsistentID/requirements.txt +1 -1
- README2.md +2 -2
- adaface/__init__.py +0 -0
- adaface/adaface_infer.py +17 -22
- adaface/adaface_translate.py +59 -36
- adaface/adaface_wrapper.py +366 -72
- adaface/diffusers_attn_lora_capture.py +656 -0
- adaface/face_id_to_ada_prompt.py +270 -119
- adaface/subj_basis_generator.py +97 -59
- adaface/unet_teachers.py +86 -49
- adaface/util.py +21 -18
- animatediff/sd/.gitattributes +0 -35
- animatediff/sd/feature_extractor/preprocessor_config.json +0 -20
- animatediff/sd/model_index.json +0 -32
- animatediff/sd/safety_checker/config.json +0 -175
- animatediff/sd/scheduler/scheduler_config.json +0 -13
- animatediff/sd/text_encoder/config.json +0 -25
- animatediff/sd/tokenizer/merges.txt +0 -0
- animatediff/sd/tokenizer/special_tokens_map.json +0 -24
- animatediff/sd/tokenizer/tokenizer_config.json +0 -34
- animatediff/sd/tokenizer/vocab.json +0 -0
- animatediff/sd/unet/config.json +0 -36
- animatediff/sd/v1-inference.yaml +0 -70
- animatediff/sd/vae/config.json +0 -29
- animatediff/utils/convert_from_ckpt.py +1 -1
- app.py +123 -105
- faceadapter/face_adapter.py +2 -2
- infer.py +4 -3
- requirements.txt +1 -1
.gitignore CHANGED
@@ -6,3 +6,5 @@ gradio_cached_examples/
 samples/*
 samples/
 .gradio/certificate.pem
+models/*
+models
ConsistentID/app.py CHANGED
@@ -26,8 +26,8 @@ pipe = ConsistentIDPipeline.from_pretrained(
 
 ### Load consistentID_model checkpoint
 pipe.load_ConsistentID_model(
-    consistentID_weight_path="./models/ConsistentID-v1.bin",
-    bise_net_weight_path="./models/BiSeNet_pretrained_for_ConsistentID.pth",
+    consistentID_weight_path="./models/ConsistentID/ConsistentID-v1.bin",
+    bise_net_weight_path="./models/ConsistentID/BiSeNet_pretrained_for_ConsistentID.pth",
 )
 pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
 pipe = pipe.to(device, torch.float16)
ConsistentID/requirements.txt CHANGED
@@ -7,7 +7,7 @@ peft
 opencv-python
 insightface
 diffusers
-torch
+torch==2.4.1
 torchvision
 transformers
 spaces
README2.md CHANGED
@@ -187,9 +187,9 @@ To exclude the effects of AdaFace, we generate a subset of videos with AdaFace-A
 ## Installation
 
 ### Manually Download Model Checkpoints
-- Download Stable Diffusion V1.5 into ``animatediff/sd``:
+- Download Stable Diffusion V1.5 into ``models/animatediff/sd``:
 
-  ``git clone https://huggingface.co/runwayml/stable-diffusion-v1-5 animatediff/sd``
+  ``git clone https://huggingface.co/runwayml/stable-diffusion-v1-5 models/animatediff/sd``
 - Download AnimateDiff motion module into ``models/v3_sd15_mm.ckpt``: https://huggingface.co/guoyww/animatediff/blob/main/v3_sd15_mm.ckpt
 - Download Animatediff adapter into ``models/v3_adapter_sd_v15.ckpt``: https://huggingface.co/guoyww/animatediff/blob/main/v3_sd15_adapter.ckpt
 - Download ID-Animator checkpoint into ``models/animator.ckpt`` from: https://huggingface.co/spaces/ID-Animator/ID-Animator/blob/main/animator.ckpt
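For the single-file checkpoints in the list above, a short download script can replace the manual clicking. A minimal sketch (not part of the commit) using huggingface_hub; the repo IDs and filenames are read off the URLs above:

    # Sketch: fetch the AnimateDiff/ID-Animator checkpoints into models/.
    # Assumes `pip install huggingface_hub`.
    import os
    from huggingface_hub import hf_hub_download

    hf_hub_download(repo_id="guoyww/animatediff", filename="v3_sd15_mm.ckpt",
                    local_dir="models")
    adapter = hf_hub_download(repo_id="guoyww/animatediff", filename="v3_sd15_adapter.ckpt",
                              local_dir="models")
    # The README stores the adapter under a different name.
    os.rename(adapter, "models/v3_adapter_sd_v15.ckpt")
    # The ID-Animator checkpoint lives in a Space, hence repo_type="space".
    hf_hub_download(repo_id="ID-Animator/ID-Animator", repo_type="space",
                    filename="animator.ckpt", local_dir="models")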
adaface/__init__.py ADDED
(empty file)
adaface/adaface_infer.py CHANGED
@@ -41,42 +41,36 @@ def seed_everything(seed):
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--pipeline", type=str, default="text2img",
-                        choices=["text2img", "img2img", "text2img3", "flux"],
+                        choices=["text2img", "text2imgxl", "img2img", "text2img3", "flux"],
                         help="Type of pipeline to use (default: txt2img)")
     parser.add_argument("--base_model_path", type=str, default=None,
                         help="Type of checkpoints to use (default: None, using the official model)")
-    parser.add_argument('--…
-
-    parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["arc2face"],
+    parser.add_argument('--adaface_ckpt_path', type=str, required=True)
+    parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["consistentID", "arc2face"],
                         choices=["arc2face", "consistentID"], help="Type(s) of the ID2Ada prompt encoders")
+    parser.add_argument("--enabled_encoders", type=str, nargs="+", default=None,
+                        choices=["arc2face", "consistentID"],
+                        help="List of enabled encoders (among the list of adaface_encoder_types). Default: None (all enabled)")
     # If adaface_encoder_cfg_scales is not specified, the weights will be set to 6.0 (consistentID) and 1.0 (arc2face).
     parser.add_argument('--adaface_encoder_cfg_scales', type=float, nargs="+", default=None,
                         help="CFG scales of output embeddings of the ID2Ada prompt encoders")
     parser.add_argument("--main_unet_filepath", type=str, default=None,
                         help="Path to the checkpoint of the main UNet model, if you want to replace the default UNet within --base_model_path")
     parser.add_argument("--extra_unet_dirpaths", type=str, nargs="*",
-                        default=[…
+                        default=[],
                         help="Extra paths to the checkpoints of the UNet models")
-    parser.add_argument('--…
+    parser.add_argument('--unet_weights_in_ensemble', type=float, nargs="+", default=[1],
                         help="Weights for the UNet models")
     parser.add_argument("--subject", type=str)
     parser.add_argument("--example_image_count", type=int, default=-1, help="Number of example images to use")
     parser.add_argument("--out_image_count", type=int, default=4, help="Number of images to generate")
     parser.add_argument("--prompt", type=str, default="a woman z in superman costume")
-    parser.add_argument("--…
+    parser.add_argument("--perturb_std", type=float, default=0)
     parser.add_argument("--randface", action="store_true")
     parser.add_argument("--scale", dest='guidance_scale', type=float, default=4,
                         help="Guidance scale for the diffusion model")
-    parser.add_argument("--id_cfg_scale", type=float, default=6,
-                        help="CFG scale when generating the identity embeddings")
-
-    parser.add_argument("--subject_string",
-                        type=str, default="z",
-                        help="Subject placeholder string used in prompts to denote the concept.")
     parser.add_argument("--num_images_per_row", type=int, default=4,
                         help="Number of images to display in a row in the output grid image.")
-    parser.add_argument("--num_inference_steps", type=int, default=50,
-                        help="Number of DDIM inference steps")
     parser.add_argument("--device", type=str, default="cuda", help="Device to run the model on")
     parser.add_argument("--seed", type=int, default=42,
                         help="the seed (for reproducible sampling). Set to -1 to disable.")
@@ -95,16 +89,15 @@ if __name__ == "__main__":
 
     if args.pipeline not in ["text2img", "img2img"]:
         args.extra_unet_dirpaths = None
-        args.…
+        args.unet_weights_in_ensemble = None
 
     adaface = AdaFaceWrapper(args.pipeline, args.base_model_path,
-                             args.adaface_encoder_types, args.…
-                             args.adaface_encoder_cfg_scales,
-                             args.subject_string, args.num_inference_steps,
+                             args.adaface_encoder_types, args.adaface_ckpt_path,
+                             args.adaface_encoder_cfg_scales, args.enabled_encoders,
                              unet_types=None,
                              main_unet_filepath=args.main_unet_filepath,
                              extra_unet_dirpaths=args.extra_unet_dirpaths,
-                             …
+                             unet_weights_in_ensemble=args.unet_weights_in_ensemble, device=args.device)
 
     if not args.randface:
         image_folder = args.subject
@@ -143,7 +136,7 @@ if __name__ == "__main__":
         rand_init_id_embs = torch.randn(1, 512)
 
     init_id_embs = rand_init_id_embs if args.randface else None
-
+    init_noise = torch.randn(args.out_image_count, 4, 64, 64).cuda()
     # args.perturb_std: the *relative* std of the noise added to the face embeddings.
     # A noise level of 0.08 could change gender, but 0.06 is usually safe.
     # adaface_subj_embs is not used. It is generated for the purpose of updating the text encoder (within this function call).
@@ -151,5 +144,7 @@ if __name__ == "__main__":
     adaface.prepare_adaface_embeddings(image_paths, init_id_embs,
                                        perturb_at_stage='img_prompt_emb',
                                        perturb_std=args.perturb_std, update_text_encoder=True)
-    images = adaface(…
+    images = adaface(init_noise, args.prompt, None, None,
+                     'append', args.guidance_scale,
+                     args.out_image_count, verbose=True)
     save_images(images, args.num_images_per_row, subject_name, f"guide{args.guidance_scale}", args.perturb_std)
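The inference script now passes an explicit initial-noise latent through the wrapper's forward call. A minimal sketch of the new calling convention (the checkpoint and subject-image paths are placeholders; the positional forward arguments mirror the call in the diff above):

    # Sketch of the new inference flow; paths below are placeholders, not shipped files.
    import torch
    from adaface.adaface_wrapper import AdaFaceWrapper

    adaface = AdaFaceWrapper("text2img", None,
                             ["consistentID", "arc2face"], "models/adaface/adaface.pt",
                             None, None, device="cuda")
    # Installs the subject embeddings into the extended text encoder.
    adaface.prepare_adaface_embeddings(["subjects/alice/1.jpg"], None,
                                       perturb_at_stage='img_prompt_emb',
                                       perturb_std=0, update_text_encoder=True)
    # forward() now takes the initial latents explicitly: [N, 4, 64, 64] for SD 1.5 at 512px.
    init_noise = torch.randn(4, 4, 64, 64).cuda()
    images = adaface(init_noise, "a woman z in superman costume", None, None,
                     'append', 4.0, 4, verbose=True)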
adaface/adaface_translate.py CHANGED
@@ -25,21 +25,25 @@ def seed_everything(seed):
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--base_model_path", type=str, default='models/…
-                        help="Path to the UNet checkpoint (…
-    parser.add_argument('--…
-
-    parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["arc2face"],
+    parser.add_argument("--base_model_path", type=str, default='models/sar/sar.safetensors',
+                        help="Path to the UNet checkpoint (Default: SAR)")
+    parser.add_argument('--adaface_ckpt_path', type=str, required=True)
+    parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["consistentID", "arc2face"],
                         choices=["arc2face", "consistentID"], help="Type(s) of the ID2Ada prompt encoders")
+    parser.add_argument("--enabled_encoders", type=str, nargs="+", default=None,
+                        choices=["arc2face", "consistentID"],
+                        help="List of enabled encoders (among the list of adaface_encoder_types). Default: None (all enabled)")
     # If adaface_encoder_cfg_scales is not specified, the weights will be set to 6.0 (consistentID) and 1.0 (arc2face).
     parser.add_argument('--adaface_encoder_cfg_scales', type=float, nargs="+", default=None,
                         help="CFG scales of output embeddings of the ID2Ada prompt encoders")
     parser.add_argument('--extra_unet_dirpaths', type=str, nargs="*",
-                        default=[…
+                        default=[],
                         help="Extra paths to the checkpoints of the UNet models")
-    parser.add_argument('--…
+    parser.add_argument('--unet_weights_in_ensemble', type=float, nargs="+", default=[1],
                         help="Weights for the UNet models")
     parser.add_argument("--in_folder", type=str, required=True, help="Path to the folder containing input images")
+    parser.add_argument("--restore_image", type=str, default=None,
+                        help="Path to the image to be restored")
     # If True, the input folder contains images of mixed subjects.
     # If False, the input folder contains multiple subfolders, each of which contains images of the same subject.
     parser.add_argument("--is_mix_subj_folder", type=str2bool, const=True, default=False, nargs="?",
@@ -49,19 +53,14 @@ def parse_args():
     parser.add_argument("--out_folder", type=str, required=True, help="Path to the folder saving output images")
     parser.add_argument("--out_count_per_input_image", type=int, default=1, help="Number of output images to generate per input image")
     parser.add_argument("--copy_masks", action="store_true", help="Copy the mask images to the output folder")
-    parser.add_argument("--…
+    parser.add_argument("--perturb_std", type=float, default=0)
     parser.add_argument("--scale", dest='guidance_scale', type=float, default=4,
                         help="Guidance scale for the diffusion model")
     parser.add_argument("--ref_img_strength", type=float, default=0.8,
                         help="Strength of the reference image in the output image.")
-    parser.add_argument("--subject_string",
-                        type=str, default="z",
-                        help="Subject placeholder string used in prompts to denote the concept.")
     parser.add_argument("--prompt", type=str, default="a person z")
     parser.add_argument("--num_images_per_row", type=int, default=4,
                         help="Number of images to display in a row in the output grid image.")
-    parser.add_argument("--num_inference_steps", type=int, default=50,
-                        help="Number of DDIM inference steps")
     parser.add_argument("--num_gpus", type=int, default=1, help="Number of GPUs to use. If num_gpus > 1, use accelerate for distributed execution.")
     parser.add_argument("--device", type=str, default="cuda", help="Device to run the model on")
     parser.add_argument("--seed", type=int, default=42,
@@ -90,15 +89,16 @@ if __name__ == "__main__":
     process_index = 0
 
     adaface = AdaFaceWrapper("img2img", args.base_model_path,
-                             args.adaface_encoder_types, args.…
-                             args.adaface_encoder_cfg_scales,
-                             args.subject_string, args.num_inference_steps,
+                             args.adaface_encoder_types, args.adaface_ckpt_path,
+                             args.adaface_encoder_cfg_scales, args.enabled_encoders,
                              unet_types=None,
-                             extra_unet_dirpaths=args.extra_unet_dirpaths,
+                             extra_unet_dirpaths=args.extra_unet_dirpaths,
+                             unet_weights_in_ensemble=args.unet_weights_in_ensemble,
                              device=args.device)
 
     in_folder = args.in_folder
     if os.path.isfile(in_folder):
+        args.in_folder = os.path.dirname(args.in_folder)
         subject_folders = [ os.path.dirname(in_folder) ]
         images_by_subject = [[in_folder]]
     else:
@@ -154,6 +154,24 @@ if __name__ == "__main__":
     images_by_subject = images_by_subject[process_index::args.num_gpus]
     #subject_folders, images_by_subject = distributed_state.split_between_processes(zip(subject_folders, images_by_subject))
 
+    if args.restore_image is not None:
+        in_images = []
+        for image_path in [args.restore_image]:
+            image = Image.open(image_path).convert("RGB").resize((512, 512))
+            # [512, 512, 3] -> [3, 512, 512].
+            image = np.array(image).transpose(2, 0, 1)
+            # Convert the image to a tensor of shape (1, 3, 512, 512) and move it to the GPU.
+            image = torch.tensor(image).unsqueeze(0).float().cuda()
+            in_images.append(image)
+
+        # Put all input images of the subject into a batch. This assumes max_images_per_subject is small.
+        # NOTE: For simplicity, we do not check overly large batch sizes.
+        in_images = torch.cat(in_images, dim=0)
+        # in_images: [5, 3, 512, 512].
+        # Normalize the pixel values to [0, 1].
+        in_images = in_images / 255.0
+        num_out_images = len(in_images) * args.out_count_per_input_image
+
     for (subject_folder, image_paths) in zip(subject_folders, images_by_subject):
         # If is_mix_subj_folder, then image_paths only contains 1 image, and we use the file name as the signature of the image.
         # Otherwise, we use the folder name as the signature of the images.
@@ -173,29 +191,32 @@ if __name__ == "__main__":
             os.makedirs(subject_out_folder)
         print(f"Output images will be saved to {subject_out_folder}")
 
-        …
+        if args.restore_image is None:
+            in_images = []
+            for image_path in image_paths:
+                image = Image.open(image_path).convert("RGB").resize((512, 512))
+                # [512, 512, 3] -> [3, 512, 512].
+                image = np.array(image).transpose(2, 0, 1)
+                # Convert the image to a tensor of shape (1, 3, 512, 512) and move it to the GPU.
+                image = torch.tensor(image).unsqueeze(0).float().cuda()
+                in_images.append(image)
+
+            # Put all input images of the subject into a batch. This assumes max_images_per_subject is small.
+            # NOTE: For simplicity, we do not check overly large batch sizes.
+            in_images = torch.cat(in_images, dim=0)
+            # in_images: [5, 3, 512, 512].
+            # Normalize the pixel values to [0, 1].
+            in_images = in_images / 255.0
+            num_out_images = len(in_images) * args.out_count_per_input_image
 
         with torch.no_grad():
            # args.perturb_std: the *relative* std of the noise added to the face embeddings.
            # A noise level of 0.08 could change gender, but 0.06 is usually safe.
            # The returned adaface_subj_embs are already incorporated in the text encoder, and not used explicitly.
            # NOTE: We assume out_count_per_input_image == 1, so that the output images are of the same number as the input images.
-            out_images = adaface(in_images, args.prompt, None,…
+            out_images = adaface(in_images, args.prompt, None, None,
+                                 'append', args.guidance_scale, num_out_images,
+                                 ref_img_strength=args.ref_img_strength)
 
             for img_i, img in enumerate(out_images):
                 # out_images: subj_1, subj_2, ..., subj_n, subj_1, subj_2, ..., subj_n, ...
@@ -203,9 +224,11 @@ if __name__ == "__main__":
                 copy_i = img_i // len(in_images)
                 image_filename_stem, image_fileext = os.path.splitext(os.path.basename(image_paths[subj_i]))
                 if copy_i == 0:
-                    …
+                    save_path = os.path.join(subject_out_folder, f"{image_filename_stem}{image_fileext}")
                 else:
-                    …
+                    save_path = os.path.join(subject_out_folder, f"{image_filename_stem}_{copy_i}{image_fileext}")
+                img.save(save_path)
+                print(f"Saved {save_path}")
 
                 if args.copy_masks:
                     mask_path = image_paths[subj_i].replace(image_fileext, "_mask.png")
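Both the new --restore_image branch and the per-subject branch build the img2img input batch the same way. A small helper (hypothetical, not in the commit) makes the intended shapes and normalization explicit:

    # Hypothetical helper equivalent to the duplicated preprocessing above.
    import numpy as np
    import torch
    from PIL import Image

    def load_images_as_batch(image_paths, size=512, device="cuda"):
        tensors = []
        for image_path in image_paths:
            image = Image.open(image_path).convert("RGB").resize((size, size))
            # HWC uint8 -> CHW float, plus a batch dim: [1, 3, 512, 512].
            chw = np.array(image).transpose(2, 0, 1)
            tensors.append(torch.tensor(chw).unsqueeze(0).float())
        # Stack into [N, 3, 512, 512] and normalize pixel values to [0, 1].
        return torch.cat(tensors, dim=0).to(device) / 255.0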
adaface/adaface_wrapper.py
CHANGED
@@ -8,22 +8,29 @@ from diffusers import (
|
|
8 |
StableDiffusion3Pipeline,
|
9 |
#FluxPipeline,
|
10 |
DDIMScheduler,
|
|
|
|
|
11 |
AutoencoderKL,
|
|
|
12 |
)
|
13 |
from diffusers.loaders.single_file_utils import convert_ldm_unet_checkpoint
|
14 |
from adaface.util import UNetEnsemble
|
15 |
from adaface.face_id_to_ada_prompt import create_id2ada_prompt_encoder
|
|
|
16 |
from safetensors.torch import load_file as safetensors_load_file
|
17 |
import re, os
|
18 |
import numpy as np
|
|
|
19 |
|
20 |
class AdaFaceWrapper(nn.Module):
|
21 |
def __init__(self, pipeline_name, base_model_path, adaface_encoder_types,
|
22 |
adaface_ckpt_paths, adaface_encoder_cfg_scales=None,
|
23 |
-
enabled_encoders=None,
|
24 |
-
subject_string='z',
|
25 |
use_840k_vae=False, use_ds_text_encoder=False,
|
26 |
-
main_unet_filepath=None, unet_types=None, extra_unet_dirpaths=None,
|
|
|
|
|
27 |
device='cuda', is_training=False):
|
28 |
'''
|
29 |
pipeline_name: "text2img", "text2imgxl", "img2img", "text2img3", "flux", or None.
|
@@ -38,15 +45,23 @@ class AdaFaceWrapper(nn.Module):
|
|
38 |
self.adaface_ckpt_paths = adaface_ckpt_paths
|
39 |
self.adaface_encoder_cfg_scales = adaface_encoder_cfg_scales
|
40 |
self.enabled_encoders = enabled_encoders
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
self.subject_string = subject_string
|
|
|
42 |
|
43 |
-
self.
|
|
|
44 |
self.use_840k_vae = use_840k_vae
|
45 |
self.use_ds_text_encoder = use_ds_text_encoder
|
46 |
self.main_unet_filepath = main_unet_filepath
|
47 |
self.unet_types = unet_types
|
48 |
self.extra_unet_dirpaths = extra_unet_dirpaths
|
49 |
-
self.
|
50 |
self.device = device
|
51 |
self.is_training = is_training
|
52 |
|
@@ -62,7 +77,14 @@ class AdaFaceWrapper(nn.Module):
|
|
62 |
self.initialize_pipeline()
|
63 |
# During inference, we never use static image suffix embeddings.
|
64 |
# So num_id_vecs is the length of the returned adaface embeddings for each encoder.
|
65 |
-
self.encoders_num_id_vecs = self.id2ada_prompt_encoder.encoders_num_id_vecs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
self.extend_tokenizer_and_text_encoder()
|
67 |
|
68 |
def to(self, device):
|
@@ -76,7 +98,8 @@ class AdaFaceWrapper(nn.Module):
|
|
76 |
self.id2ada_prompt_encoder = create_id2ada_prompt_encoder(self.adaface_encoder_types,
|
77 |
self.adaface_ckpt_paths,
|
78 |
self.adaface_encoder_cfg_scales,
|
79 |
-
self.enabled_encoders
|
|
|
80 |
|
81 |
self.id2ada_prompt_encoder.to(self.device)
|
82 |
print(f"adaface_encoder_cfg_scales: {self.adaface_encoder_cfg_scales}")
|
@@ -118,10 +141,10 @@ class AdaFaceWrapper(nn.Module):
|
|
118 |
|
119 |
if self.base_model_path is None:
|
120 |
base_model_path_dict = {
|
121 |
-
'text2img':
|
122 |
-
'text2imgxl':
|
123 |
-
'text2img3':
|
124 |
-
'flux':
|
125 |
}
|
126 |
self.base_model_path = base_model_path_dict[self.pipeline_name]
|
127 |
|
@@ -137,6 +160,20 @@ class AdaFaceWrapper(nn.Module):
|
|
137 |
safety_checker=None
|
138 |
)
|
139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
if self.main_unet_filepath is not None:
|
141 |
print(f"Replacing the UNet with the UNet from {self.main_unet_filepath}.")
|
142 |
ret = pipeline.unet.load_state_dict(self.load_unet_from_file(self.main_unet_filepath, device='cpu'))
|
@@ -147,12 +184,19 @@ class AdaFaceWrapper(nn.Module):
|
|
147 |
|
148 |
if (self.unet_types is not None and len(self.unet_types) > 0) \
|
149 |
or (self.extra_unet_dirpaths is not None and len(self.extra_unet_dirpaths) > 0):
|
150 |
-
unet_ensemble = UNetEnsemble([pipeline.unet], self.unet_types, self.extra_unet_dirpaths, self.
|
151 |
device=self.device, torch_dtype=torch.float16)
|
152 |
pipeline.unet = unet_ensemble
|
153 |
|
154 |
print(f"Loaded pipeline from {self.base_model_path}.")
|
155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
if self.use_840k_vae:
|
157 |
pipeline.vae = vae
|
158 |
print("Replaced the VAE with the 840k-step VAE.")
|
@@ -167,19 +211,56 @@ class AdaFaceWrapper(nn.Module):
|
|
167 |
pipeline.vae = None
|
168 |
print("Removed UNet and VAE from the pipeline.")
|
169 |
|
170 |
-
if self.pipeline_name not in ["text2imgxl", "text2img3", "flux"]:
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
pipeline.scheduler = noise_scheduler
|
180 |
-
# Otherwise, pipeline.scheduler == FlowMatchEulerDiscreteScheduler
|
|
|
181 |
self.pipeline = pipeline.to(self.device)
|
182 |
|
|
|
|
|
|
|
|
|
183 |
def load_unet_from_file(self, unet_path, device=None):
|
184 |
if os.path.isfile(unet_path):
|
185 |
if unet_path.endswith(".safetensors"):
|
@@ -208,7 +289,109 @@ class AdaFaceWrapper(nn.Module):
|
|
208 |
else:
|
209 |
raise ValueError(f"UNet path {unet_path} is not a file.")
|
210 |
return unet_state_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
def extend_tokenizer_and_text_encoder(self):
|
213 |
if np.sum(self.encoders_num_id_vecs) < 1:
|
214 |
raise ValueError(f"encoders_num_id_vecs has to be larger or equal to 1, but is {self.encoders_num_id_vecs}")
|
@@ -218,6 +401,7 @@ class AdaFaceWrapper(nn.Module):
|
|
218 |
# We add z_0_0, z_0_1, z_0_2, ..., z_0_15, z_1_0, z_1_1, z_1_2, z_1_3 to the tokenizer.
|
219 |
self.all_placeholder_tokens = []
|
220 |
self.placeholder_tokens_strs = []
|
|
|
221 |
for i in range(len(self.adaface_encoder_types)):
|
222 |
placeholder_tokens = []
|
223 |
for j in range(self.encoders_num_id_vecs[i]):
|
@@ -225,9 +409,11 @@ class AdaFaceWrapper(nn.Module):
|
|
225 |
placeholder_tokens_str = " ".join(placeholder_tokens)
|
226 |
|
227 |
self.all_placeholder_tokens.extend(placeholder_tokens)
|
|
|
228 |
self.placeholder_tokens_strs.append(placeholder_tokens_str)
|
229 |
|
230 |
self.all_placeholder_tokens_str = " ".join(self.placeholder_tokens_strs)
|
|
|
231 |
# all_null_placeholder_tokens_str: ", , , , ..." (20 times).
|
232 |
# It just contains the commas and spaces with the same length, but no actual tokens.
|
233 |
self.all_null_placeholder_tokens_str = " ".join([", "] * len(self.all_placeholder_tokens))
|
@@ -241,7 +427,7 @@ class AdaFaceWrapper(nn.Module):
|
|
241 |
|
242 |
print(f"Added {num_added_tokens} tokens ({self.all_placeholder_tokens_str}) to the tokenizer.")
|
243 |
|
244 |
-
# placeholder_token_ids: [49408, ...,
|
245 |
self.placeholder_token_ids = tokenizer.convert_tokens_to_ids(self.all_placeholder_tokens)
|
246 |
#print("New tokens:", self.placeholder_token_ids)
|
247 |
# Resize the token embeddings as we are adding new special tokens to the tokenizer
|
@@ -252,24 +438,49 @@ class AdaFaceWrapper(nn.Module):
|
|
252 |
|
253 |
# Extend pipeline.text_encoder with the adaface subject emeddings.
|
254 |
# subj_embs: [16, 768].
|
255 |
-
def update_text_encoder_subj_embeddings(self, subj_embs):
|
256 |
# Initialise the newly added placeholder token with the embeddings of the initializer token
|
257 |
# token_embeds: [49412, 768]
|
258 |
token_embeds = self.pipeline.text_encoder.get_input_embeddings().weight.data
|
|
|
|
|
|
|
|
|
259 |
with torch.no_grad():
|
260 |
-
|
261 |
-
|
262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
|
264 |
def update_prompt(self, prompt, placeholder_tokens_pos='append',
|
|
|
265 |
use_null_placeholders=False):
|
266 |
if prompt is None:
|
267 |
prompt = ""
|
268 |
|
269 |
if use_null_placeholders:
|
270 |
all_placeholder_tokens_str = self.all_null_placeholder_tokens_str
|
|
|
|
|
|
|
271 |
else:
|
272 |
-
all_placeholder_tokens_str = self.
|
273 |
|
274 |
# Delete the subject_string from the prompt.
|
275 |
prompt = re.sub(r'\b(a|an|the)\s+' + self.subject_string + r'\b,?', "", prompt)
|
@@ -279,15 +490,29 @@ class AdaFaceWrapper(nn.Module):
|
|
279 |
# When we do joint training, seems both work better if they are appended to the prompt.
|
280 |
# Therefore we simply appended all placeholder_tokens_str's to the prompt.
|
281 |
# NOTE: Prepending them hurts compositional prompts.
|
282 |
-
if
|
283 |
-
|
284 |
-
|
285 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
286 |
else:
|
287 |
-
|
|
|
|
|
|
|
|
|
|
|
288 |
|
289 |
return prompt
|
290 |
|
|
|
|
|
291 |
# If face_id_embs is None, then it extracts face_id_embs from the images,
|
292 |
# then map them to ada prompt embeddings.
|
293 |
# avg_at_stage: 'id_emb', 'img_prompt_emb', or None.
|
@@ -298,27 +523,29 @@ class AdaFaceWrapper(nn.Module):
|
|
298 |
perturb_at_stage=None, # id_emb, img_prompt_emb, or None.
|
299 |
perturb_std=0, update_text_encoder=True):
|
300 |
|
301 |
-
all_adaface_subj_embs = \
|
302 |
self.id2ada_prompt_encoder.generate_adaface_embeddings(\
|
303 |
image_paths, face_id_embs=face_id_embs,
|
304 |
img_prompt_embs=None,
|
305 |
avg_at_stage=avg_at_stage,
|
306 |
perturb_at_stage=perturb_at_stage,
|
307 |
perturb_std=perturb_std,
|
308 |
-
enable_static_img_suffix_embs=
|
309 |
|
310 |
if all_adaface_subj_embs is None:
|
311 |
return None
|
312 |
|
|
|
|
|
313 |
if all_adaface_subj_embs.ndim == 4:
|
314 |
-
# [1, 1,
|
315 |
all_adaface_subj_embs = all_adaface_subj_embs.squeeze(0).squeeze(0)
|
316 |
elif all_adaface_subj_embs.ndim == 3:
|
317 |
-
# [1,
|
318 |
all_adaface_subj_embs = all_adaface_subj_embs.squeeze(0)
|
319 |
|
320 |
if update_text_encoder:
|
321 |
-
self.update_text_encoder_subj_embeddings(all_adaface_subj_embs)
|
322 |
return all_adaface_subj_embs
|
323 |
|
324 |
def diffusers_encode_prompts(self, prompt, plain_prompt, negative_prompt, device):
|
@@ -368,6 +595,7 @@ class AdaFaceWrapper(nn.Module):
|
|
368 |
else:
|
369 |
breakpoint()
|
370 |
else:
|
|
|
371 |
# prompt_embeds_, negative_prompt_embeds_: [1, 77, 768]
|
372 |
prompt_embeds_, negative_prompt_embeds_ = \
|
373 |
self.pipeline.encode_prompt(prompt, device=device,
|
@@ -378,9 +606,53 @@ class AdaFaceWrapper(nn.Module):
|
|
378 |
return prompt_embeds_, negative_prompt_embeds_, \
|
379 |
pooled_prompt_embeds_, negative_pooled_prompt_embeds_
|
380 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
381 |
def encode_prompt(self, prompt, negative_prompt=None,
|
382 |
placeholder_tokens_pos='append',
|
383 |
-
|
|
|
|
|
|
|
|
|
384 |
device=None, verbose=False):
|
385 |
if negative_prompt is None:
|
386 |
negative_prompt = self.negative_prompt
|
@@ -389,59 +661,81 @@ class AdaFaceWrapper(nn.Module):
|
|
389 |
device = self.device
|
390 |
|
391 |
plain_prompt = prompt
|
392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
393 |
if verbose:
|
394 |
print(f"Subject prompt:\n{prompt}")
|
395 |
|
396 |
-
if do_neg_id_prompt_weight > 0:
|
397 |
-
# Use 'prepend' for the negative prompt, since it's long and we want to make sure
|
398 |
-
# the placeholder tokens are not cut off.
|
399 |
-
negative_prompt0 = negative_prompt
|
400 |
-
negative_prompt = self.update_prompt(negative_prompt0, placeholder_tokens_pos='prepend')
|
401 |
-
null_negative_prompt = self.update_prompt(negative_prompt0, placeholder_tokens_pos='prepend',
|
402 |
-
use_null_placeholders=True)
|
403 |
-
''' if verbose:
|
404 |
-
print(f"Negative prompt:\n{negative_prompt}")
|
405 |
-
print(f"Null negative prompt:\n{null_negative_prompt}")
|
406 |
-
|
407 |
-
'''
|
408 |
-
else:
|
409 |
-
null_negative_prompt = None
|
410 |
-
|
411 |
# For some unknown reason, the text_encoder is still on CPU after self.pipeline.to(self.device).
|
412 |
# So we manually move it to GPU here.
|
413 |
self.pipeline.text_encoder.to(device)
|
414 |
|
415 |
prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = \
|
416 |
self.diffusers_encode_prompts(prompt, plain_prompt, negative_prompt, device)
|
417 |
-
|
418 |
-
if
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
424 |
return prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_
|
425 |
|
426 |
# ref_img_strength is used only in the img2img pipeline.
|
427 |
-
def forward(self, noise, prompt, negative_prompt=None,
|
428 |
placeholder_tokens_pos='append',
|
429 |
-
do_neg_id_prompt_weight=0,
|
430 |
guidance_scale=6.0, out_image_count=4,
|
431 |
-
ref_img_strength=0.8, generator=None,
|
|
|
|
|
|
|
|
|
|
|
|
|
432 |
noise = noise.to(device=self.device, dtype=torch.float16)
|
|
|
|
|
433 |
|
434 |
if negative_prompt is None:
|
435 |
negative_prompt = self.negative_prompt
|
436 |
# prompt_embeds_, negative_prompt_embeds_: [1, 77, 768]
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
443 |
# Repeat the prompt embeddings for all images in the batch.
|
444 |
prompt_embeds_ = prompt_embeds_.repeat(out_image_count, 1, 1)
|
|
|
445 |
if negative_prompt_embeds_ is not None:
|
446 |
negative_prompt_embeds_ = negative_prompt_embeds_.repeat(out_image_count, 1, 1)
|
447 |
|
|
|
8 |
StableDiffusion3Pipeline,
|
9 |
#FluxPipeline,
|
10 |
DDIMScheduler,
|
11 |
+
PNDMScheduler,
|
12 |
+
DPMSolverSinglestepScheduler,
|
13 |
AutoencoderKL,
|
14 |
+
LCMScheduler,
|
15 |
)
|
16 |
from diffusers.loaders.single_file_utils import convert_ldm_unet_checkpoint
|
17 |
from adaface.util import UNetEnsemble
|
18 |
from adaface.face_id_to_ada_prompt import create_id2ada_prompt_encoder
|
19 |
+
from adaface.diffusers_attn_lora_capture import set_up_attn_processors, set_up_ffn_loras, set_lora_and_capture_flags
|
20 |
from safetensors.torch import load_file as safetensors_load_file
|
21 |
import re, os
|
22 |
import numpy as np
|
23 |
+
from peft.utils.constants import DUMMY_TARGET_MODULES
|
24 |
|
25 |
class AdaFaceWrapper(nn.Module):
|
26 |
def __init__(self, pipeline_name, base_model_path, adaface_encoder_types,
|
27 |
adaface_ckpt_paths, adaface_encoder_cfg_scales=None,
|
28 |
+
enabled_encoders=None, use_lcm=False, default_scheduler_name='ddim',
|
29 |
+
num_inference_steps=50, subject_string='z', negative_prompt=None,
|
30 |
use_840k_vae=False, use_ds_text_encoder=False,
|
31 |
+
main_unet_filepath=None, unet_types=None, extra_unet_dirpaths=None, unet_weights_in_ensemble=None,
|
32 |
+
enable_static_img_suffix_embs=None, unet_uses_attn_lora=False,
|
33 |
+
attn_lora_layer_names=['q', 'k', 'v', 'out'], shrink_cross_attn=False, q_lora_updates_query=False,
|
34 |
device='cuda', is_training=False):
|
35 |
'''
|
36 |
pipeline_name: "text2img", "text2imgxl", "img2img", "text2img3", "flux", or None.
|
|
|
45 |
self.adaface_ckpt_paths = adaface_ckpt_paths
|
46 |
self.adaface_encoder_cfg_scales = adaface_encoder_cfg_scales
|
47 |
self.enabled_encoders = enabled_encoders
|
48 |
+
# None, or a list of two bools for two encoders. If None, both are disabled.
|
49 |
+
self.enable_static_img_suffix_embs = enable_static_img_suffix_embs
|
50 |
+
self.unet_uses_attn_lora = unet_uses_attn_lora
|
51 |
+
self.attn_lora_layer_names = attn_lora_layer_names
|
52 |
+
self.q_lora_updates_query = q_lora_updates_query
|
53 |
+
self.use_lcm = use_lcm
|
54 |
self.subject_string = subject_string
|
55 |
+
self.shrink_cross_attn = shrink_cross_attn
|
56 |
|
57 |
+
self.default_scheduler_name = default_scheduler_name
|
58 |
+
self.num_inference_steps = num_inference_steps if not use_lcm else 4
|
59 |
self.use_840k_vae = use_840k_vae
|
60 |
self.use_ds_text_encoder = use_ds_text_encoder
|
61 |
self.main_unet_filepath = main_unet_filepath
|
62 |
self.unet_types = unet_types
|
63 |
self.extra_unet_dirpaths = extra_unet_dirpaths
|
64 |
+
self.unet_weights_in_ensemble = unet_weights_in_ensemble
|
65 |
self.device = device
|
66 |
self.is_training = is_training
|
67 |
|
|
|
77 |
self.initialize_pipeline()
|
78 |
# During inference, we never use static image suffix embeddings.
|
79 |
# So num_id_vecs is the length of the returned adaface embeddings for each encoder.
|
80 |
+
self.encoders_num_id_vecs = np.array(self.id2ada_prompt_encoder.encoders_num_id_vecs)
|
81 |
+
self.encoders_num_static_img_suffix_embs = np.array(self.id2ada_prompt_encoder.encoders_num_static_img_suffix_embs)
|
82 |
+
if self.enable_static_img_suffix_embs is not None:
|
83 |
+
assert len(self.enable_static_img_suffix_embs) == len(self.encoders_num_id_vecs)
|
84 |
+
self.encoders_num_static_img_suffix_embs *= np.array(self.enable_static_img_suffix_embs)
|
85 |
+
self.encoders_num_id_vecs += self.encoders_num_static_img_suffix_embs
|
86 |
+
|
87 |
+
self.img_prompt_embs = None
|
88 |
self.extend_tokenizer_and_text_encoder()
|
89 |
|
90 |
def to(self, device):
|
|
|
98 |
self.id2ada_prompt_encoder = create_id2ada_prompt_encoder(self.adaface_encoder_types,
|
99 |
self.adaface_ckpt_paths,
|
100 |
self.adaface_encoder_cfg_scales,
|
101 |
+
self.enabled_encoders,
|
102 |
+
num_static_img_suffix_embs=4)
|
103 |
|
104 |
self.id2ada_prompt_encoder.to(self.device)
|
105 |
print(f"adaface_encoder_cfg_scales: {self.adaface_encoder_cfg_scales}")
|
|
|
141 |
|
142 |
if self.base_model_path is None:
|
143 |
base_model_path_dict = {
|
144 |
+
'text2img': 'models/sd15-dste8-vae.safetensors',
|
145 |
+
'text2imgxl': 'stabilityai/stable-diffusion-xl-base-1.0',
|
146 |
+
'text2img3': 'stabilityai/stable-diffusion-3-medium-diffusers',
|
147 |
+
'flux': 'black-forest-labs/FLUX.1-schnell',
|
148 |
}
|
149 |
self.base_model_path = base_model_path_dict[self.pipeline_name]
|
150 |
|
|
|
160 |
safety_checker=None
|
161 |
)
|
162 |
|
163 |
+
if self.use_lcm:
|
164 |
+
lcm_path_dict = {
|
165 |
+
'text2img': 'latent-consistency/lcm-lora-sdv1-5',
|
166 |
+
'text2imgxl': 'latent-consistency/lcm-lora-sdxl',
|
167 |
+
}
|
168 |
+
if self.pipeline_name not in lcm_path_dict:
|
169 |
+
raise ValueError(f"Pipeline {self.pipeline_name} does not support LCM.")
|
170 |
+
|
171 |
+
lcm_path = lcm_path_dict[self.pipeline_name]
|
172 |
+
pipeline.load_lora_weights(lcm_path)
|
173 |
+
pipeline.fuse_lora()
|
174 |
+
print(f"Loaded LCM weights from {lcm_path}.")
|
175 |
+
pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config)
|
176 |
+
|
177 |
if self.main_unet_filepath is not None:
|
178 |
print(f"Replacing the UNet with the UNet from {self.main_unet_filepath}.")
|
179 |
ret = pipeline.unet.load_state_dict(self.load_unet_from_file(self.main_unet_filepath, device='cpu'))
|
|
|
184 |
|
185 |
if (self.unet_types is not None and len(self.unet_types) > 0) \
|
186 |
or (self.extra_unet_dirpaths is not None and len(self.extra_unet_dirpaths) > 0):
|
187 |
+
unet_ensemble = UNetEnsemble([pipeline.unet], self.unet_types, self.extra_unet_dirpaths, self.unet_weights_in_ensemble,
|
188 |
device=self.device, torch_dtype=torch.float16)
|
189 |
pipeline.unet = unet_ensemble
|
190 |
|
191 |
print(f"Loaded pipeline from {self.base_model_path}.")
|
192 |
+
if not remove_unet and (self.unet_uses_attn_lora or self.shrink_cross_attn):
|
193 |
+
unet2 = self.load_unet_lora_weights(pipeline.unet, use_attn_lora=self.unet_uses_attn_lora,
|
194 |
+
attn_lora_layer_names=self.attn_lora_layer_names,
|
195 |
+
shrink_cross_attn=self.shrink_cross_attn,
|
196 |
+
q_lora_updates_query=self.q_lora_updates_query)
|
197 |
+
|
198 |
+
pipeline.unet = unet2
|
199 |
+
|
200 |
if self.use_840k_vae:
|
201 |
pipeline.vae = vae
|
202 |
print("Replaced the VAE with the 840k-step VAE.")
|
|
|
211 |
pipeline.vae = None
|
212 |
print("Removed UNet and VAE from the pipeline.")
|
213 |
|
214 |
+
if self.pipeline_name not in ["text2imgxl", "text2img3", "flux"] and not self.use_lcm:
|
215 |
+
if self.default_scheduler_name == 'ddim':
|
216 |
+
noise_scheduler = DDIMScheduler(
|
217 |
+
num_train_timesteps=1000,
|
218 |
+
beta_start=0.00085,
|
219 |
+
beta_end=0.012,
|
220 |
+
beta_schedule="scaled_linear",
|
221 |
+
clip_sample=False,
|
222 |
+
set_alpha_to_one=False,
|
223 |
+
steps_offset=1,
|
224 |
+
timestep_spacing="leading",
|
225 |
+
rescale_betas_zero_snr=False,
|
226 |
+
)
|
227 |
+
elif self.default_scheduler_name == 'pndm':
|
228 |
+
noise_scheduler = PNDMScheduler(
|
229 |
+
num_train_timesteps=1000,
|
230 |
+
beta_start=0.00085,
|
231 |
+
beta_end=0.012,
|
232 |
+
beta_schedule="scaled_linear",
|
233 |
+
set_alpha_to_one=False,
|
234 |
+
steps_offset=1,
|
235 |
+
timestep_spacing="leading",
|
236 |
+
skip_prk_steps=True,
|
237 |
+
)
|
238 |
+
elif self.default_scheduler_name == 'dpm++':
|
239 |
+
noise_scheduler = DPMSolverSinglestepScheduler(
|
240 |
+
beta_start=0.00085,
|
241 |
+
beta_end=0.012,
|
242 |
+
beta_schedule="scaled_linear",
|
243 |
+
prediction_type="epsilon",
|
244 |
+
num_train_timesteps=1000,
|
245 |
+
trained_betas=None,
|
246 |
+
thresholding=False,
|
247 |
+
algorithm_type="dpmsolver++",
|
248 |
+
solver_type="midpoint",
|
249 |
+
lower_order_final=True,
|
250 |
+
use_karras_sigmas=True,
|
251 |
+
)
|
252 |
+
else:
|
253 |
+
breakpoint()
|
254 |
+
|
255 |
pipeline.scheduler = noise_scheduler
|
256 |
+
# Otherwise, if not use_lcm, pipeline.scheduler == FlowMatchEulerDiscreteScheduler
|
257 |
+
# if use_lcm, pipeline.scheduler == LCMScheduler
|
258 |
self.pipeline = pipeline.to(self.device)
|
259 |
|
260 |
+
def set_adaface_encoder_cfg_scales(self, adaface_encoder_cfg_scales):
|
261 |
+
self.adaface_encoder_cfg_scales = adaface_encoder_cfg_scales
|
262 |
+
self.id2ada_prompt_encoder.set_out_id_embs_cfg_scale(adaface_encoder_cfg_scales)
|
263 |
+
|
264 |
def load_unet_from_file(self, unet_path, device=None):
|
265 |
if os.path.isfile(unet_path):
|
266 |
if unet_path.endswith(".safetensors"):
|
|
|
289 |
else:
|
290 |
raise ValueError(f"UNet path {unet_path} is not a file.")
|
291 |
return unet_state_dict
|
292 |
+
|
293 |
+
# Adapted from ConsistentIDPipeline:set_ip_adapter().
|
294 |
+
def load_unet_loras(self, unet, unet_lora_modules_state_dict,
|
295 |
+
use_attn_lora=True, use_ffn_lora=False,
|
296 |
+
attn_lora_layer_names=['q', 'k', 'v', 'out'],
|
297 |
+
shrink_cross_attn=False, cross_attn_shrink_factor=0.5,
|
298 |
+
q_lora_updates_query=False):
|
299 |
+
attn_capture_procs, attn_opt_modules = \
|
300 |
+
set_up_attn_processors(unet, use_attn_lora=True, attn_lora_layer_names=attn_lora_layer_names,
|
301 |
+
lora_rank=192, lora_scale_down=8,
|
302 |
+
cross_attn_shrink_factor=cross_attn_shrink_factor,
|
303 |
+
q_lora_updates_query=q_lora_updates_query)
|
304 |
+
# up_blocks.3.resnets.[1~2].conv1, conv2, conv_shortcut. [12] matches 1 or 2.
|
305 |
+
if use_ffn_lora:
|
306 |
+
target_modules_pat = 'up_blocks.3.resnets.[12].conv[a-z0-9_]+'
|
307 |
+
else:
|
308 |
+
# A special pattern, "dummy-target-modules" tells PEFT to add loras on NONE of the layers.
|
309 |
+
# We couldn't simply skip PEFT initialization (converting unet to a PEFT model),
|
310 |
+
# otherwise the attn lora layers will cause nan quickly during a fp16 training.
|
311 |
+
target_modules_pat = DUMMY_TARGET_MODULES
|
312 |
+
|
313 |
+
unet, ffn_lora_layers, ffn_opt_modules = \
|
314 |
+
set_up_ffn_loras(unet, target_modules_pat=target_modules_pat, lora_uses_dora=True)
|
315 |
+
|
316 |
+
# self.attn_capture_procs and ffn_lora_layers will be used in set_lora_and_capture_flags().
|
317 |
+
self.attn_capture_procs = list(attn_capture_procs.values())
|
318 |
+
self.ffn_lora_layers = list(ffn_lora_layers.values())
|
319 |
+
# Combine attn_opt_modules and ffn_opt_modules into unet_lora_modules.
|
320 |
+
# unet_lora_modules is for optimization and loading/saving.
|
321 |
+
unet_lora_modules = {}
|
322 |
+
# attn_opt_modules and ffn_opt_modules have different depths of keys.
|
323 |
+
# attn_opt_modules:
|
324 |
+
# up_blocks_3_attentions_1_transformer_blocks_0_attn2_processor_std_shrink_factor,
|
325 |
+
# up_blocks_3_attentions_1_transformer_blocks_0_attn2_processor_to_q_lora_lora_A, ...
|
326 |
+
# ffn_opt_modules:
|
327 |
+
# base_model_model_up_blocks_3_resnets_1_conv1_lora_A, ...
|
328 |
+
# with the prefix 'base_model_model_'. Because ffn_opt_modules are extracted from the peft-wrapped model,
|
329 |
+
# and attn_opt_modules are extracted from the original unet model.
|
330 |
+
# To be compatible with old param keys, we append 'base_model_model_' to the keys of attn_opt_modules.
|
331 |
+
unet_lora_modules.update({ f'base_model_model_{k}': v for k, v in attn_opt_modules.items() })
|
332 |
+
unet_lora_modules.update(ffn_opt_modules)
|
333 |
+
# ParameterDict can contain both Parameter and nn.Module.
|
334 |
+
# TODO: maybe in the future, we couldn't put nn.Module in nn.ParameterDict.
|
335 |
+
self.unet_lora_modules = torch.nn.ParameterDict(unet_lora_modules)
|
336 |
+
|
337 |
+
missing, unexpected = self.unet_lora_modules.load_state_dict(unet_lora_modules_state_dict, strict=False)
|
338 |
+
if len(missing) > 0:
|
339 |
+
print(f"Missing Keys: {missing}")
|
340 |
+
if len(unexpected) > 0:
|
341 |
+
print(f"Unexpected Keys: {unexpected}")
|
342 |
+
|
343 |
+
print(f"Loaded {len(unet_lora_modules_state_dict)} LoRA weights on the UNet:\n{unet_lora_modules.keys()}")
|
344 |
+
self.outfeat_capture_blocks.append(unet.up_blocks[3])
|
345 |
+
|
346 |
+
# If shrink_cross_attn is True and use_attn_lora is False, we load all these params from ckpt,
|
347 |
+
# but since we set use_attn_lora to False, attn loras won't be used during inference nonetheless.
|
348 |
+
set_lora_and_capture_flags(unet, None, self.attn_capture_procs, self.outfeat_capture_blocks,
|
349 |
+
use_attn_lora, use_ffn_lora, 'recon_loss', capture_ca_activations=False,
|
350 |
+
shrink_cross_attn=shrink_cross_attn)
|
351 |
+
|
352 |
+
return unet
|
353 |
+
|
354 |
+
def load_unet_lora_weights(self, unet, use_attn_lora=True, attn_lora_layer_names=['q', 'k', 'v', 'out'],
|
355 |
+
shrink_cross_attn=False, q_lora_updates_query=False):
|
356 |
+
unet_lora_weight_found = False
|
357 |
+
if isinstance(self.adaface_ckpt_paths, str):
|
358 |
+
adaface_ckpt_paths = [self.adaface_ckpt_paths]
|
359 |
+
else:
|
360 |
+
adaface_ckpt_paths = self.adaface_ckpt_paths
|
361 |
+
|
362 |
+
for adaface_ckpt_path in adaface_ckpt_paths:
|
363 |
+
ckpt_dict = torch.load(adaface_ckpt_path, map_location='cpu')
|
364 |
+
if 'unet_lora_modules' in ckpt_dict:
|
365 |
+
unet_lora_modules_state_dict = ckpt_dict['unet_lora_modules']
|
366 |
+
print(f"{len(unet_lora_modules_state_dict)} LoRA weights found in {adaface_ckpt_path}.")
|
367 |
+
unet_lora_weight_found = True
|
368 |
+
break
|
369 |
+
|
370 |
+
# Since unet lora weights are not found in the adaface ckpt, we give up on loading unet attn processors.
|
371 |
+
if not unet_lora_weight_found:
|
372 |
+
print(f"LoRA weights not found in {self.adaface_ckpt_paths}.")
|
373 |
+
return unet
|
374 |
|
375 |
+
self.outfeat_capture_blocks = []
|
376 |
+
|
377 |
+
if isinstance(unet, UNetEnsemble):
|
378 |
+
for i, unet_ in enumerate(unet.unets):
|
379 |
+
unet_ = self.load_unet_loras(unet_, unet_lora_modules_state_dict,
|
380 +                                          use_attn_lora=use_attn_lora,
381 +                                          attn_lora_layer_names=attn_lora_layer_names,
382 +                                          shrink_cross_attn=shrink_cross_attn,
383 +                                          q_lora_updates_query=q_lora_updates_query)
384 +                 unet.unets[i] = unet_
385 +             print(f"Loaded LoRA processors on UNetEnsemble of {len(unet.unets)} UNets.")
386 +         else:
387 +             unet = self.load_unet_loras(unet, unet_lora_modules_state_dict,
388 +                                         use_attn_lora=use_attn_lora,
389 +                                         attn_lora_layer_names=attn_lora_layer_names,
390 +                                         shrink_cross_attn=shrink_cross_attn,
391 +                                         q_lora_updates_query=q_lora_updates_query)
392 +
393 +         return unet
394 +
395       def extend_tokenizer_and_text_encoder(self):
396           if np.sum(self.encoders_num_id_vecs) < 1:
397               raise ValueError(f"encoders_num_id_vecs has to be larger or equal to 1, but is {self.encoders_num_id_vecs}")
401           # We add z_0_0, z_0_1, z_0_2, ..., z_0_15, z_1_0, z_1_1, z_1_2, z_1_3 to the tokenizer.
402           self.all_placeholder_tokens = []
403           self.placeholder_tokens_strs = []
404 +         self.encoder_placeholder_tokens = []
405           for i in range(len(self.adaface_encoder_types)):
406               placeholder_tokens = []
407               for j in range(self.encoders_num_id_vecs[i]):
409               placeholder_tokens_str = " ".join(placeholder_tokens)
410
411               self.all_placeholder_tokens.extend(placeholder_tokens)
412 +             self.encoder_placeholder_tokens.append(placeholder_tokens)
413               self.placeholder_tokens_strs.append(placeholder_tokens_str)
414
415           self.all_placeholder_tokens_str = " ".join(self.placeholder_tokens_strs)
416 +         self.updated_tokens_str = self.all_placeholder_tokens_str
417           # all_null_placeholder_tokens_str: ", , , , ..." (20 times).
418           # It just contains commas and spaces of the same length, but no actual tokens.
419           self.all_null_placeholder_tokens_str = " ".join([", "] * len(self.all_placeholder_tokens))
427
428           print(f"Added {num_added_tokens} tokens ({self.all_placeholder_tokens_str}) to the tokenizer.")
429
430 +         # placeholder_token_ids: [49408, ..., 49427].
431           self.placeholder_token_ids = tokenizer.convert_tokens_to_ids(self.all_placeholder_tokens)
432           #print("New tokens:", self.placeholder_token_ids)
433           # Resize the token embeddings as we are adding new special tokens to the tokenizer
438
439       # Extend pipeline.text_encoder with the adaface subject embeddings.
440       # subj_embs: [16, 768].
441 +     def update_text_encoder_subj_embeddings(self, subj_embs, lens_subj_emb_segments):
442           # Initialise the newly added placeholder tokens with the embeddings of the initializer token.
443           # token_embeds: [49412, 768]
444           token_embeds = self.pipeline.text_encoder.get_input_embeddings().weight.data
445 +         all_encoders_updated_tokens = []
446 +         all_encoders_updated_token_strs = []
447 +         idx = 0
448 +
449           with torch.no_grad():
450 +             # The sum of lens_subj_emb_segments may be smaller than len(self.placeholder_token_ids),
451 +             # when some static_img_suffix_embs are disabled.
452 +             for i, encoder_type in enumerate(self.adaface_encoder_types):
453 +                 encoder_updated_tokens = []
454 +                 if (self.enabled_encoders is not None) and (encoder_type not in self.enabled_encoders):
455 +                     idx += lens_subj_emb_segments[i]
456 +                     continue
457 +                 for j in range(lens_subj_emb_segments[i]):
458 +                     placeholder_token = f"{self.subject_string}_{i}_{j}"
459 +                     token_id = self.pipeline.tokenizer.convert_tokens_to_ids(placeholder_token)
460 +                     token_embeds[token_id] = subj_embs[idx]
461 +                     encoder_updated_tokens.append(placeholder_token)
462 +                     idx += 1
463 +
464 +                 all_encoders_updated_tokens.extend(encoder_updated_tokens)
465 +                 all_encoders_updated_token_strs.append(" ".join(encoder_updated_tokens))
466 +
467 +         self.updated_tokens_str = " ".join(all_encoders_updated_token_strs)
468 +         self.all_encoders_updated_token_strs = all_encoders_updated_token_strs
469 +         print(f"Updated {len(all_encoders_updated_tokens)} tokens ({self.updated_tokens_str}) in the text encoder.")
470
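For reference, a minimal sketch (not part of the commit) of the in-place update that `update_text_encoder_subj_embeddings()` performs on the text encoder's embedding table; `pipeline` and `subj_embs` stand in for the wrapper's attributes, and `"z_0_3"` is an illustrative placeholder token:

```python
import torch

# Sketch only: overwrite one placeholder token's row in the input embedding table.
token_embeds = pipeline.text_encoder.get_input_embeddings().weight.data
token_id = pipeline.tokenizer.convert_tokens_to_ids("z_0_3")   # illustrative token
with torch.no_grad():
    token_embeds[token_id] = subj_embs[3]   # replace the [768] row in place
```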
471       def update_prompt(self, prompt, placeholder_tokens_pos='append',
472 +                       repeat_prompt_for_each_encoder=True,
473                         use_null_placeholders=False):
474           if prompt is None:
475               prompt = ""
476
477           if use_null_placeholders:
478               all_placeholder_tokens_str = self.all_null_placeholder_tokens_str
479 +             if not re.search(r"\b(man|woman|person|child|girl|boy)\b", prompt.lower()):
480 +                 all_placeholder_tokens_str = "person " + all_placeholder_tokens_str
481 +             repeat_prompt_for_each_encoder = False
482           else:
483 +             all_placeholder_tokens_str = self.updated_tokens_str
484
485           # Delete the subject_string from the prompt.
486           prompt = re.sub(r'\b(a|an|the)\s+' + self.subject_string + r'\b,?', "", prompt)
490           # When we do joint training, both encoders seem to work better if their tokens are appended to the prompt.
491           # Therefore we simply append all placeholder_tokens_strs to the prompt.
492           # NOTE: Prepending them hurts compositional prompts.
493 +         if repeat_prompt_for_each_encoder:
494 +             encoder_prompts = []
495 +             for encoder_updated_token_strs in self.all_encoders_updated_token_strs:
496 +                 if placeholder_tokens_pos == 'prepend':
497 +                     encoder_prompt = encoder_updated_token_strs + " " + prompt
498 +                 elif placeholder_tokens_pos == 'append':
499 +                     encoder_prompt = prompt + " " + encoder_updated_token_strs
500 +                 else:
501 +                     breakpoint()
502 +                 encoder_prompts.append(encoder_prompt)
503 +             prompt = ", ".join(encoder_prompts)
504           else:
505 +             if placeholder_tokens_pos == 'prepend':
506 +                 prompt = all_placeholder_tokens_str + " " + prompt
507 +             elif placeholder_tokens_pos == 'append':
508 +                 prompt = prompt + " " + all_placeholder_tokens_str
509 +             else:
510 +                 breakpoint()
511
512           return prompt
513
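For reference, a minimal sketch of the string that `update_prompt()` produces when the prompt is repeated once per encoder; the token names and counts below are illustrative:

```python
prompt = "portrait of a man at the beach"
# One token string per encoder, e.g. 16 arc2face tokens and 4 consistentID tokens.
all_encoders_updated_token_strs = [
    " ".join(f"z_0_{j}" for j in range(16)),
    " ".join(f"z_1_{j}" for j in range(4)),
]
# With placeholder_tokens_pos == 'append' and repeat_prompt_for_each_encoder == True:
print(", ".join(prompt + " " + s for s in all_encoders_updated_token_strs))
# portrait of a man at the beach z_0_0 ... z_0_15, portrait of a man at the beach z_1_0 ... z_1_3
```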
514 +     # NOTE: all_adaface_subj_embs is the input to the CLIP text encoder.
515 +     # ** DO NOT use it as prompt_embeds in the forward() method.
516       # If face_id_embs is None, then it extracts face_id_embs from the images,
517       # and then maps them to ada prompt embeddings.
518       # avg_at_stage: 'id_emb', 'img_prompt_emb', or None.
523                                    perturb_at_stage=None, # id_emb, img_prompt_emb, or None.
524                                    perturb_std=0, update_text_encoder=True):
525
526 +         all_adaface_subj_embs, img_prompt_embs, lens_subj_emb_segments = \
527               self.id2ada_prompt_encoder.generate_adaface_embeddings(\
528                   image_paths, face_id_embs=face_id_embs,
529                   img_prompt_embs=None,
530                   avg_at_stage=avg_at_stage,
531                   perturb_at_stage=perturb_at_stage,
532                   perturb_std=perturb_std,
533 +                 enable_static_img_suffix_embs=self.enable_static_img_suffix_embs)
534
535           if all_adaface_subj_embs is None:
536               return None
537
538 +         self.img_prompt_embs = img_prompt_embs
539 +
540           if all_adaface_subj_embs.ndim == 4:
541 +             # [1, 1, 20, 768] -> [20, 768]
542               all_adaface_subj_embs = all_adaface_subj_embs.squeeze(0).squeeze(0)
543           elif all_adaface_subj_embs.ndim == 3:
544 +             # [1, 20, 768] -> [20, 768]
545               all_adaface_subj_embs = all_adaface_subj_embs.squeeze(0)
546
547           if update_text_encoder:
548 +             self.update_text_encoder_subj_embeddings(all_adaface_subj_embs, lens_subj_emb_segments)
549           return all_adaface_subj_embs
550
551       def diffusers_encode_prompts(self, prompt, plain_prompt, negative_prompt, device):
595               else:
596                   breakpoint()
597           else:
598 +             # "text2img" and "img2img" pipelines.
599               # prompt_embeds_, negative_prompt_embeds_: [1, 77, 768]
600               prompt_embeds_, negative_prompt_embeds_ = \
601                   self.pipeline.encode_prompt(prompt, device=device,
606           return prompt_embeds_, negative_prompt_embeds_, \
607                  pooled_prompt_embeds_, negative_pooled_prompt_embeds_
608
609 +     # alt_prompt_embed_type: 'ada-nonmix', 'img'
610 +     def mix_ada_embs_with_other_embs(self, prompt, prompt_embeds,
611 +                                      alt_prompt_embed_type, alt_prompt_emb_weights):
612 +         # Scan the prompt and replace tokens in self.placeholder_token_ids
613 +         # with the corresponding image embeddings.
614 +         prompt_tokens = self.pipeline.tokenizer.tokenize(prompt)
615 +         prompt_embeds2 = prompt_embeds.clone()
616 +         if alt_prompt_embed_type == 'img':
617 +             if self.img_prompt_embs is None:
618 +                 print("Unable to find img_prompt_embs. Either prepare_adaface_embeddings() hasn't been called, or faceless images were used.")
619 +                 return prompt_embeds
620 +             # self.img_prompt_embs: [1, 20, 768]
621 +             repl_embeddings = self.img_prompt_embs
622 +         elif alt_prompt_embed_type == 'ada-nonmix':
623 +             repl_embeddings_, _, _, _ = self.encode_prompt(prompt, ablate_prompt_only_placeholders=True,
624 +                                                            verbose=True)
625 +             # repl_embeddings_: [1, 77, 768] -> [1, 20, 768]
626 +             repl_embeddings = repl_embeddings_[:, 1:len(self.all_placeholder_tokens)+1]
627 +         else:
628 +             breakpoint()
629 +
630 +         repl_tokens = {}
631 +         for i in range(len(prompt_tokens)):
632 +             if prompt_tokens[i] in self.all_placeholder_tokens:
633 +                 encoder_idx = next((enc_i for enc_i, sublist in enumerate(self.encoder_placeholder_tokens) \
634 +                                     if prompt_tokens[i] in sublist), 0)   # enc_i avoids shadowing the loop index i
635 +                 alt_prompt_emb_weight = alt_prompt_emb_weights[encoder_idx]
636 +                 prompt_embeds2[:, i] = prompt_embeds2[:, i] * (1 - alt_prompt_emb_weight) \
637 +                                        + repl_embeddings[:, self.all_placeholder_tokens.index(prompt_tokens[i])] * alt_prompt_emb_weight
638 +                 repl_tokens[prompt_tokens[i]] = 1
639 +
640 +         repl_token_count = len(repl_tokens)
641 +         if np.all(np.array(alt_prompt_emb_weights) == 1):
642 +             print(f"Replaced {repl_token_count} tokens with {alt_prompt_embed_type} embeddings.")
643 +         else:
644 +             print(f"Mixed {repl_token_count} tokens with {alt_prompt_embed_type} embeddings, weight {alt_prompt_emb_weights}.")
645 +
646 +         return prompt_embeds2
647 +
648 +
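The per-token rule in `mix_ada_embs_with_other_embs()` is a plain convex combination; a standalone sketch with illustrative shapes:

```python
import torch

w = 0.3                                   # alt_prompt_emb_weight for this encoder
e_ada  = torch.randn(768)                 # prompt_embeds2[:, i] for a placeholder token
e_repl = torch.randn(768)                 # matching row of repl_embeddings
e_mixed = e_ada * (1 - w) + e_repl * w    # the blend computed in the loop above
```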
649       def encode_prompt(self, prompt, negative_prompt=None,
650                         placeholder_tokens_pos='append',
651 +                       ablate_prompt_only_placeholders=False,
652 +                       ablate_prompt_no_placeholders=False,
653 +                       ablate_prompt_embed_type='ada', # 'ada', 'ada-nonmix', 'img'
654 +                       nonmix_prompt_emb_weight=0,
655 +                       repeat_prompt_for_each_encoder=True,
656                         device=None, verbose=False):
657           if negative_prompt is None:
658               negative_prompt = self.negative_prompt
661               device = self.device
662
663           plain_prompt = prompt
664 +         if ablate_prompt_only_placeholders:
665 +             prompt = self.updated_tokens_str
666 +         else:
667 +             prompt = self.update_prompt(prompt, placeholder_tokens_pos=placeholder_tokens_pos,
668 +                                         repeat_prompt_for_each_encoder=repeat_prompt_for_each_encoder,
669 +                                         use_null_placeholders=ablate_prompt_no_placeholders)
670 +
671           if verbose:
672               print(f"Subject prompt:\n{prompt}")
673
674           # For some unknown reason, the text_encoder is still on CPU after self.pipeline.to(self.device).
675           # So we manually move it to GPU here.
676           self.pipeline.text_encoder.to(device)
677
678           prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = \
679               self.diffusers_encode_prompts(prompt, plain_prompt, negative_prompt, device)
680 +
681 +         if ablate_prompt_embed_type != 'ada':
682 +             alt_prompt_embed_type = ablate_prompt_embed_type
683 +             alt_prompt_emb_weights = (1, 1)
684 +         elif nonmix_prompt_emb_weight > 0:
685 +             alt_prompt_embed_type = 'ada-nonmix'
686 +             alt_prompt_emb_weights = (nonmix_prompt_emb_weight, nonmix_prompt_emb_weight)
687 +         else:
688 +             alt_prompt_emb_weights = (0, 0)
689 +
690 +         if sum(alt_prompt_emb_weights) > 0:
691 +             prompt_embeds_ = self.mix_ada_embs_with_other_embs(prompt, prompt_embeds_,
692 +                                                                alt_prompt_embed_type, alt_prompt_emb_weights)
693 +
694           return prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, negative_pooled_prompt_embeds_
695
696       # ref_img_strength is used only in the img2img pipeline.
697 +     def forward(self, noise, prompt, prompt_embeds=None, negative_prompt=None,
698                   placeholder_tokens_pos='append',
699                   guidance_scale=6.0, out_image_count=4,
700 +                 ref_img_strength=0.8, generator=None,
701 +                 ablate_prompt_only_placeholders=False,
702 +                 ablate_prompt_no_placeholders=False,
703 +                 ablate_prompt_embed_type='ada', # 'ada', 'ada-nonmix', 'img'
704 +                 nonmix_prompt_emb_weight=0,
705 +                 repeat_prompt_for_each_encoder=True,
706 +                 verbose=False):
707           noise = noise.to(device=self.device, dtype=torch.float16)
708 +         if self.use_lcm:
709 +             guidance_scale = 0
710
711           if negative_prompt is None:
712               negative_prompt = self.negative_prompt
713           # prompt_embeds_, negative_prompt_embeds_: [1, 77, 768]
714 +         if prompt_embeds is None:
715 +             prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, \
716 +                 negative_pooled_prompt_embeds_ = \
717 +                 self.encode_prompt(prompt, negative_prompt,
718 +                                    placeholder_tokens_pos=placeholder_tokens_pos,
719 +                                    ablate_prompt_only_placeholders=ablate_prompt_only_placeholders,
720 +                                    ablate_prompt_no_placeholders=ablate_prompt_no_placeholders,
721 +                                    ablate_prompt_embed_type=ablate_prompt_embed_type,
722 +                                    nonmix_prompt_emb_weight=nonmix_prompt_emb_weight,
723 +                                    repeat_prompt_for_each_encoder=repeat_prompt_for_each_encoder,
724 +                                    device=self.device,
725 +                                    verbose=verbose)
726 +         else:
727 +             if len(prompt_embeds) == 2:
728 +                 prompt_embeds_, negative_prompt_embeds_ = prompt_embeds
729 +                 pooled_prompt_embeds_, negative_pooled_prompt_embeds_ = None, None
730 +             elif len(prompt_embeds) == 4:
731 +                 prompt_embeds_, negative_prompt_embeds_, pooled_prompt_embeds_, \
732 +                     negative_pooled_prompt_embeds_ = prompt_embeds
733 +             else:
734 +                 breakpoint()
735 +
736           # Repeat the prompt embeddings for all images in the batch.
737           prompt_embeds_ = prompt_embeds_.repeat(out_image_count, 1, 1)
738 +
739           if negative_prompt_embeds_ is not None:
740               negative_prompt_embeds_ = negative_prompt_embeds_.repeat(out_image_count, 1, 1)
741
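A usage sketch of the two `prompt_embeds` forms that `forward()` accepts; `wrapper` is assumed to be an initialized AdaFaceWrapper whose subject embeddings have already been prepared:

```python
import torch

noise = torch.randn(4, 4, 64, 64)   # illustrative latent noise for 4 output images
embs4 = wrapper.encode_prompt("portrait of a man at the beach")   # 4-tuple form
images = wrapper(noise, prompt=None, prompt_embeds=embs4, out_image_count=4)
```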
adaface/diffusers_attn_lora_capture.py ADDED @@ -0,0 +1,656 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple, Dict, Any
from diffusers.models.attention_processor import Attention, AttnProcessor2_0
from diffusers.utils import logging, is_torch_version, deprecate
from diffusers.utils.torch_utils import fourier_filter
# UNet is a diffusers PeftAdapterMixin instance.
from diffusers.loaders.peft import PeftAdapterMixin
from peft import LoraConfig, get_peft_model
import peft.tuners.lora as peft_lora
from peft.tuners.lora.dora import DoraLinearLayer
from einops import rearrange
import math, re
import numpy as np
from peft.tuners.tuners_utils import BaseTunerLayer


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

def dummy_func(*args, **kwargs):
    pass

# Revised from RevGrad, by removing the grad negation.
class ScaleGrad(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input_, alpha_, debug=False):
        ctx.save_for_backward(alpha_, debug)
        output = input_
        if debug:
            print(f"input: {input_.abs().mean().item()}")
        return output

    @staticmethod
    def backward(ctx, grad_output):  # pragma: no cover
        # saved_tensors returns a tuple of tensors.
        alpha_, debug = ctx.saved_tensors
        if ctx.needs_input_grad[0]:
            grad_output2 = grad_output * alpha_
            if debug:
                print(f"grad_output2: {grad_output2.abs().mean().item()}")
        else:
            grad_output2 = None
        return grad_output2, None, None

class GradientScaler(nn.Module):
    def __init__(self, alpha=1., debug=False, *args, **kwargs):
        """
        A gradient scaling layer.
        This layer has no parameters, and simply scales the gradient in the backward pass.
        """
        super().__init__(*args, **kwargs)

        self._alpha = torch.tensor(alpha, requires_grad=False)
        self._debug = torch.tensor(debug, requires_grad=False)

    def forward(self, input_):
        _debug = self._debug if hasattr(self, '_debug') else False
        return ScaleGrad.apply(input_, self._alpha.to(input_.device), _debug)

def gen_gradient_scaler(alpha, debug=False):
    if alpha == 1:
        return nn.Identity()
    if alpha > 0:
        return GradientScaler(alpha, debug=debug)
    else:
        assert alpha == 0
        # Don't use a lambda function here, otherwise the object can't be pickled.
        return torch.detach

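A quick sketch of what `gen_gradient_scaler()` returns: identity in the forward pass, gradient multiplied by `alpha` in the backward pass:

```python
import torch

scaler = gen_gradient_scaler(0.2)
x = torch.randn(3, requires_grad=True)
scaler(x).sum().backward()
print(x.grad)   # tensor([0.2000, 0.2000, 0.2000]) instead of all ones
```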
def split_indices_by_instance(indices, as_dict=False):
    indices_B, indices_N = indices
    unique_indices_B = torch.unique(indices_B)
    if not as_dict:
        indices_by_instance = [ (indices_B[indices_B == uib], indices_N[indices_B == uib]) for uib in unique_indices_B ]
    else:
        indices_by_instance = { uib.item(): indices_N[indices_B == uib] for uib in unique_indices_B }
    return indices_by_instance

|
80 |
+
# If do_sum, returned emb_attns is 3D. Otherwise 4D.
|
81 |
+
# indices are applied on the first 2 dims of attn_mat.
|
82 |
+
def sel_emb_attns_by_indices(attn_mat, indices, all_token_weights=None, do_sum=True, do_mean=False):
|
83 |
+
indices_by_instance = split_indices_by_instance(indices)
|
84 |
+
|
85 |
+
# emb_attns[0]: [1, 9, 8, 64]
|
86 |
+
# 8: 8 attention heads. Last dim 64: number of image tokens.
|
87 |
+
emb_attns = [ attn_mat[inst_indices].unsqueeze(0) for inst_indices in indices_by_instance ]
|
88 |
+
if all_token_weights is not None:
|
89 |
+
# all_token_weights: [4, 77].
|
90 |
+
# token_weights_by_instance[0]: [1, 9, 1, 1].
|
91 |
+
token_weights = [ all_token_weights[inst_indices].reshape(1, -1, 1, 1) for inst_indices in indices_by_instance ]
|
92 |
+
else:
|
93 |
+
token_weights = [ 1 ] * len(indices_by_instance)
|
94 |
+
|
95 |
+
# Apply token weights.
|
96 |
+
emb_attns = [ emb_attns[i] * token_weights[i] for i in range(len(indices_by_instance)) ]
|
97 |
+
|
98 |
+
# sum among K_subj_i subj embeddings -> [1, 8, 64]
|
99 |
+
if do_sum:
|
100 |
+
emb_attns = [ emb_attns[i].sum(dim=1) for i in range(len(indices_by_instance)) ]
|
101 |
+
elif do_mean:
|
102 |
+
emb_attns = [ emb_attns[i].mean(dim=1) for i in range(len(indices_by_instance)) ]
|
103 |
+
|
104 |
+
emb_attns = torch.cat(emb_attns, dim=0)
|
105 |
+
return emb_attns
|
106 |
+
|
107 |
+
# Slow implementation equivalent to F.scaled_dot_product_attention.
|
108 |
+
def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0,
|
109 |
+
shrink_cross_attn=False, cross_attn_shrink_factor=0.5,
|
110 |
+
is_causal=False, scale=None, enable_gqa=False) -> torch.Tensor:
|
111 |
+
B, L, S = query.size(0), query.size(-2), key.size(-2)
|
112 |
+
scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
|
113 |
+
# 1: head (to be broadcasted). L: query length. S: key length.
|
114 |
+
attn_bias = torch.zeros(B, 1, L, S, device=query.device, dtype=query.dtype)
|
115 |
+
if is_causal:
|
116 |
+
assert attn_mask is None
|
117 |
+
temp_mask = torch.ones(B, 1, L, S, device=query.device, dtype=torch.bool).tril(diagonal=0)
|
118 |
+
attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
|
119 |
+
attn_bias.to(query.dtype)
|
120 |
+
|
121 |
+
if attn_mask is not None:
|
122 |
+
if attn_mask.dtype == torch.bool:
|
123 |
+
attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
|
124 |
+
else:
|
125 |
+
attn_bias += attn_mask
|
126 |
+
|
127 |
+
if enable_gqa:
|
128 |
+
key = key.repeat_interleave(query.size(-3)//key.size(-3), -3)
|
129 |
+
value = value.repeat_interleave(query.size(-3)//value.size(-3), -3)
|
130 |
+
|
131 |
+
attn_weight = query @ key.transpose(-2, -1) * scale_factor
|
132 |
+
|
133 |
+
if shrink_cross_attn:
|
134 |
+
cross_attn_scale = cross_attn_shrink_factor
|
135 |
+
else:
|
136 |
+
cross_attn_scale = 1
|
137 |
+
|
138 |
+
# attn_bias: [1, 1, 4096, 77], the same size as a single-head attn_weight.
|
139 |
+
attn_weight += attn_bias
|
140 |
+
attn_score = attn_weight
|
141 |
+
attn_weight = torch.softmax(attn_weight, dim=-1)
|
142 |
+
# NOTE: After scaling, the "probabilities" of the subject embeddings will sum to < 1.
|
143 |
+
# But this is intended, as we want to scale down the impact of the subject embeddings
|
144 |
+
# in the computed attention output tensors.
|
145 |
+
attn_weight = attn_weight * cross_attn_scale
|
146 |
+
attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
|
147 |
+
output = attn_weight @ value
|
148 |
+
return output, attn_score, attn_weight
|
149 |
+
|
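With `shrink_cross_attn=False` and no mask, the slow path above should match PyTorch's fused kernel up to numerical tolerance; a quick self-check sketch, assuming the function above is in scope:

```python
import torch
import torch.nn.functional as F

q = torch.randn(2, 8, 16, 40)
k = torch.randn(2, 8, 77, 40)
v = torch.randn(2, 8, 77, 40)
out_slow, _, _ = scaled_dot_product_attention(q, k, v, dropout_p=0.0)
out_fast = F.scaled_dot_product_attention(q, k, v, dropout_p=0.0)
print(torch.allclose(out_slow, out_fast, atol=1e-5))   # True
```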
# All layers share the same attention processor instance.
class AttnProcessor_LoRA_Capture(nn.Module):
    r"""
    Revised from AttnProcessor2_0
    """
    # lora_proj_layers is a dict of lora_layer_name -> lora_proj_layer.
    def __init__(self, capture_ca_activations: bool = False, enable_lora: bool = False,
                 lora_uses_dora=True, lora_proj_layers=None,
                 lora_rank: int = 192, lora_alpha: float = 16,
                 cross_attn_shrink_factor: float = 0.5,
                 q_lora_updates_query=False, attn_proc_idx=-1):
        super().__init__()

        self.global_enable_lora = enable_lora
        self.attn_proc_idx = attn_proc_idx
        # reset_attn_cache_and_flags() sets the local (call-specific) self.enable_lora flag.
        # By default, shrink_cross_attn is False. Later in layers 22, 23, 24 it will be set to True.
        self.reset_attn_cache_and_flags(capture_ca_activations, False, enable_lora)
        self.lora_rank = lora_rank
        self.lora_alpha = lora_alpha
        self.lora_scale = self.lora_alpha / self.lora_rank
        self.cross_attn_shrink_factor = cross_attn_shrink_factor
        self.q_lora_updates_query = q_lora_updates_query

        self.to_q_lora = self.to_k_lora = self.to_v_lora = self.to_out_lora = None
        if self.global_enable_lora:
            for lora_layer_name, lora_proj_layer in lora_proj_layers.items():
                if lora_layer_name == 'q':
                    self.to_q_lora = peft_lora.Linear(lora_proj_layer, 'default', r=lora_rank, lora_alpha=lora_alpha,
                                                      use_dora=lora_uses_dora, lora_dropout=0.1)
                elif lora_layer_name == 'k':
                    self.to_k_lora = peft_lora.Linear(lora_proj_layer, 'default', r=lora_rank, lora_alpha=lora_alpha,
                                                      use_dora=lora_uses_dora, lora_dropout=0.1)
                elif lora_layer_name == 'v':
                    self.to_v_lora = peft_lora.Linear(lora_proj_layer, 'default', r=lora_rank, lora_alpha=lora_alpha,
                                                      use_dora=lora_uses_dora, lora_dropout=0.1)
                elif lora_layer_name == 'out':
                    self.to_out_lora = peft_lora.Linear(lora_proj_layer, 'default', r=lora_rank, lora_alpha=lora_alpha,
                                                        use_dora=lora_uses_dora, lora_dropout=0.1)

    # LoRA layers can be enabled/disabled dynamically.
    def reset_attn_cache_and_flags(self, capture_ca_activations, shrink_cross_attn, enable_lora):
        self.capture_ca_activations = capture_ca_activations
        self.shrink_cross_attn = shrink_cross_attn
        self.cached_activations = {}
        # Only enable LoRA for the next call(s) if global_enable_lora is set to True.
        self.enable_lora = enable_lora and self.global_enable_lora

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        temb: Optional[torch.Tensor] = None,
        img_mask: Optional[torch.Tensor] = None,
        subj_indices: Optional[Tuple[torch.IntTensor, torch.IntTensor]] = None,
        debug: bool = False,
        *args,
        **kwargs,
    ) -> torch.Tensor:
        if len(args) > 0 or kwargs.get("scale", None) is not None:
            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
            deprecate("scale", "1.0.0", deprecation_message)

        # hidden_states: [1, 4096, 320]
        residual = hidden_states
        # attn.spatial_norm is None.
        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            # Collapse the spatial dimensions into a single token dimension.
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)
        # NOTE: there's an inconsistency between the q lora and the k, v loras.
        # k, v loras are directly applied to key and value (currently k, v loras are never enabled),
        # while the q lora is applied to query2, and we keep the query unchanged.
        if self.enable_lora and self.to_q_lora is not None:
            # query2 will be used in ldm/util.py:calc_elastic_matching_loss() to get more accurate
            # cross attention scores between the latent images of the sc and mc instances.
            query2 = self.to_q_lora(hidden_states)
            # If not q_lora_updates_query, only query2 will be impacted by the LoRA layer.
            # The query, and thus the attention score and attn_out, will be the same
            # as the original ones.
            if self.q_lora_updates_query:
                query = query2
        else:
            query2 = query

        scale = 1 / math.sqrt(query.size(-1))

        is_cross_attn = (encoder_hidden_states is not None)
        if (not is_cross_attn) and (img_mask is not None):
            # NOTE: we assume the image is square. This will fail if the image is not square.
            # hidden_states: [BS, 4096, 320]. img_mask: [BS, 1, 64, 64]
            # Scale the mask to the same size as hidden_states.
            mask_size = int(math.sqrt(hidden_states.shape[-2]))
            img_mask = F.interpolate(img_mask, size=(mask_size, mask_size), mode='nearest')
            if (img_mask.sum(dim=(2, 3)) == 0).any():
                img_mask = None
            else:
                # img_mask: [2, 1, 64, 64] -> [2, 4096]
                img_mask = rearrange(img_mask, 'b ... -> b (...)').contiguous()
                # max_neg_value = -torch.finfo(hidden_states.dtype).max
                # img_mask: [2, 4096] -> [2, 1, 1, 4096]
                img_mask = rearrange(img_mask.bool(), 'b j -> b () () j')
                # attn_score: [16, 4096, 4096]. img_mask will be broadcasted to [16, 4096, 4096].
                # So some rows in dim 1 (e.g. [0, :, 4095]) of attn_score will be masked out (all elements in [0, :, 4095] are -inf).
                # But not all elements in [0, 4095, :] are -inf. Since the softmax is done along dim 2, this is fine.
                # attn_score.masked_fill_(~img_mask, max_neg_value)
                # NOTE: If there's an attention mask, it will be replaced by img_mask.
                attention_mask = img_mask

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        if self.enable_lora and self.to_k_lora is not None:
            key = self.to_k_lora(encoder_hidden_states)
        else:
            key = attn.to_k(encoder_hidden_states)

        if self.enable_lora and self.to_v_lora is not None:
            value = self.to_v_lora(encoder_hidden_states)
        else:
            value = attn.to_v(encoder_hidden_states)

        if attn.norm_q is not None:
            query = attn.norm_q(query)
            query2 = attn.norm_q(query2)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        query2 = query2.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        if debug and self.attn_proc_idx >= 0:
            breakpoint()

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        if is_cross_attn and (self.capture_ca_activations or self.shrink_cross_attn):
            hidden_states, attn_score, attn_prob = \
                scaled_dot_product_attention(query, key, value, attn_mask=attention_mask,
                                             dropout_p=0.0, shrink_cross_attn=self.shrink_cross_attn,
                                             cross_attn_shrink_factor=self.cross_attn_shrink_factor)
        else:
            # Use the faster implementation of scaled_dot_product_attention
            # when not capturing the activations or suppressing the subject attention.
            hidden_states = \
                F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False)
            attn_prob = attn_score = None

        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        if self.enable_lora and self.to_out_lora is not None:
            hidden_states = self.to_out_lora(hidden_states)
        else:
            hidden_states = attn.to_out[0](hidden_states)

        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        if is_cross_attn and self.capture_ca_activations:
            # The cached q will be used in ddpm.py:calc_comp_fg_bg_preserve_loss(), in which two qs will multiply each other.
            # So sqrt(scale) will scale the product of the two qs by scale.
            # ANCHOR[id=attention_caching]
            # query: [2, 8, 4096, 40] -> [2, 320, 4096]
            self.cached_activations['q'] = \
                rearrange(query, 'b h n d -> b (h d) n').contiguous() * math.sqrt(scale)
            self.cached_activations['q2'] = \
                rearrange(query2, 'b h n d -> b (h d) n').contiguous() * math.sqrt(scale)
            self.cached_activations['k'] = \
                rearrange(key, 'b h n d -> b (h d) n').contiguous() * math.sqrt(scale)
            self.cached_activations['v'] = \
                rearrange(value, 'b h n d -> b (h d) n').contiguous() * math.sqrt(scale)
            # attn_prob, attn_score: [2, 8, 4096, 77]
            self.cached_activations['attn'] = attn_prob
            self.cached_activations['attnscore'] = attn_score
            # attn_out: [b, n, h * d] -> [b, h * d, n]
            # [2, 4096, 320] -> [2, 320, 4096].
            self.cached_activations['attn_out'] = hidden_states.permute(0, 2, 1).contiguous()

        return hidden_states

def CrossAttnUpBlock2D_forward_capture(
    self,
    hidden_states: torch.Tensor,
    res_hidden_states_tuple: Tuple[torch.Tensor, ...],
    temb: Optional[torch.Tensor] = None,
    encoder_hidden_states: Optional[torch.Tensor] = None,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    upsample_size: Optional[int] = None,
    attention_mask: Optional[torch.Tensor] = None,
    encoder_attention_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    if cross_attention_kwargs is not None:
        if cross_attention_kwargs.get("scale", None) is not None:
            logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")

    self.cached_outfeats = {}
    res_hidden_states_gradscale = getattr(self, "res_hidden_states_gradscale", 1)
    capture_outfeats = getattr(self, "capture_outfeats", False)
    layer_idx = 0
    res_grad_scaler = gen_gradient_scaler(res_hidden_states_gradscale)

    for resnet, attn in zip(self.resnets, self.attentions):
        # pop res hidden states
        res_hidden_states = res_hidden_states_tuple[-1]
        res_hidden_states_tuple = res_hidden_states_tuple[:-1]

        # Scale down the magnitudes of gradients to res_hidden_states
        # by res_hidden_states_gradscale=0.2, to match the scale of the cross-attn layer outputs.
        res_hidden_states = res_grad_scaler(res_hidden_states)

        hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

        if self.training and self.gradient_checkpointing:
            def create_custom_forward(module, return_dict=None):
                def custom_forward(*inputs):
                    if return_dict is not None:
                        return module(*inputs, return_dict=return_dict)
                    else:
                        return module(*inputs)

                return custom_forward

            ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
            hidden_states = torch.utils.checkpoint.checkpoint(
                create_custom_forward(resnet),
                hidden_states,
                temb,
                **ckpt_kwargs,
            )
            hidden_states = attn(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                cross_attention_kwargs=cross_attention_kwargs,
                attention_mask=attention_mask,
                encoder_attention_mask=encoder_attention_mask,
                return_dict=False,
            )[0]
        else:
            # resnet: a ResnetBlock2D instance.
            #LINK diffusers.models.resnet.ResnetBlock2D
            # up_blocks.3.resnets.2.conv_shortcut is a module within ResnetBlock2D,
            # it's not transforming the UNet shortcut features.
            hidden_states = resnet(hidden_states, temb)
            hidden_states = attn(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                cross_attention_kwargs=cross_attention_kwargs,
                attention_mask=attention_mask,
                encoder_attention_mask=encoder_attention_mask,
                return_dict=False,
            )[0]

        if capture_outfeats:
            self.cached_outfeats[layer_idx] = hidden_states
            layer_idx += 1

    if self.upsamplers is not None:
        for upsampler in self.upsamplers:
            hidden_states = upsampler(hidden_states, upsample_size)

    return hidden_states


# Adapted from ConsistentIDPipeline:set_ip_adapter().
# attn_lora_layer_names: candidates are subsets of ['q', 'k', 'v', 'out'].
def set_up_attn_processors(unet, use_attn_lora, attn_lora_layer_names=['q', 'k', 'v', 'out'],
                           lora_rank=192, lora_scale_down=8, cross_attn_shrink_factor=0.5,
                           q_lora_updates_query=False):
    attn_procs = {}
    attn_capture_procs = {}
    unet_modules = dict(unet.named_modules())
    attn_opt_modules = {}

    attn_proc_idx = 0

    for name, attn_proc in unet.attn_processors.items():
        # Only capture the activations of the last 3 CA layers.
        if not name.startswith("up_blocks.3"):
            # Not among the last 3 CA layers. Don't enable LoRA or capture activations.
            # The layer then falls back to the original attention mechanism.
            # We still use AttnProcessor_LoRA_Capture, as it can handle img_mask.
            attn_procs[name] = AttnProcessor_LoRA_Capture(
                capture_ca_activations=False, enable_lora=False, attn_proc_idx=-1)
            continue
        # cross_attention_dim: 768.
        cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
        if cross_attention_dim is None:
            # Self attention. Don't enable LoRA or capture activations.
            # We replace the default attn_proc with AttnProcessor_LoRA_Capture,
            # so that it can incorporate img_mask into self-attention.
            attn_procs[name] = AttnProcessor_LoRA_Capture(
                capture_ca_activations=False, enable_lora=False, attn_proc_idx=-1)
            continue

        # block_id = 3
        # hidden_size: 320
        # hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
        # 'up_blocks.3.attentions.1.transformer_blocks.0.attn2.processor' ->
        # 'up_blocks.3.attentions.1.transformer_blocks.0.attn2.to_q'
        lora_layer_dict = {}
        lora_layer_dict['q'] = unet_modules[name[:-9] + "to_q"]
        lora_layer_dict['k'] = unet_modules[name[:-9] + "to_k"]
        lora_layer_dict['v'] = unet_modules[name[:-9] + "to_v"]
        # to_out is a ModuleList(Linear, Dropout).
        lora_layer_dict['out'] = unet_modules[name[:-9] + "to_out"][0]

        lora_proj_layers = {}
        # Only apply LoRA to the specified layers.
        for lora_layer_name in attn_lora_layer_names:
            lora_proj_layers[lora_layer_name] = lora_layer_dict[lora_layer_name]

        attn_capture_proc = AttnProcessor_LoRA_Capture(
            capture_ca_activations=True, enable_lora=use_attn_lora,
            lora_uses_dora=True, lora_proj_layers=lora_proj_layers,
            # LoRA up is initialized to 0. So no need to worry that the LoRA output may be too large.
            lora_rank=lora_rank, lora_alpha=lora_rank // lora_scale_down,
            cross_attn_shrink_factor=cross_attn_shrink_factor,
            q_lora_updates_query=q_lora_updates_query, attn_proc_idx=attn_proc_idx)

        attn_proc_idx += 1
        # attn_procs has to use the original names.
        attn_procs[name] = attn_capture_proc
        # ModuleDict doesn't allow "." in the key.
        name = name.replace(".", "_")
        attn_capture_procs[name] = attn_capture_proc

        if use_attn_lora:
            for subname, module in attn_capture_proc.named_modules():
                if isinstance(module, peft_lora.LoraLayer):
                    # ModuleDict doesn't allow "." in the key.
                    lora_path = name + "_" + subname.replace(".", "_")
                    attn_opt_modules[lora_path + "_lora_A"] = module.lora_A
                    attn_opt_modules[lora_path + "_lora_B"] = module.lora_B
                    # lora_uses_dora is always True, so we don't check it here.
                    attn_opt_modules[lora_path + "_lora_magnitude_vector"] = module.lora_magnitude_vector
                    # We will manage attn adapters directly. By default, LoraLayer is an instance of BaseTunerLayer,
                    # so according to the code logic in diffusers/loaders/peft.py,
                    # they will be managed by the diffusers PeftAdapterMixin instance, through the
                    # enable_adapters() and set_adapter() methods.
                    # Therefore, we disable these calls on module.
                    # disable_adapters() is a property and changing it will cause exceptions.
                    module.enable_adapters = dummy_func
                    module.set_adapter = dummy_func

    unet.set_attn_processor(attn_procs)

    print(f"Set up {len(attn_capture_procs)} CrossAttn processors on {attn_capture_procs.keys()}.")
    print(f"Set up {len(attn_opt_modules)} attn LoRA params: {attn_opt_modules.keys()}.")
    return attn_capture_procs, attn_opt_modules

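A usage sketch for `set_up_attn_processors()` on a SD1.5 UNet; the model id below is an assumption (it matches the SD 1.5 checkpoint referenced elsewhere in this repo):

```python
from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet")
attn_capture_procs, attn_opt_modules = set_up_attn_processors(
    unet, use_attn_lora=True, attn_lora_layer_names=['q', 'k', 'v', 'out'],
    lora_rank=192, lora_scale_down=8)
# Only the cross-attn layers of up_blocks.3 capture activations and carry LoRA;
# every other layer gets a pass-through AttnProcessor_LoRA_Capture.
```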
# NOTE: cross-attn layers are included in the returned lora_modules.
def set_up_ffn_loras(unet, target_modules_pat, lora_uses_dora=False, lora_rank=192, lora_alpha=16):
    # target_modules_pat = 'up_blocks.3.resnets.[12].conv[a-z0-9_]+'
    # up_blocks.3.resnets.[1~2].conv1, conv2, conv_shortcut
    # Cannot set to conv.+ as it will match added adapter module names, including
    # up_blocks.3.resnets.1.conv1.base_layer, up_blocks.3.resnets.1.conv1.lora_dropout
    if target_modules_pat is not None:
        peft_config = LoraConfig(use_dora=lora_uses_dora, inference_mode=False, r=lora_rank,
                                 lora_alpha=lora_alpha, lora_dropout=0.1,
                                 target_modules=target_modules_pat)

        # UNet is a diffusers PeftAdapterMixin instance. Using get_peft_model on it will
        # cause weird errors. Instead, we directly use the diffusers peft adapter methods.
        unet.add_adapter(peft_config, "recon_loss")
        unet.add_adapter(peft_config, "unet_distill")
        unet.add_adapter(peft_config, "comp_distill")
        unet.enable_adapters()

    # lora_layers contain both the LoRA A and B matrices, as well as the original layers.
    # lora_layers are used to set the flag, not used for optimization.
    # lora_modules contain only the LoRA A and B matrices, so they are used for optimization.
    # NOTE: lora_modules contain both ffn and cross-attn lora modules.
    ffn_lora_layers = {}
    ffn_opt_modules = {}
    for name, module in unet.named_modules():
        if isinstance(module, peft_lora.LoraLayer):
            # We don't want to include cross-attn layers in ffn_lora_layers.
            if target_modules_pat is not None and re.search(target_modules_pat, name):
                ffn_lora_layers[name] = module
                # ModuleDict doesn't allow "." in the key.
                name = name.replace(".", "_")
                # Since ModuleDict doesn't allow "." in the key, we manually collect
                # the LoRA matrices in each module.
                # NOTE: We cannot put every sub-module of module into lora_modules,
                # as base_layer is also a sub-module of module, which we shouldn't optimize.
                # Each value in ffn_opt_modules is a ModuleDict:
                '''
                (Pdb) ffn_opt_modules['up_blocks_3_resnets_1_conv1_lora_A']
                ModuleDict(
                  (unet_distill): Conv2d(640, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
                  (recon_loss): Conv2d(640, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
                )
                '''
                ffn_opt_modules[name + "_lora_A"] = module.lora_A
                ffn_opt_modules[name + "_lora_B"] = module.lora_B
                if lora_uses_dora:
                    ffn_opt_modules[name + "_lora_magnitude_vector"] = module.lora_magnitude_vector

    print(f"Set up {len(ffn_lora_layers)} FFN LoRA layers: {ffn_lora_layers.keys()}.")
    print(f"Set up {len(ffn_opt_modules)} FFN LoRA params: {ffn_opt_modules.keys()}.")

    return ffn_lora_layers, ffn_opt_modules

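A companion sketch for `set_up_ffn_loras()`, reusing the `unet` from the previous sketch; the regex is the one quoted in the comments above:

```python
ffn_lora_layers, ffn_opt_modules = set_up_ffn_loras(
    unet, target_modules_pat='up_blocks.3.resnets.[12].conv[a-z0-9_]+',
    lora_uses_dora=False, lora_rank=192, lora_alpha=16)
# Three adapters are registered: 'recon_loss', 'unet_distill', 'comp_distill'.
```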
def set_lora_and_capture_flags(unet, unet_lora_modules, attn_capture_procs,
                               outfeat_capture_blocks, res_hidden_states_gradscale_blocks,
                               use_attn_lora, use_ffn_lora, ffn_lora_adapter_name, capture_ca_activations,
                               shrink_cross_attn, res_hidden_states_gradscale):
    # For attn capture procs, capture_ca_activations and use_attn_lora are set in reset_attn_cache_and_flags().
    for attn_capture_proc in attn_capture_procs:
        attn_capture_proc.reset_attn_cache_and_flags(capture_ca_activations, shrink_cross_attn, enable_lora=use_attn_lora)
    # outfeat_capture_blocks only contains the last up block, up_blocks[3].
    # It contains 3 FFN layers. We want to capture their output features.
    for block in outfeat_capture_blocks:
        block.capture_outfeats = capture_ca_activations

    for block in res_hidden_states_gradscale_blocks:
        block.res_hidden_states_gradscale = res_hidden_states_gradscale

    if not use_ffn_lora:
        unet.disable_adapters()
    else:
        # ffn_lora_adapter_name: 'recon_loss', 'unet_distill', or 'comp_distill'.
        if ffn_lora_adapter_name is not None:
            unet.set_adapter(ffn_lora_adapter_name)
            # NOTE: Don't forget to enable_adapters().
            # The adapters are not enabled by default after set_adapter().
            unet.enable_adapters()
        else:
            breakpoint()

    # During training, disable_adapters() and set_adapter() will set all/inactive adapters with requires_grad=False,
    # which might cause issues during DDP training.
    # So we restore them to requires_grad=True.
    # During test, unet_lora_modules will be passed as None, so this block will be skipped.
    if unet_lora_modules is not None:
        for param in unet_lora_modules.parameters():
            param.requires_grad = True

def get_captured_activations(capture_ca_activations, attn_capture_procs, outfeat_capture_blocks,
                             captured_layer_indices=[22, 23, 24], out_dtype=torch.float32):
    captured_activations = { k: {} for k in ('outfeat', 'attn', 'attnscore',
                                             'q', 'q2', 'k', 'v', 'attn_out') }

    if not capture_ca_activations:
        return captured_activations

    all_cached_outfeats = []
    for block in outfeat_capture_blocks:
        all_cached_outfeats.append(block.cached_outfeats)
        # Clear the capture flag and the cached outfeats.
        block.cached_outfeats = {}
        block.capture_outfeats = False

    for layer_idx in captured_layer_indices:
        # Subtract 22 from layer_idx to match the layer index in up_blocks[3].cached_outfeats.
        # 23, 24 -> 1, 2 (!! not 0, 1 !!)
        internal_idx = layer_idx - 22
        for k in captured_activations.keys():
            if k == 'outfeat':
                # Currently we only capture one block, up_blocks.3. So we hard-code the index 0.
                captured_activations['outfeat'][layer_idx] = all_cached_outfeats[0][internal_idx].to(out_dtype)
            else:
                # internal_idx is the index of layers in up_blocks.3.
                # Layers 22, 23 and 24 map to 0, 1 and 2.
                cached_activations = attn_capture_procs[internal_idx].cached_activations
                captured_activations[k][layer_idx] = cached_activations[k].to(out_dtype)

    return captured_activations
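Putting the helpers together, a sketch of one capture cycle; `unet` and `attn_capture_procs` are assumed to come from the two set-up calls above, and the processor list order is assumed to follow layers 22-24:

```python
import torch

outfeat_capture_blocks = [unet.up_blocks[3]]
set_lora_and_capture_flags(unet, None, list(attn_capture_procs.values()),
                           outfeat_capture_blocks, [unet.up_blocks[3]],
                           use_attn_lora=True, use_ffn_lora=True,
                           ffn_lora_adapter_name='recon_loss',
                           capture_ca_activations=True,
                           shrink_cross_attn=False,
                           res_hidden_states_gradscale=0.2)
# ... run one UNet forward pass here ...
acts = get_captured_activations(True, list(attn_capture_procs.values()),
                                outfeat_capture_blocks)
# acts['attn'][22], acts['outfeat'][24], ...: per-layer tensors in float32.
```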
adaface/face_id_to_ada_prompt.py
CHANGED
@@ -53,6 +53,8 @@ class FaceID2AdaPrompt(nn.Module):
|
|
53 |
self.text_to_image_prompt_encoder = None
|
54 |
self.tokenizer = None
|
55 |
self.dtype = kwargs.get('dtype', torch.float16)
|
|
|
|
|
56 |
|
57 |
# Load Img2Ada SubjectBasisGenerator.
|
58 |
self.subject_string = kwargs.get('subject_string', 'z')
|
@@ -73,12 +75,16 @@ class FaceID2AdaPrompt(nn.Module):
|
|
73 |
|
74 |
self.use_clip_embs = False
|
75 |
self.do_contrast_clip_embs_on_bg_features = False
|
|
|
|
|
|
|
|
|
76 |
# num_id_vecs is the output embeddings of the ID2ImgPrompt module.
|
77 |
# If there's no static image suffix embeddings, then num_id_vecs is also
|
78 |
# the number of ada embeddings returned by the subject basis generator.
|
79 |
# num_id_vecs will be set in each derived class.
|
80 |
self.num_static_img_suffix_embs = kwargs.get('num_static_img_suffix_embs', 0)
|
81 |
-
print(f'{self.name} Adaface uses {self.num_id_vecs} ID image embeddings
|
82 |
|
83 |
self.id_img_prompt_max_length = 77
|
84 |
self.face_id_dim = 512
|
@@ -87,36 +93,35 @@ class FaceID2AdaPrompt(nn.Module):
|
|
87 |
self.clip_embedding_dim = 1024
|
88 |
self.output_dim = 768
|
89 |
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
def load_id2img_learnable_modules(self, id2img_learnable_modules_state_dict_list):
|
94 |
-
id2img_prompt_encoder_learnable_modules = self.get_id2img_learnable_modules()
|
95 |
-
for module, state_dict in zip(id2img_prompt_encoder_learnable_modules, id2img_learnable_modules_state_dict_list):
|
96 |
-
module.load_state_dict(state_dict)
|
97 |
-
print(f'{len(id2img_prompt_encoder_learnable_modules)} ID2ImgPrompt encoder modules loaded.')
|
98 |
-
|
99 |
-
# init_subj_basis_generator() can only be called after the derived class is initialized,
|
100 |
-
# when self.num_id_vecs, self.num_static_img_suffix_embs and self.clip_embedding_dim have been set.
|
101 |
-
def init_subj_basis_generator(self):
|
102 |
self.subj_basis_generator = \
|
103 |
-
SubjBasisGenerator(
|
|
|
104 |
num_static_img_suffix_embs = self.num_static_img_suffix_embs,
|
105 |
bg_image_embedding_dim = self.clip_embedding_dim,
|
106 |
output_dim = self.output_dim,
|
107 |
placeholder_is_bg = False,
|
108 |
-
prompt2token_proj_grad_scale = 1,
|
109 |
bg_prompt_translator_has_to_out_proj=False)
|
110 |
|
111 |
def load_adaface_ckpt(self, adaface_ckpt_path):
|
112 |
-
|
|
|
|
|
|
|
113 |
string_to_subj_basis_generator_dict = ckpt["string_to_subj_basis_generator_dict"]
|
114 |
if self.subject_string not in string_to_subj_basis_generator_dict:
|
115 |
print(f"Subject '{self.subject_string}' not found in the embedding manager.")
|
116 |
breakpoint()
|
117 |
|
118 |
ckpt_subj_basis_generator = string_to_subj_basis_generator_dict[self.subject_string]
|
119 |
-
ckpt_subj_basis_generator
|
|
|
|
|
|
|
|
|
|
|
120 |
# Since we directly use the subject basis generator object from the ckpt,
|
121 |
# fixing the number of static image suffix embeddings is much simpler.
|
122 |
# Otherwise if we want to load the subject basis generator from its state_dict,
|
@@ -129,7 +134,7 @@ class FaceID2AdaPrompt(nn.Module):
|
|
129 |
ckpt_subj_basis_generator.initialize_static_img_suffix_embs(self.num_static_img_suffix_embs, img_prompt_dim=self.output_dim)
|
130 |
# Fix missing variables in old ckpt.
|
131 |
ckpt_subj_basis_generator.patch_old_subj_basis_generator_ckpt()
|
132 |
-
|
133 |
self.subj_basis_generator.extend_prompt2token_proj_attention(\
|
134 |
ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, -1, -1, 1, perturb_std=0)
|
135 |
ret = self.subj_basis_generator.load_state_dict(ckpt_subj_basis_generator.state_dict(), strict=False)
|
@@ -155,6 +160,11 @@ class FaceID2AdaPrompt(nn.Module):
|
|
155 |
|
156 |
self.subj_basis_generator.freeze_prompt2token_proj()
|
157 |
|
|
|
|
|
|
|
|
|
|
|
158 |
@torch.no_grad()
|
159 |
def get_clip_neg_features(self, BS):
|
160 |
if self.clip_neg_features is None:
|
@@ -220,6 +230,7 @@ class FaceID2AdaPrompt(nn.Module):
|
|
220 |
image_obj, _, _ = pad_image_obj_to_square(image_obj)
|
221 |
image_np = np.array(image_obj.resize(size, Image.NEAREST))
|
222 |
face_info = self.face_app.get(cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR))
|
|
|
223 |
if len(face_info) > 0:
|
224 |
face_info = sorted(face_info, key=lambda x:(x['bbox'][2]-x['bbox'][0])*x['bbox'][3]-x['bbox'][1])[-1] # only use the maximum face
|
225 |
# id_emb: [512,]
|
@@ -487,12 +498,20 @@ class FaceID2AdaPrompt(nn.Module):
|
|
487 |
# avg_at_stage == ada_prompt_emb usually produces the worst results.
|
488 |
# avg_at_stage == id_emb is slightly better than img_prompt_emb, but sometimes img_prompt_emb is better.
|
489 |
# p_dropout and return_zero_embs_for_dropped_encoders are only used by Joint_FaceID2AdaPrompt.
|
|
|
490 |
def generate_adaface_embeddings(self, image_paths, face_id_embs=None, img_prompt_embs=None,
|
491 |
p_dropout=0,
|
492 |
return_zero_embs_for_dropped_encoders=True,
|
493 |
avg_at_stage='id_emb', # id_emb, img_prompt_emb, or None.
|
494 |
perturb_at_stage=None, # id_emb, img_prompt_emb, or None.
|
495 |
-
perturb_std=0, enable_static_img_suffix_embs=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
496 |
if (avg_at_stage is None) or avg_at_stage.lower() == 'none':
|
497 |
img_prompt_avg_at_stage = None
|
498 |
else:
|
@@ -509,7 +528,7 @@ class FaceID2AdaPrompt(nn.Module):
|
|
509 |
id_batch_size = len(image_paths)
|
510 |
else:
|
511 |
id_batch_size = 1
|
512 |
-
|
513 |
# faceid_embeds: [BS, 512] is a batch of extracted face analysis embeddings. NOT used later.
|
514 |
# NOTE: If face_id_embs, image_paths and image_objs are all None,
|
515 |
# then get_img_prompt_embs() generates random faceid_embeds/img_prompt_embs,
|
@@ -532,7 +551,7 @@ class FaceID2AdaPrompt(nn.Module):
|
|
532 |
verbose=True)
|
533 |
|
534 |
if face_image_count == 0:
|
535 |
-
return None
|
536 |
|
537 |
# No matter whether avg_at_stage is id_emb or img_prompt_emb, we average img_prompt_embs.
|
538 |
elif avg_at_stage is not None and avg_at_stage.lower() != 'none':
|
@@ -545,19 +564,27 @@ class FaceID2AdaPrompt(nn.Module):
|
|
545 |
out_id_embs_cfg_scale=self.out_id_embs_cfg_scale,
|
546 |
is_face=True,
|
547 |
enable_static_img_suffix_embs=enable_static_img_suffix_embs)
|
|
|
|
|
|
|
|
|
548 |
# During training, img_prompt_avg_at_stage is None, and BS >= 1.
|
549 |
# During inference, img_prompt_avg_at_stage is 'id_emb' or 'img_prompt_emb', and BS == 1.
|
550 |
if img_prompt_avg_at_stage is not None:
|
551 |
# adaface_subj_embs: [1, 16, 768] -> [16, 768]
|
552 |
adaface_subj_embs = adaface_subj_embs.squeeze(0)
|
553 |
|
554 |
-
return adaface_subj_embs
|
555 |
|
556 |
class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
|
557 |
-
|
558 |
-
|
559 |
-
|
|
|
|
|
|
|
560 |
|
|
|
561 |
super().__init__(*args, **kwargs)
|
562 |
|
563 |
self.clip_image_encoder = CLIPVisionModelWithMask.from_pretrained('openai/clip-vit-large-patch14')
|
@@ -576,14 +603,11 @@ class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
576 |
'''
|
577 |
# Use the same model as ID2AdaPrompt does.
|
578 |
# FaceAnalysis will try to find the ckpt in: models/insightface/models/antelopev2.
|
579 |
-
# Note there
|
580 |
-
# Note DON'T use CUDAExecutionProvider, as it will hang DDP training.
|
581 |
-
# Seems when loading insightface onto the GPU, it will only reside on the first GPU.
|
582 |
-
# Then the process on the second GPU has issue to communicate with insightface on the first GPU, causing hanging.
|
583 |
self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
|
584 |
providers=['CPUExecutionProvider'])
|
585 |
self.face_app.prepare(ctx_id=0, det_size=(512, 512))
|
586 |
-
print(f'Face encoder loaded on CPU.')
|
587 |
|
588 |
self.text_to_image_prompt_encoder = CLIPTextModelWrapper.from_pretrained(
|
589 |
'models/arc2face', subfolder="encoder",
|
@@ -594,21 +618,58 @@ class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
594 |
if self.out_id_embs_cfg_scale == -1:
|
595 |
self.out_id_embs_cfg_scale = 1
|
596 |
#### Arc2Face pipeline specific configs ####
|
597 |
-
self.gen_neg_img_prompt
|
598 |
# bg CLIP features are used by the bg subject basis generator.
|
599 |
-
self.use_clip_embs
|
600 |
self.do_contrast_clip_embs_on_bg_features = True
|
601 |
# self.num_static_img_suffix_embs is initialized in the parent class.
|
602 |
-
self.id_img_prompt_max_length
|
603 |
-
self.clip_embedding_dim
|
604 |
|
605 |
-
self.
|
606 |
if self.adaface_ckpt_path is not None:
|
607 |
self.load_adaface_ckpt(self.adaface_ckpt_path)
|
608 |
|
609 |
-
|
610 |
-
|
|
|
|
|
|
|
|
|
611 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
612 |
# Arc2Face_ID2AdaPrompt never uses clip_features or called_for_neg_img_prompt.
|
613 |
def map_init_id_to_img_prompt_embs(self, init_id_embs,
|
614 |
clip_features=None,
|
@@ -656,16 +717,17 @@ class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
656 |
# [N, 22, 768] -> [N, 16, 768]
|
657 |
return prompt_embeds[:, 4:20]
|
658 |
|
659 |
-
def get_id2img_learnable_modules(self):
|
660 |
-
return [ self.text_to_image_prompt_encoder ]
|
661 |
-
|
662 |
# ConsistentID_ID2AdaPrompt is just a wrapper of ConsistentIDPipeline, so it's not an nn.Module.
|
663 |
class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
|
|
|
|
|
|
|
|
|
|
|
664 |
def __init__(self, pipe=None, base_model_path="models/sd15-dste8-vae.safetensors",
|
665 |
*args, **kwargs):
|
666 |
-
|
667 |
-
self.num_id_vecs = 4
|
668 |
-
|
669 |
super().__init__(*args, **kwargs)
|
670 |
if pipe is None:
|
671 |
# The base_model_path is kind of arbitrary, as the UNet and VAE in the model
|
@@ -712,13 +774,51 @@ class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
712 |
self.clip_embedding_dim = 1280
|
713 |
self.s_scale = 1.0
|
714 |
self.shortcut = False
|
715 |
-
|
716 |
-
self.
|
717 |
if self.adaface_ckpt_path is not None:
|
718 |
self.load_adaface_ckpt(self.adaface_ckpt_path)
|
719 |
|
|
|
720 |
print(f"{self.name} ada prompt encoder initialized, "
|
721 |
-
f"ID vecs: {self.
|
|
|
|
722 |
|
723 |
def map_init_id_to_img_prompt_embs(self, init_id_embs,
|
724 |
clip_features=None,
|
@@ -757,25 +857,30 @@ class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
|
|
757 |
|
758 |
return global_id_embeds
|
759 |
|
760 |
-
def get_id2img_learnable_modules(self):
|
761 |
-
return [ self.image_proj_model ]
|
762 |
-
|
763 |
# A wrapper for combining multiple FaceID2AdaPrompt instances.
|
764 |
class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
765 |
def __init__(self, adaface_encoder_types, adaface_ckpt_paths,
|
766 |
out_id_embs_cfg_scales=None, enabled_encoders=None,
|
767 |
*args, **kwargs):
|
768 |
self.name = 'jointIDs'
|
|
|
769 |
assert len(adaface_encoder_types) > 0, "adaface_encoder_types should not be empty."
|
770 |
-
|
771 |
-
|
|
|
|
|
772 |
for encoder_type in adaface_encoder_types ]
|
773 |
-
self.
|
|
|
774 |
super().__init__(*args, **kwargs)
|
775 |
|
776 |
self.num_sub_encoders = len(adaface_encoder_types)
|
777 |
self.id2ada_prompt_encoders = nn.ModuleList()
|
778 |
self.encoders_num_static_img_suffix_embs = []
|
|
|
779 |
|
780 |
# TODO: apply adaface_encoder_cfg_scales to influence the final prompt embeddings.
|
781 |
# Now they are just placeholders.
|
@@ -785,10 +890,12 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
785 |
self.out_id_embs_cfg_scales = [-1] * self.num_sub_encoders
|
786 |
else:
|
787 |
# Do not normalize the weights, and just use them as is.
|
788 |
-
self.out_id_embs_cfg_scales = out_id_embs_cfg_scales
|
789 |
|
790 |
# Note we don't pass the adaface_ckpt_paths to the base class, but instead,
|
791 |
# we load them once and for all in self.load_adaface_ckpt().
|
|
792 |
for i, encoder_type in enumerate(adaface_encoder_types):
|
793 |
kwargs['out_id_embs_cfg_scale'] = self.out_id_embs_cfg_scales[i]
|
794 |
if encoder_type == 'arc2face':
|
@@ -797,8 +904,10 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
797 |
encoder = ConsistentID_ID2AdaPrompt(*args, **kwargs)
|
798 |
else:
|
799 |
breakpoint()
|
|
|
800 |
self.id2ada_prompt_encoders.append(encoder)
|
801 |
self.encoders_num_static_img_suffix_embs.append(encoder.num_static_img_suffix_embs)
|
|
|
802 |
|
803 |
self.num_static_img_suffix_embs = sum(self.encoders_num_static_img_suffix_embs)
|
804 |
# No need to set gen_neg_img_prompt, as we don't access it in this class, but rather
|
@@ -814,6 +923,7 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
814 |
# Therefore, the clip_embedding_dim is the sum of the clip_embedding_dims of all adaface encoders.
|
815 |
self.clip_embedding_dims = [encoder.clip_embedding_dim for encoder in self.id2ada_prompt_encoders]
|
816 |
self.clip_embedding_dim = sum(self.clip_embedding_dims)
|
|
|
817 |
# The ctors of the derived classes have already initialized encoder.subj_basis_generator.
|
818 |
# If subj_basis_generator expansion params are specified, they are equally applied to all adaface encoders.
|
819 |
# This self.subj_basis_generator is not meant to be called as self.subj_basis_generator(), but instead,
|
@@ -821,12 +931,13 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
821 |
self.subj_basis_generator = \
|
822 |
nn.ModuleList( [encoder.subj_basis_generator for encoder \
|
823 |
in self.id2ada_prompt_encoders] )
|
824 |
-
|
|
|
825 |
if adaface_ckpt_paths is not None:
|
826 |
self.load_adaface_ckpt(adaface_ckpt_paths)
|
827 |
-
|
828 |
print(f"{self.name} ada prompt encoder initialized with {self.num_sub_encoders} sub-encoders. "
|
829 |
-
f"ID vecs: {self.
|
830 |
|
831 |
if enabled_encoders is not None:
|
832 |
self.are_encoders_enabled = \
|
@@ -842,66 +953,79 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
842 |
else:
|
843 |
self.are_encoders_enabled = \
|
844 |
torch.tensor([True] * self.num_sub_encoders)
|
845 |
-
|
846 |
def load_adaface_ckpt(self, adaface_ckpt_paths):
|
847 |
-
# If only one adaface ckpt path is provided, then we assume it's the ckpt of the Joint_FaceID2AdaPrompt,
|
848 |
-
# so we dereference the list to get the actual path and load the subj_basis_generators of all adaface encoders.
|
849 |
if isinstance(adaface_ckpt_paths, (list, tuple, ListConfig)):
|
850 |
-
|
|
|
851 |
adaface_ckpt_paths = adaface_ckpt_paths[0]
|
852 |
-
|
853 |
-
if isinstance(adaface_ckpt_paths, str):
|
854 |
-
# This is only applicable to newest ckpts of Joint_FaceID2AdaPrompt, where
|
855 |
-
# the ckpt_subj_basis_generator is an nn.ModuleList of multiple subj_basis_generators.
|
856 |
-
# Therefore, no need to patch missing variables.
|
857 |
-
ckpt = torch.load(adaface_ckpt_paths, map_location='cpu')
|
858 |
-
string_to_subj_basis_generator_dict = ckpt["string_to_subj_basis_generator_dict"]
|
859 |
-
if self.subject_string not in string_to_subj_basis_generator_dict:
|
860 |
-
print(f"Subject '{self.subject_string}' not found in the embedding manager.")
|
861 |
breakpoint()
|
862 |
|
863 |
-
|
864 |
-
|
865 |
-
|
866 |
-
|
867 |
-
|
868 |
-
|
869 |
-
|
870 |
-
|
871 |
-
|
872 |
-
|
873 |
-
|
874 |
-
|
875 |
-
|
876 |
-
|
877 |
-
|
878 |
-
assert subj_basis_generator.prompt2token_proj_attention_multipliers \
|
879 |
-
== ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, \
|
880 |
-
"Inconsistent prompt2token_proj_attention_multipliers."
|
881 |
-
subj_basis_generator.load_state_dict(ckpt_subj_basis_generator.state_dict())
|
882 |
-
|
883 |
-
# extend_prompt2token_proj_attention_multiplier is an integer >= 1.
|
884 |
-
# TODO: extend_prompt2token_proj_attention_multiplier should be a list of integers.
|
885 |
-
# If extend_prompt2token_proj_attention_multiplier > 1, then after loading state_dict,
|
886 |
-
# extend subj_basis_generator again.
|
887 |
-
if self.extend_prompt2token_proj_attention_multiplier > 1:
|
888 |
-
# During this extension, the added noise does change the extra copies of attention weights, since they are not in the ckpt.
|
889 |
-
# During training, prompt2token_proj_ext_attention_perturb_ratio == 0.1.
|
890 |
-
# During inference, prompt2token_proj_ext_attention_perturb_ratio == 0.
|
891 |
-
subj_basis_generator.extend_prompt2token_proj_attention(\
|
892 |
-
None, -1, -1, self.extend_prompt2token_proj_attention_multiplier,
|
893 |
-
perturb_std=self.prompt2token_proj_ext_attention_perturb_ratio)
|
894 |
-
|
895 |
-
subj_basis_generator.freeze_prompt2token_proj()
|
896 |
-
|
897 |
-
print(f"{adaface_ckpt_paths}: {len(self.subj_basis_generator)} subj_basis_generators loaded for {self.name}.")
|
898 |
-
|
899 |
-
elif isinstance(adaface_ckpt_paths, (list, tuple, ListConfig)):
|
900 |
-
for i, ckpt_path in enumerate(adaface_ckpt_paths):
|
901 |
-
self.id2ada_prompt_encoders[i].load_adaface_ckpt(ckpt_path)
|
902 |
-
else:
|
903 |
breakpoint()
|
904 |
|
|
|
|
905 |
def extract_init_id_embeds_from_images(self, *args, **kwargs):
|
906 |
total_faceless_img_count = 0
|
907 |
all_id_embs = []
|
@@ -1039,7 +1163,7 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
1039 |
|
1040 |
N_ID = self.encoders_num_id_vecs[i]
|
1041 |
if all_pos_prompt_embs[i] is None:
|
1042 |
-
# Both pos_prompt_embs and neg_prompt_embs have N_ID ==
|
1043 |
all_pos_prompt_embs[i] = torch.zeros((BS, N_ID, 768), dtype=torch.float16, device=device)
|
1044 |
if all_neg_prompt_embs[i] is None:
|
1045 |
all_neg_prompt_embs[i] = torch.zeros((BS, N_ID, 768), dtype=torch.float16, device=device)
|
@@ -1061,6 +1185,13 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
1061 |
# So its .device is the device of its parameters.
|
1062 |
device = self.id2ada_prompt_encoders[0].clip_image_encoder.device
|
1063 |
is_emb_averaged = kwargs.get('avg_at_stage', None) is not None
|
|
|
|
1064 |
BS = -1
|
1065 |
|
1066 |
if face_id_embs is not None:
|
@@ -1068,13 +1199,17 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
1068 |
all_face_id_embs = face_id_embs.split(self.face_id_dims, dim=1)
|
1069 |
else:
|
1070 |
all_face_id_embs = [None] * self.num_sub_encoders
|
|
|
1071 |
if img_prompt_embs is not None:
|
1072 |
BS = img_prompt_embs.shape[0] if BS == -1 else BS
|
1073 |
-
if img_prompt_embs.shape[1] != self.
|
1074 |
breakpoint()
|
1075 |
-
all_img_prompt_embs = img_prompt_embs.split(self.
|
|
|
1076 |
else:
|
1077 |
all_img_prompt_embs = [None] * self.num_sub_encoders
|
|
|
|
|
1078 |
if image_paths is not None:
|
1079 |
BS = len(image_paths) if BS == -1 else BS
|
1080 |
if BS == -1:
|
@@ -1097,25 +1232,32 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
1097 |
else:
|
1098 |
are_encoders_enabled = self.are_encoders_enabled
|
1099 |
|
|
|
1100 |
all_adaface_subj_embs = []
|
1101 |
num_available_id_vecs = 0
|
|
|
1102 |
|
1103 |
for i, id2ada_prompt_encoder in enumerate(self.id2ada_prompt_encoders):
|
1104 |
if not are_encoders_enabled[i]:
|
1105 |
adaface_subj_embs = None
|
1106 |
-
print(f"Encoder {id2ada_prompt_encoder.name} is
|
|
|
|
|
1107 |
else:
|
|
|
1108 |
# ddpm.embedding_manager.train() -> id2ada_prompt_encoder.train() -> each sub-encoder's train().
|
1109 |
# -> each sub-encoder's subj_basis_generator.train().
|
1110 |
# Therefore grad for the following call is enabled.
|
1111 |
-
adaface_subj_embs = \
|
1112 |
id2ada_prompt_encoder.generate_adaface_embeddings(image_paths,
|
1113 |
all_face_id_embs[i],
|
1114 |
all_img_prompt_embs[i],
|
1115 |
*args, **kwargs)
|
1116 |
|
1117 |
-
|
1118 |
-
|
|
|
|
|
1119 |
if adaface_subj_embs is None:
|
1120 |
if not return_zero_embs_for_dropped_encoders:
|
1121 |
continue
|
@@ -1126,12 +1268,16 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
1126 |
all_adaface_subj_embs.append(adaface_subj_embs)
|
1127 |
else:
|
1128 |
all_adaface_subj_embs.append(adaface_subj_embs)
|
|
|
|
|
1129 |
num_available_id_vecs += N_ID
|
1130 |
-
|
|
|
|
|
1131 |
# No faces are found in the images, so return None embeddings.
|
1132 |
# We don't want to return an all-zero embedding, which is useless.
|
1133 |
if num_available_id_vecs == 0:
|
1134 |
-
return None
|
1135 |
|
1136 |
# If id2ada_prompt_encoders are ["arc2face", "consistentID"], then
|
1137 |
# during inference, we average across the batch dim.
|
@@ -1141,7 +1287,12 @@ class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
|
1141 |
# all_adaface_subj_embs[0]: [BS, 4, 768]. all_adaface_subj_embs[1]: [BS, 16, 768].
|
1142 |
# all_adaface_subj_embs: [BS, 20, 768].
|
1143 |
all_adaface_subj_embs = torch.cat(all_adaface_subj_embs, dim=-2)
|
1144 |
-
|
|
|
|
|
|
|
|
|
|
|
1145 |
|
1146 |
|
1147 |
'''
|
|
|
53 |
self.text_to_image_prompt_encoder = None
|
54 |
self.tokenizer = None
|
55 |
self.dtype = kwargs.get('dtype', torch.float16)
|
56 |
+
self.img2txt_dtype = kwargs.get('img2txt_dtype', torch.float16)
|
57 |
+
self.device = torch.device("cpu")
|
58 |
|
59 |
# Load Img2Ada SubjectBasisGenerator.
|
60 |
self.subject_string = kwargs.get('subject_string', 'z')
|
|
|
75 |
|
76 |
self.use_clip_embs = False
|
77 |
self.do_contrast_clip_embs_on_bg_features = False
|
78 |
+
# Override the default setting in derived classes.
|
79 |
+
if 'enable_static_img_suffix_embs' in kwargs:
|
80 |
+
self.default_enable_static_img_suffix_embs = kwargs['enable_static_img_suffix_embs']
|
81 |
+
|
82 |
# num_id_vecs is the output embeddings of the ID2ImgPrompt module.
|
83 |
# If there's no static image suffix embeddings, then num_id_vecs is also
|
84 |
# the number of ada embeddings returned by the subject basis generator.
|
85 |
# num_id_vecs will be set in each derived class.
|
86 |
self.num_static_img_suffix_embs = kwargs.get('num_static_img_suffix_embs', 0)
|
87 |
+
print(f'{self.name} Adaface uses {self.num_id_vecs} ID image embeddings + {self.num_static_img_suffix_embs} fixed image embeddings as input.')
|
88 |
|
89 |
self.id_img_prompt_max_length = 77
|
90 |
self.face_id_dim = 512
|
|
|
93 |
self.clip_embedding_dim = 1024
|
94 |
self.output_dim = 768
|
95 |
|
96 |
+
# init_img2txt_projection() can only be called after the derived class is initialized,
|
97 |
+
# when self.num_id_vecs0, self.num_static_img_suffix_embs and self.clip_embedding_dim have been set.
|
98 |
+
def init_img2txt_projection(self):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
self.subj_basis_generator = \
|
100 |
+
SubjBasisGenerator(dtype=self.img2txt_dtype,
|
101 |
+
num_id_vecs = self.num_id_vecs0,
|
102 |
num_static_img_suffix_embs = self.num_static_img_suffix_embs,
|
103 |
bg_image_embedding_dim = self.clip_embedding_dim,
|
104 |
output_dim = self.output_dim,
|
105 |
placeholder_is_bg = False,
|
|
|
106 |
bg_prompt_translator_has_to_out_proj=False)
|
107 |
|
108 |
def load_adaface_ckpt(self, adaface_ckpt_path):
|
109 |
+
if isinstance(adaface_ckpt_path, (list, tuple, ListConfig)):
|
110 |
+
adaface_ckpt_path = adaface_ckpt_path[0]
|
111 |
+
|
112 |
+
ckpt = torch.load(adaface_ckpt_path, map_location='cpu', weights_only=False)
|
113 |
string_to_subj_basis_generator_dict = ckpt["string_to_subj_basis_generator_dict"]
|
114 |
if self.subject_string not in string_to_subj_basis_generator_dict:
|
115 |
print(f"Subject '{self.subject_string}' not found in the embedding manager.")
|
116 |
breakpoint()
|
117 |
|
118 |
ckpt_subj_basis_generator = string_to_subj_basis_generator_dict[self.subject_string]
|
119 |
+
if isinstance(ckpt_subj_basis_generator, nn.ModuleList):
|
120 |
+
name2idx = { 'consistentID': 0, 'arc2face': 1 }
|
121 |
+
subj_basis_generator_idx = name2idx[self.name]
|
122 |
+
ckpt_subj_basis_generator = ckpt_subj_basis_generator[subj_basis_generator_idx]
|
123 |
+
|
124 |
+
ckpt_subj_basis_generator.N_ID = self.num_id_vecs0
|
125 |
# Since we directly use the subject basis generator object from the ckpt,
|
126 |
# fixing the number of static image suffix embeddings is much simpler.
|
127 |
# Otherwise if we want to load the subject basis generator from its state_dict,
|
|
|
134 |
ckpt_subj_basis_generator.initialize_static_img_suffix_embs(self.num_static_img_suffix_embs, img_prompt_dim=self.output_dim)
|
135 |
# Fix missing variables in old ckpt.
|
136 |
ckpt_subj_basis_generator.patch_old_subj_basis_generator_ckpt()
|
137 |
+
|
138 |
self.subj_basis_generator.extend_prompt2token_proj_attention(\
|
139 |
ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, -1, -1, 1, perturb_std=0)
|
140 |
ret = self.subj_basis_generator.load_state_dict(ckpt_subj_basis_generator.state_dict(), strict=False)
|
|
|
160 |
|
161 |
self.subj_basis_generator.freeze_prompt2token_proj()
|
162 |
|
163 |
+
def set_out_id_embs_cfg_scale(self, out_id_embs_cfg_scale):
|
164 |
+
if isinstance(out_id_embs_cfg_scale, (list, tuple, ListConfig)):
|
165 |
+
out_id_embs_cfg_scale = out_id_embs_cfg_scale[0]
|
166 |
+
self.out_id_embs_cfg_scale = out_id_embs_cfg_scale
|
167 |
+
|
168 |
@torch.no_grad()
|
169 |
def get_clip_neg_features(self, BS):
|
170 |
if self.clip_neg_features is None:
|
|
|
230 |
image_obj, _, _ = pad_image_obj_to_square(image_obj)
|
231 |
image_np = np.array(image_obj.resize(size, Image.NEAREST))
|
232 |
face_info = self.face_app.get(cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR))
|
233 |
+
|
234 |
if len(face_info) > 0:
|
235 |
face_info = sorted(face_info, key=lambda x:(x['bbox'][2]-x['bbox'][0])*(x['bbox'][3]-x['bbox'][1]))[-1] # only keep the largest face (by bbox area)
|
236 |
# id_emb: [512,]
|
|
|
498 |
# avg_at_stage == ada_prompt_emb usually produces the worst results.
|
499 |
# avg_at_stage == id_emb is slightly better than img_prompt_emb, but sometimes img_prompt_emb is better.
|
500 |
# p_dropout and return_zero_embs_for_dropped_encoders are only used by Joint_FaceID2AdaPrompt.
|
501 |
+
# enable_static_img_suffix_embs=None: use the default setting.
|
502 |
def generate_adaface_embeddings(self, image_paths, face_id_embs=None, img_prompt_embs=None,
|
503 |
p_dropout=0,
|
504 |
return_zero_embs_for_dropped_encoders=True,
|
505 |
avg_at_stage='id_emb', # id_emb, img_prompt_emb, or None.
|
506 |
perturb_at_stage=None, # id_emb, img_prompt_emb, or None.
|
507 |
+
perturb_std=0, enable_static_img_suffix_embs=None):
|
508 |
+
|
509 |
+
if enable_static_img_suffix_embs is None:
|
510 |
+
enable_static_img_suffix_embs = self.default_enable_static_img_suffix_embs
|
511 |
+
|
512 |
+
lens_subj_emb_segments = [ self.num_id_vecs + enable_static_img_suffix_embs \
|
513 |
+
* self.num_static_img_suffix_embs ]
|
514 |
+
|
515 |
if (avg_at_stage is None) or avg_at_stage.lower() == 'none':
|
516 |
img_prompt_avg_at_stage = None
|
517 |
else:
|
|
|
528 |
id_batch_size = len(image_paths)
|
529 |
else:
|
530 |
id_batch_size = 1
|
531 |
+
|
532 |
# faceid_embeds: [BS, 512] is a batch of extracted face analysis embeddings. NOT used later.
|
533 |
# NOTE: If face_id_embs, image_paths and image_objs are all None,
|
534 |
# then get_img_prompt_embs() generates random faceid_embeds/img_prompt_embs,
|
|
|
551 |
verbose=True)
|
552 |
|
553 |
if face_image_count == 0:
|
554 |
+
return None, None, lens_subj_emb_segments
|
555 |
|
556 |
# No matter whether avg_at_stage is id_emb or img_prompt_emb, we average img_prompt_embs.
|
557 |
elif avg_at_stage is not None and avg_at_stage.lower() != 'none':
|
|
|
564 |
out_id_embs_cfg_scale=self.out_id_embs_cfg_scale,
|
565 |
is_face=True,
|
566 |
enable_static_img_suffix_embs=enable_static_img_suffix_embs)
|
567 |
+
|
568 |
+
if self.num_id_vecs < self.num_id_vecs0:
|
569 |
+
adaface_subj_embs = adaface_subj_embs[:, :self.num_id_vecs, :]
|
570 |
+
|
571 |
# During training, img_prompt_avg_at_stage is None, and BS >= 1.
|
572 |
# During inference, img_prompt_avg_at_stage is 'id_emb' or 'img_prompt_emb', and BS == 1.
|
573 |
if img_prompt_avg_at_stage is not None:
|
574 |
# adaface_subj_embs: [1, 16, 768] -> [16, 768]
|
575 |
adaface_subj_embs = adaface_subj_embs.squeeze(0)
|
576 |
|
577 |
+
return adaface_subj_embs, img_prompt_embs, lens_subj_emb_segments
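
generate_adaface_embeddings() now returns a 3-tuple instead of a bare tensor, so every caller must unpack three values. A minimal calling sketch (`encoder` stands for any FaceID2AdaPrompt subclass instance; the image paths are hypothetical):

# Sketch only, not repo code.
subj_embs, img_prompt_embs, seg_lens = encoder.generate_adaface_embeddings(
    image_paths=['face1.jpg', 'face2.jpg'],
    avg_at_stage='id_emb')     # average the two faces at the ID-embedding stage
if subj_embs is None:
    raise ValueError("No face detected in the input images.")
# With avg_at_stage set, the batch dim is squeezed, e.g. [16, 768] for arc2face;
# seg_lens records the number of subject vectors returned, e.g. [16].
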
|
578 |
|
579 |
class Arc2Face_ID2AdaPrompt(FaceID2AdaPrompt):
|
580 |
+
name = 'arc2face'
|
581 |
+
num_id_vecs0 = 16
|
582 |
+
# If compressed, the first 4 vecs are kept and the remaining 12 are averaged into another 4,
|
583 |
+
# then concatenated to [8, 768]. Here num_id_vecs == num_id_vecs0 == 16, i.e., no compression.
|
584 |
+
num_id_vecs = 16
|
585 |
+
default_enable_static_img_suffix_embs = False
|
586 |
|
587 |
+
def __init__(self, *args, **kwargs):
|
588 |
super().__init__(*args, **kwargs)
|
589 |
|
590 |
self.clip_image_encoder = CLIPVisionModelWithMask.from_pretrained('openai/clip-vit-large-patch14')
|
|
|
603 |
'''
|
604 |
# Use the same model as ID2AdaPrompt does.
|
605 |
# FaceAnalysis will try to find the ckpt in: models/insightface/models/antelopev2.
|
606 |
+
# Note there are two "models" in the path.
|
|
607 |
self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
|
608 |
providers=['CPUExecutionProvider'])
|
609 |
self.face_app.prepare(ctx_id=0, det_size=(512, 512))
|
610 |
+
print(f'Arc2Face Face encoder loaded on CPU.')
|
611 |
|
612 |
self.text_to_image_prompt_encoder = CLIPTextModelWrapper.from_pretrained(
|
613 |
'models/arc2face', subfolder="encoder",
|
|
|
618 |
if self.out_id_embs_cfg_scale == -1:
|
619 |
self.out_id_embs_cfg_scale = 1
|
620 |
#### Arc2Face pipeline specific configs ####
|
621 |
+
self.gen_neg_img_prompt = False
|
622 |
# bg CLIP features are used by the bg subject basis generator.
|
623 |
+
self.use_clip_embs = True
|
624 |
self.do_contrast_clip_embs_on_bg_features = True
|
625 |
# self.num_static_img_suffix_embs is initialized in the parent class.
|
626 |
+
self.id_img_prompt_max_length = 22
|
627 |
+
self.clip_embedding_dim = 1024
|
628 |
|
629 |
+
self.init_img2txt_projection()
|
630 |
if self.adaface_ckpt_path is not None:
|
631 |
self.load_adaface_ckpt(self.adaface_ckpt_path)
|
632 |
|
633 |
+
for param in self.clip_image_encoder.parameters():
|
634 |
+
param.requires_grad = False
|
635 |
+
for param in self.text_to_image_prompt_encoder.parameters():
|
636 |
+
param.requires_grad = False
|
637 |
+
for param in self.subj_basis_generator.parameters():
|
638 |
+
param.requires_grad = self.is_training
|
639 |
|
640 |
+
print(f"{self.name} ada prompt encoder initialized, "
|
641 |
+
f"ID vecs: {self.num_id_vecs0}, static suffix: {self.num_static_img_suffix_embs}.")
|
642 |
+
|
643 |
+
def _apply(self, fn):
|
644 |
+
super()._apply(fn) # Call the parent _apply to handle parameters and buffers
|
645 |
+
return  # NOTE: this early return leaves the device-reload hack below as dead code.
|
646 |
+
# A dirty hack to get the device of the model, passed from
|
647 |
+
# parent.model.to(self.root_device) => parent._apply(convert) => module._apply(fn)
|
648 |
+
test_tensor = torch.zeros(1) # Create a test tensor
|
649 |
+
transformed_tensor = fn(test_tensor) # Apply `fn()` to test it
|
650 |
+
device = transformed_tensor.device # Get the device of the transformed tensor
|
651 |
+
# No need to reload face_app on the same device.
|
652 |
+
if device == self.device:
|
653 |
+
return
|
654 |
+
|
655 |
+
if str(device) == 'cpu':
|
656 |
+
self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
|
657 |
+
providers=['CPUExecutionProvider'])
|
658 |
+
self.face_app.prepare(ctx_id=0, det_size=(512, 512))
|
659 |
+
else:
|
660 |
+
device_id = device.index
|
661 |
+
self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface',
|
662 |
+
providers=['CUDAExecutionProvider'],
|
663 |
+
provider_options=[{"device_id": device_id,
|
664 |
+
"cudnn_conv_algo_search": "HEURISTIC",
|
665 |
+
"gpu_mem_limit": 2 * 1024**3
|
666 |
+
}])
|
667 |
+
self.face_app.prepare(ctx_id=device_id, det_size=(512, 512))
|
668 |
+
|
669 |
+
self.device = device
|
670 |
+
print(f'Arc2Face Face encoder reloaded on {device}.')
|
671 |
+
return
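
The override above works because Module.to() routes every parameter and buffer through the fn passed to _apply(); probing fn with a dummy tensor therefore reveals the target device. A self-contained sketch of the same trick (not the repo's code):

import torch
import torch.nn as nn

class DeviceAware(nn.Module):
    def __init__(self):
        super().__init__()
        self.lin = nn.Linear(4, 4)
        self.device = torch.device('cpu')

    def _apply(self, fn):
        super()._apply(fn)                        # moves parameters/buffers as usual
        self.device = fn(torch.zeros(1)).device   # probe where fn() sends tensors
        return self

m = DeviceAware().to('cuda:0' if torch.cuda.is_available() else 'cpu')
print(m.device)
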
|
672 |
+
|
673 |
# Arc2Face_ID2AdaPrompt never uses clip_features or called_for_neg_img_prompt.
|
674 |
def map_init_id_to_img_prompt_embs(self, init_id_embs,
|
675 |
clip_features=None,
|
|
|
717 |
# [N, 22, 768] -> [N, 16, 768]
|
718 |
return prompt_embeds[:, 4:20]
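
The slice relies on the fixed layout of the 22-token Arc2Face image prompt: per the shape comment above, positions 4..19 carry the 16 ID vectors. A shape-only sketch:

import torch
prompt_embeds = torch.randn(2, 22, 768)   # stand-in for the encoder output
id_embs = prompt_embeds[:, 4:20]          # [2, 16, 768]: the 16 ID vectors
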
|
719 |
|
|
|
720 |
# ConsistentID_ID2AdaPrompt is just a wrapper of ConsistentIDPipeline, so it's not an nn.Module.
|
721 |
class ConsistentID_ID2AdaPrompt(FaceID2AdaPrompt):
|
722 |
+
name = 'consistentID'
|
723 |
+
num_id_vecs0 = 4
|
724 |
+
# No compression for ConsistentID.
|
725 |
+
num_id_vecs = 4
|
726 |
+
default_enable_static_img_suffix_embs = False
|
727 |
+
|
728 |
def __init__(self, pipe=None, base_model_path="models/sd15-dste8-vae.safetensors",
|
729 |
*args, **kwargs):
|
730 |
+
|
|
|
731 |
super().__init__(*args, **kwargs)
|
732 |
if pipe is None:
|
733 |
# The base_model_path is kind of arbitrary, as the UNet and VAE in the model
|
|
|
774 |
self.clip_embedding_dim = 1280
|
775 |
self.s_scale = 1.0
|
776 |
self.shortcut = False
|
777 |
+
|
778 |
+
self.init_img2txt_projection()
|
779 |
if self.adaface_ckpt_path is not None:
|
780 |
self.load_adaface_ckpt(self.adaface_ckpt_path)
|
781 |
|
782 |
+
for param in self.clip_image_encoder.parameters():
|
783 |
+
param.requires_grad = False
|
784 |
+
for param in self.image_proj_model.parameters():
|
785 |
+
param.requires_grad = False
|
786 |
+
for param in self.subj_basis_generator.parameters():
|
787 |
+
param.requires_grad = self.is_training
|
788 |
+
|
789 |
print(f"{self.name} ada prompt encoder initialized, "
|
790 |
+
f"ID vecs: {self.num_id_vecs0}, static suffix: {self.num_static_img_suffix_embs}.")
|
791 |
+
|
792 |
+
def _apply(self, fn):
|
793 |
+
super()._apply(fn) # Call the parent _apply to handle parameters and buffers
|
794 |
+
return  # NOTE: this early return leaves the device-reload hack below as dead code.
|
795 |
+
# A dirty hack to get the device of the model, passed from
|
796 |
+
# parent.model.to(self.root_device) => parent._apply(convert) => module._apply(fn)
|
797 |
+
test_tensor = torch.zeros(1) # Create a test tensor
|
798 |
+
transformed_tensor = fn(test_tensor) # Apply `fn()` to test it
|
799 |
+
device = transformed_tensor.device # Get the device of the transformed tensor
|
800 |
+
# No need to reload face_app on the same device.
|
801 |
+
if device == self.device:
|
802 |
+
return
|
803 |
+
|
804 |
+
if str(device) == 'cpu':
|
805 |
+
self.face_app = FaceAnalysis(name='buffalo_l', root='models/insightface',
|
806 |
+
providers=['CPUExecutionProvider'])
|
807 |
+
self.face_app.prepare(ctx_id=0, det_size=(512, 512))
|
808 |
+
else:
|
809 |
+
device_id = device.index
|
810 |
+
self.face_app = FaceAnalysis(name='buffalo_l', root='models/insightface',
|
811 |
+
providers=['CUDAExecutionProvider'],
|
812 |
+
provider_options=[{"device_id": device_id,
|
813 |
+
"cudnn_conv_algo_search": "HEURISTIC",
|
814 |
+
"gpu_mem_limit": 2 * 1024**3
|
815 |
+
}])
|
816 |
+
self.face_app.prepare(ctx_id=device_id, det_size=(512, 512))
|
817 |
+
|
818 |
+
self.device = device
|
819 |
+
self.pipe.face_app = self.face_app
|
820 |
+
print(f'ConsistentID Face encoder reloaded on {device}.')
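
Whether CUDAExecutionProvider is usable at all depends on the installed onnxruntime build; it can be checked before constructing FaceAnalysis (standard onnxruntime API):

import onnxruntime as ort
# e.g. ['CUDAExecutionProvider', 'CPUExecutionProvider'] for an onnxruntime-gpu build.
print(ort.get_available_providers())
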
|
821 |
+
|
822 |
|
823 |
def map_init_id_to_img_prompt_embs(self, init_id_embs,
|
824 |
clip_features=None,
|
|
|
857 |
|
858 |
return global_id_embeds
|
859 |
|
|
860 |
# A wrapper for combining multiple FaceID2AdaPrompt instances.
|
861 |
class Joint_FaceID2AdaPrompt(FaceID2AdaPrompt):
|
862 |
def __init__(self, adaface_encoder_types, adaface_ckpt_paths,
|
863 |
out_id_embs_cfg_scales=None, enabled_encoders=None,
|
864 |
*args, **kwargs):
|
865 |
self.name = 'jointIDs'
|
866 |
+
name2class = { 'arc2face': Arc2Face_ID2AdaPrompt, 'consistentID': ConsistentID_ID2AdaPrompt }
|
867 |
assert len(adaface_encoder_types) > 0, "adaface_encoder_types should not be empty."
|
868 |
+
adaface_encoder_types2num_id_vecs0 = { name: name2class[name].num_id_vecs0 for name in adaface_encoder_types }
|
869 |
+
adaface_encoder_types2num_id_vecs = { name: name2class[name].num_id_vecs for name in adaface_encoder_types }
|
870 |
+
# self.num_id_vecs0 is used in the parent class. So we need to initialize it here first.
|
871 |
+
self.encoders_num_id_vecs0 = [ adaface_encoder_types2num_id_vecs0[encoder_type] \
|
872 |
for encoder_type in adaface_encoder_types ]
|
873 |
+
self.encoders_num_id_vecs = [ adaface_encoder_types2num_id_vecs[encoder_type] \
|
874 |
+
for encoder_type in adaface_encoder_types ]
|
875 |
+
self.num_id_vecs0 = sum(self.encoders_num_id_vecs0)
|
876 |
+
self.num_id_vecs = sum(self.encoders_num_id_vecs)
|
877 |
+
# super() sets self.is_training.
|
878 |
super().__init__(*args, **kwargs)
|
879 |
|
880 |
self.num_sub_encoders = len(adaface_encoder_types)
|
881 |
self.id2ada_prompt_encoders = nn.ModuleList()
|
882 |
self.encoders_num_static_img_suffix_embs = []
|
883 |
+
self.default_enable_static_img_suffix_embs = []
|
884 |
|
885 |
# TODO: apply adaface_encoder_cfg_scales to influence the final prompt embeddings.
|
886 |
# Now they are just placeholders.
|
|
|
890 |
self.out_id_embs_cfg_scales = [-1] * self.num_sub_encoders
|
891 |
else:
|
892 |
# Do not normalize the weights, and just use them as is.
|
893 |
+
self.out_id_embs_cfg_scales = list(out_id_embs_cfg_scales)
|
894 |
|
895 |
# Note we don't pass the adaface_ckpt_paths to the base class, but instead,
|
896 |
# we load them once and for all in self.load_adaface_ckpt().
|
897 |
+
# NOTE: during inference, num_static_img_suffix_embs is fixed to be 4 for each encoder.
|
898 |
+
# But we can still disable static_img_suffix_embs by setting enable_static_img_suffix_embs to False.
|
899 |
for i, encoder_type in enumerate(adaface_encoder_types):
|
900 |
kwargs['out_id_embs_cfg_scale'] = self.out_id_embs_cfg_scales[i]
|
901 |
if encoder_type == 'arc2face':
|
|
|
904 |
encoder = ConsistentID_ID2AdaPrompt(*args, **kwargs)
|
905 |
else:
|
906 |
breakpoint()
|
907 |
+
|
908 |
self.id2ada_prompt_encoders.append(encoder)
|
909 |
self.encoders_num_static_img_suffix_embs.append(encoder.num_static_img_suffix_embs)
|
910 |
+
self.default_enable_static_img_suffix_embs.append(encoder.default_enable_static_img_suffix_embs)
|
911 |
|
912 |
self.num_static_img_suffix_embs = sum(self.encoders_num_static_img_suffix_embs)
|
913 |
# No need to set gen_neg_img_prompt, as we don't access it in this class, but rather
|
|
|
923 |
# Therefore, the clip_embedding_dim is the sum of the clip_embedding_dims of all adaface encoders.
|
924 |
self.clip_embedding_dims = [encoder.clip_embedding_dim for encoder in self.id2ada_prompt_encoders]
|
925 |
self.clip_embedding_dim = sum(self.clip_embedding_dims)
|
926 |
+
|
927 |
# The ctors of the derived classes have already initialized encoder.subj_basis_generator.
|
928 |
# If subj_basis_generator expansion params are specified, they are equally applied to all adaface encoders.
|
929 |
# This self.subj_basis_generator is not meant to be called as self.subj_basis_generator(), but instead,
|
|
|
931 |
self.subj_basis_generator = \
|
932 |
nn.ModuleList( [encoder.subj_basis_generator for encoder \
|
933 |
in self.id2ada_prompt_encoders] )
|
934 |
+
|
935 |
+
# load_adaface_ckpt() loads into self.subj_basis_generator. So we need to initialize self.subj_basis_generator first.
|
936 |
if adaface_ckpt_paths is not None:
|
937 |
self.load_adaface_ckpt(adaface_ckpt_paths)
|
938 |
+
|
939 |
print(f"{self.name} ada prompt encoder initialized with {self.num_sub_encoders} sub-encoders. "
|
940 |
+
f"ID vecs: {self.num_id_vecs0}, static suffix embs: {self.num_static_img_suffix_embs}.")
|
941 |
|
942 |
if enabled_encoders is not None:
|
943 |
self.are_encoders_enabled = \
|
|
|
953 |
else:
|
954 |
self.are_encoders_enabled = \
|
955 |
torch.tensor([True] * self.num_sub_encoders)
|
956 |
+
|
957 |
def load_adaface_ckpt(self, adaface_ckpt_paths):
|
|
958 |
if isinstance(adaface_ckpt_paths, (list, tuple, ListConfig)):
|
959 |
+
# If multiple adaface ckpt paths are provided, then we assume they are the
|
960 |
+
# ckpts of the sub-encoders.
|
961 |
+
if len(adaface_ckpt_paths) == self.num_sub_encoders:
|
962 |
+
for i, ckpt_path in enumerate(adaface_ckpt_paths):
|
963 |
+
self.id2ada_prompt_encoders[i].load_adaface_ckpt(ckpt_path)
|
964 |
+
return
|
965 |
+
# If only one adaface ckpt path is provided, then we assume it's the ckpt of the Joint_FaceID2AdaPrompt,
|
966 |
+
# so we dereference the list to get the actual path and load the subj_basis_generators of all adaface encoders.
|
967 |
+
elif len(adaface_ckpt_paths) == 1 and self.num_sub_encoders > 1:
|
968 |
adaface_ckpt_paths = adaface_ckpt_paths[0]
|
969 |
+
else:
|
|
|
|
|
970 |
breakpoint()
|
971 |
|
972 |
+
adaface_ckpt_path = adaface_ckpt_paths
|
973 |
+
assert isinstance(adaface_ckpt_path, str), "adaface_ckpt_path should be a string."
|
974 |
+
# This is only applicable to newest ckpts of Joint_FaceID2AdaPrompt, where
|
975 |
+
# the ckpt_subj_basis_generator is an nn.ModuleList of multiple subj_basis_generators.
|
976 |
+
# Therefore, no need to patch missing variables.
|
977 |
+
ckpt = torch.load(adaface_ckpt_paths, map_location='cpu', weights_only=False)
|
978 |
+
string_to_subj_basis_generator_dict = ckpt["string_to_subj_basis_generator_dict"]
|
979 |
+
if self.subject_string not in string_to_subj_basis_generator_dict:
|
980 |
+
print(f"Subject '{self.subject_string}' not found in the embedding manager.")
|
981 |
+
breakpoint()
|
982 |
+
|
983 |
+
ckpt_subj_basis_generators = string_to_subj_basis_generator_dict[self.subject_string]
|
984 |
+
if len(ckpt_subj_basis_generators) != self.num_sub_encoders:
|
985 |
+
print(f"Number of subj_basis_generators in the ckpt ({len(ckpt_subj_basis_generators)}) "
|
986 |
+
f"doesn't match the number of adaface encoders ({self.num_sub_encoders}).")
|
|
|
|
987 |
breakpoint()
|
988 |
|
989 |
+
for i, subj_basis_generator in enumerate(self.subj_basis_generator):
|
990 |
+
ckpt_subj_basis_generator = ckpt_subj_basis_generators[i]
|
991 |
+
# Handle differences in num_static_img_suffix_embs between the current model and the ckpt.
|
992 |
+
ckpt_subj_basis_generator.initialize_static_img_suffix_embs(self.encoders_num_static_img_suffix_embs[i],
|
993 |
+
img_prompt_dim=self.output_dim)
|
994 |
+
|
995 |
+
if subj_basis_generator.prompt2token_proj_attention_multipliers \
|
996 |
+
== [1] * 12:
|
997 |
+
subj_basis_generator.extend_prompt2token_proj_attention(\
|
998 |
+
ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, -1, -1, 1, perturb_std=0)
|
999 |
+
elif subj_basis_generator.prompt2token_proj_attention_multipliers \
|
1000 |
+
!= ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers:
|
1001 |
+
raise ValueError("Inconsistent prompt2token_proj_attention_multipliers.")
|
1002 |
+
|
1003 |
+
assert subj_basis_generator.prompt2token_proj_attention_multipliers \
|
1004 |
+
== ckpt_subj_basis_generator.prompt2token_proj_attention_multipliers, \
|
1005 |
+
"Inconsistent prompt2token_proj_attention_multipliers."
|
1006 |
+
subj_basis_generator.load_state_dict(ckpt_subj_basis_generator.state_dict())
|
1007 |
+
|
1008 |
+
# extend_prompt2token_proj_attention_multiplier is an integer >= 1.
|
1009 |
+
# TODO: extend_prompt2token_proj_attention_multiplier should be a list of integers.
|
1010 |
+
# If extend_prompt2token_proj_attention_multiplier > 1, then after loading state_dict,
|
1011 |
+
# extend subj_basis_generator again.
|
1012 |
+
if self.extend_prompt2token_proj_attention_multiplier > 1:
|
1013 |
+
# During this extension, the added noise does change the extra copies of attention weights, since they are not in the ckpt.
|
1014 |
+
# During training, prompt2token_proj_ext_attention_perturb_ratio == 0.1.
|
1015 |
+
# During inference, prompt2token_proj_ext_attention_perturb_ratio == 0.
|
1016 |
+
subj_basis_generator.extend_prompt2token_proj_attention(\
|
1017 |
+
None, -1, -1, self.extend_prompt2token_proj_attention_multiplier,
|
1018 |
+
perturb_std=self.prompt2token_proj_ext_attention_perturb_ratio)
|
1019 |
+
|
1020 |
+
subj_basis_generator.freeze_prompt2token_proj()
|
1021 |
+
|
1022 |
+
print(f"{adaface_ckpt_paths}: {len(self.subj_basis_generator)} subj_basis_generators loaded for {self.name}.")
|
1023 |
+
|
1024 |
+
def set_out_id_embs_cfg_scale(self, out_id_embs_cfg_scales):
|
1025 |
+
self.out_id_embs_cfg_scales = list(out_id_embs_cfg_scales)
|
1026 |
+
for i, out_id_embs_cfg_scale in enumerate(out_id_embs_cfg_scales):
|
1027 |
+
self.id2ada_prompt_encoders[i].set_out_id_embs_cfg_scale(out_id_embs_cfg_scale)
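
A one-line usage sketch: the scales are given per sub-encoder, in the same order as adaface_encoder_types (values are illustrative):

joint.set_out_id_embs_cfg_scale([6.0, 1.0])   # e.g. consistentID, arc2face
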
|
1028 |
+
|
1029 |
def extract_init_id_embeds_from_images(self, *args, **kwargs):
|
1030 |
total_faceless_img_count = 0
|
1031 |
all_id_embs = []
|
|
|
1163 |
|
1164 |
N_ID = self.encoders_num_id_vecs[i]
|
1165 |
if all_pos_prompt_embs[i] is None:
|
1166 |
+
# Both pos_prompt_embs and neg_prompt_embs have N_ID == num_id_vecs0 embeddings.
|
1167 |
all_pos_prompt_embs[i] = torch.zeros((BS, N_ID, 768), dtype=torch.float16, device=device)
|
1168 |
if all_neg_prompt_embs[i] is None:
|
1169 |
all_neg_prompt_embs[i] = torch.zeros((BS, N_ID, 768), dtype=torch.float16, device=device)
|
|
|
1185 |
# So its .device is the device of its parameters.
|
1186 |
device = self.id2ada_prompt_encoders[0].clip_image_encoder.device
|
1187 |
is_emb_averaged = kwargs.get('avg_at_stage', None) is not None
|
1188 |
+
if kwargs.get('enable_static_img_suffix_embs', None) is None:
|
1189 |
+
enable_static_img_suffix_embs = self.default_enable_static_img_suffix_embs
|
1190 |
+
else:
|
1191 |
+
enable_static_img_suffix_embs = kwargs['enable_static_img_suffix_embs']
|
1192 |
+
if isinstance(enable_static_img_suffix_embs, bool):
|
1193 |
+
enable_static_img_suffix_embs = [enable_static_img_suffix_embs] * self.num_sub_encoders
|
1194 |
+
|
1195 |
BS = -1
|
1196 |
|
1197 |
if face_id_embs is not None:
|
|
|
1199 |
all_face_id_embs = face_id_embs.split(self.face_id_dims, dim=1)
|
1200 |
else:
|
1201 |
all_face_id_embs = [None] * self.num_sub_encoders
|
1202 |
+
|
1203 |
if img_prompt_embs is not None:
|
1204 |
BS = img_prompt_embs.shape[0] if BS == -1 else BS
|
1205 |
+
if img_prompt_embs.shape[1] != self.num_id_vecs0:
|
1206 |
breakpoint()
|
1207 |
+
all_img_prompt_embs = img_prompt_embs.split(self.encoders_num_id_vecs0, dim=1)
|
1208 |
+
img_prompt_embs_provided = True
|
1209 |
else:
|
1210 |
all_img_prompt_embs = [None] * self.num_sub_encoders
|
1211 |
+
img_prompt_embs_provided = False
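
The fused tensors are split back per sub-encoder with torch.split(), which accepts a list of segment sizes. A shape sketch (segment sizes follow the num_id_vecs0 class attributes; encoder order follows adaface_encoder_types):

import torch
encoders_num_id_vecs0 = [4, 16]            # e.g. consistentID, arc2face
img_prompt_embs = torch.randn(2, sum(encoders_num_id_vecs0), 768)
per_encoder = img_prompt_embs.split(encoders_num_id_vecs0, dim=1)
print([t.shape for t in per_encoder])      # [2, 4, 768] and [2, 16, 768]
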
|
1212 |
+
|
1213 |
if image_paths is not None:
|
1214 |
BS = len(image_paths) if BS == -1 else BS
|
1215 |
if BS == -1:
|
|
|
1232 |
else:
|
1233 |
are_encoders_enabled = self.are_encoders_enabled
|
1234 |
|
1235 |
+
self.curr_are_encoders_enabled = are_encoders_enabled
|
1236 |
all_adaface_subj_embs = []
|
1237 |
num_available_id_vecs = 0
|
1238 |
+
lens_subj_emb_segments = []
|
1239 |
|
1240 |
for i, id2ada_prompt_encoder in enumerate(self.id2ada_prompt_encoders):
|
1241 |
if not are_encoders_enabled[i]:
|
1242 |
adaface_subj_embs = None
|
1243 |
+
print(f"Encoder {id2ada_prompt_encoder.name} is disabled.")
|
1244 |
+
N_ID = id2ada_prompt_encoder.num_id_vecs + enable_static_img_suffix_embs[i] \
|
1245 |
+
* id2ada_prompt_encoder.num_static_img_suffix_embs
|
1246 |
else:
|
1247 |
+
kwargs['enable_static_img_suffix_embs'] = enable_static_img_suffix_embs[i]
|
1248 |
# ddpm.embedding_manager.train() -> id2ada_prompt_encoder.train() -> each sub-encoder's train().
|
1249 |
# -> each sub-encoder's subj_basis_generator.train().
|
1250 |
# Therefore grad for the following call is enabled.
|
1251 |
+
adaface_subj_embs, img_prompt_embs, encoder_lens_subj_emb_segments = \
|
1252 |
id2ada_prompt_encoder.generate_adaface_embeddings(image_paths,
|
1253 |
all_face_id_embs[i],
|
1254 |
all_img_prompt_embs[i],
|
1255 |
*args, **kwargs)
|
1256 |
|
1257 |
+
# adaface_subj_embs: arc2face [16, 768] or consistentID [4, 768],
|
1258 |
+
# or arc2face [20, 768] or consistentID [8, 768] if enable_static_img_suffix_embs=True.
|
1259 |
+
N_ID = encoder_lens_subj_emb_segments[0]
|
1260 |
+
|
1261 |
if adaface_subj_embs is None:
|
1262 |
if not return_zero_embs_for_dropped_encoders:
|
1263 |
continue
|
|
|
1268 |
all_adaface_subj_embs.append(adaface_subj_embs)
|
1269 |
else:
|
1270 |
all_adaface_subj_embs.append(adaface_subj_embs)
|
1271 |
+
if not img_prompt_embs_provided:
|
1272 |
+
all_img_prompt_embs[i] = img_prompt_embs
|
1273 |
num_available_id_vecs += N_ID
|
1274 |
+
|
1275 |
+
lens_subj_emb_segments.append(N_ID)
|
1276 |
+
|
1277 |
# No faces are found in the images, so return None embeddings.
|
1278 |
# We don't want to return an all-zero embedding, which is useless.
|
1279 |
if num_available_id_vecs == 0:
|
1280 |
+
return None, None, [0]  # keep the 3-tuple signature: (subj_embs, img_prompt_embs, lens_subj_emb_segments)
|
1281 |
|
1282 |
# If id2ada_prompt_encoders are ["arc2face", "consistentID"], then
|
1283 |
# during inference, we average across the batch dim.
|
|
|
1287 |
# all_adaface_subj_embs[0]: [BS, 4, 768]. all_adaface_subj_embs[1]: [BS, 16, 768].
|
1288 |
# all_adaface_subj_embs: [BS, 20, 768].
|
1289 |
all_adaface_subj_embs = torch.cat(all_adaface_subj_embs, dim=-2)
|
1290 |
+
# Check if some of the img_prompt_embs are None.
|
1291 |
+
if None in all_img_prompt_embs:
|
1292 |
+
all_img_prompt_embs = None
|
1293 |
+
else:
|
1294 |
+
all_img_prompt_embs = torch.cat(all_img_prompt_embs, dim=-2)
|
1295 |
+
return all_adaface_subj_embs, all_img_prompt_embs, lens_subj_emb_segments
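
A shape-level sketch of the joint return value for the default two-encoder setup (tensors are stand-ins):

import torch
consistentID_embs = torch.randn(1, 4, 768)
arc2face_embs     = torch.randn(1, 16, 768)
joint_subj_embs = torch.cat([consistentID_embs, arc2face_embs], dim=-2)  # [1, 20, 768]
lens_subj_emb_segments = [4, 16]   # lets callers slice joint_subj_embs per encoder
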
|
1296 |
|
1297 |
|
1298 |
'''
|
adaface/subj_basis_generator.py
CHANGED
@@ -9,7 +9,7 @@ import torch
|
|
9 |
from torch import nn
|
10 |
from einops import rearrange
|
11 |
from einops.layers.torch import Rearrange
|
12 |
-
from transformers import CLIPTokenizer, CLIPTextModel
|
13 |
|
14 |
from torch import einsum
|
15 |
from adaface.util import gen_gradient_scaler
|
@@ -57,7 +57,25 @@ class IP_MLPProjModel(nn.Module):
|
|
57 |
x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
|
58 |
x = self.norm(x)
|
59 |
return x
|
60 |
-
|
|
|
|
61 |
# group_dim: the tensor dimension that corresponds to the multiple groups.
|
62 |
class LearnedSoftAggregate(nn.Module):
|
63 |
def __init__(self, num_feat, group_dim, keepdim=False):
|
@@ -349,23 +367,26 @@ class CrossAttention(nn.Module):
|
|
349 |
else:
|
350 |
return out
|
351 |
|
|
|
352 |
class ImgPrompt2TextPrompt(nn.Module):
|
353 |
-
def __init__(self, placeholder_is_bg, num_id_vecs,
|
|
|
354 |
super().__init__()
|
355 |
self.N_ID = num_id_vecs
|
356 |
# If not placeholder_is_bg, then N_SFX will be updated in initialize_text_components().
|
357 |
self.N_SFX = 0
|
|
|
358 |
|
359 |
if not placeholder_is_bg:
|
360 |
-
self.
|
|
|
361 |
|
362 |
# prompt2token_proj: arc2face_models.py CLIPTextModelWrapper instance with **custom weights**.
|
363 |
# prompt2token_proj is with the same architecture as the original arc2face text encoder,
|
364 |
# but retrained to do inverse mapping.
|
365 |
# To be initialized in the subclass.
|
366 |
self.prompt2token_proj = None
|
367 |
-
|
368 |
-
|
369 |
def initialize_static_img_suffix_embs(self, num_static_img_suffix_embs, img_prompt_dim=768):
|
370 |
self.N_SFX = num_static_img_suffix_embs
|
371 |
# We always take the first num_static_img_suffix_embs embeddings out of static_img_suffix_embs.
|
@@ -376,11 +397,11 @@ class ImgPrompt2TextPrompt(nn.Module):
|
|
376 |
print(f"static_img_suffix_embs had been initialized to be {self.static_img_suffix_embs.shape[1]} vecs ({self.N_SFX} required). Skip initialization.")
|
377 |
elif self.static_img_suffix_embs.shape[1] < self.N_SFX:
|
378 |
print(f"static_img_suffix_embs had been initialized to be {self.static_img_suffix_embs.shape[1]} vecs (< {self.N_SFX} required). Reinitialize.")
|
379 |
-
self.static_img_suffix_embs = nn.Parameter(torch.randn(1, self.N_SFX, img_prompt_dim))
|
380 |
elif self.N_SFX > 0:
|
381 |
# self.static_img_suffix_embs.shape[1] > self.N_SFX > 0.
|
382 |
print(f"static_img_suffix_embs had been initialized to be {self.static_img_suffix_embs.shape[1]} vecs (> {self.N_SFX} required). Truncate.")
|
383 |
-
self.static_img_suffix_embs = nn.Parameter(self.static_img_suffix_embs[:, :self.N_SFX])
|
384 |
else:
|
385 |
# self.static_img_suffix_embs.shape[1] > self.N_SFX == 0.
|
386 |
print(f"static_img_suffix_embs had been initialized to be {self.static_img_suffix_embs.shape[1]} vecs (0 required). Erase.")
|
@@ -391,7 +412,7 @@ class ImgPrompt2TextPrompt(nn.Module):
|
|
391 |
# or it's initialized but has fewer than num_static_img_suffix_embs embeddings (this situation should be very rare,
|
392 |
# so we don't consider to reuse and extend a shorter static_img_suffix_embs).
|
393 |
# So we reinitialize it.
|
394 |
-
self.static_img_suffix_embs = nn.Parameter(torch.randn(1, self.N_SFX, img_prompt_dim))
|
395 |
else:
|
396 |
# If static_img_suffix_embs had been initialized, then it will be set to None, i.e., erased from the SubjBasisGenerator instance.
|
397 |
self.static_img_suffix_embs = None
|
@@ -399,9 +420,7 @@ class ImgPrompt2TextPrompt(nn.Module):
|
|
399 |
# Implement a separate initialization function, so that it can be called from SubjBasisGenerator
|
400 |
# after the SubjBasisGenerator is initialized. This can be used to fix old SubjBasisGenerator
|
401 |
# ckpts which were not subclassed from ImgPrompt2TextPrompt.
|
402 |
-
def initialize_text_components(self, max_prompt_length=77,
|
403 |
-
num_static_img_suffix_embs=0, img_prompt_dim=768):
|
404 |
-
self.initialize_static_img_suffix_embs(num_static_img_suffix_embs, img_prompt_dim)
|
405 |
self.max_prompt_length = max_prompt_length
|
406 |
self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
|
407 |
# clip_text_embeddings: CLIPTextEmbeddings instance.
|
@@ -416,7 +435,7 @@ class ImgPrompt2TextPrompt(nn.Module):
|
|
416 |
# pad_embeddings is still on CPU. But should be moved to GPU automatically.
|
417 |
# Note: detach pad_embeddings from the computation graph, otherwise
|
418 |
# deepcopy() in embedding_manager.py:make_frozen_copy_of_subj_basis_generators() will fail.
|
419 |
-
self.pad_embeddings = clip_text_embeddings(pad_tokens)[0].detach()
|
420 |
|
421 |
# image prompt space -> text prompt space.
|
422 |
# return_emb_types: a list of strings, each string is among
|
@@ -439,7 +458,7 @@ class ImgPrompt2TextPrompt(nn.Module):
|
|
439 |
else:
|
440 |
breakpoint()
|
441 |
else:
|
442 |
-
# len(face_prompt_embs) == 1, this occurs when same_subject_in_batch == True, e.g. in
|
443 |
# But list_extra_words always corresponds to the actual batch size. So we only take the first element.
|
444 |
list_extra_words = list_extra_words[:1]
|
445 |
|
@@ -466,7 +485,7 @@ class ImgPrompt2TextPrompt(nn.Module):
|
|
466 |
face_prompt_embs_orig_dtype = face_prompt_embs.dtype
|
467 |
face_prompt_embs = face_prompt_embs.to(self.dtype)
|
468 |
|
469 |
-
ID_END = 4
|
470 |
PAD_BEGIN = ID_END + self.N_SFX + 2
|
471 |
|
472 |
# token_embs: [1, 77, 768]. This call is only to get the template token embeddings (the shallowest mapping).
|
@@ -545,6 +564,7 @@ class ImgPrompt2TextPrompt(nn.Module):
|
|
545 |
class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
546 |
def __init__(
|
547 |
self,
|
|
|
548 |
# number of cross-attention heads of the bg prompt translator.
|
549 |
# Taken as a half of the number of heads 12 of OpenAI clip-vit-large-patch14:
|
550 |
# https://huggingface.co/openai/clip-vit-large-patch14/blob/main/config.json
|
@@ -553,22 +573,25 @@ class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
|
553 |
# or number of background input identity vectors (no matter the subject is face or not).
|
554 |
# 257: 257 CLIP tokens.
|
555 |
num_nonface_in_id_vecs={ 'subj': 77, 'bg': 257 },
|
|
|
556 |
num_id_vecs=16, # num_id_vecs: subj: 16. bg: 4.
|
557 |
num_static_img_suffix_embs: int = 0, # Number of extra static learnable image embeddings appended to translated ID embeddings.
|
558 |
bg_image_embedding_dim=1024, # CLIP image hidden layer feature dimension, as per config.json above.
|
559 |
obj_embedding_dim=384, # DINO object feature dimension for objects.
|
560 |
output_dim=768, # CLIP text embedding input dimension.
|
|
|
561 |
placeholder_is_bg: bool = False, # Whether the placeholder is for the image background tokens.
|
562 |
-
prompt2token_proj_grad_scale: float = 0.4, # Gradient scale for prompt2token_proj.
|
563 |
learnable_hidden_state_weights_scheme: str = 'per-layer', # none, per-layer.
|
564 |
-
bg_prompt_translator_has_to_out_proj:
|
565 |
):
|
566 |
|
567 |
# If not placeholder_is_bg, then it calls initialize_text_components() in the superclass.
|
568 |
-
super().__init__(placeholder_is_bg=placeholder_is_bg, num_id_vecs=num_id_vecs,
|
569 |
-
num_static_img_suffix_embs=num_static_img_suffix_embs,
|
|
|
570 |
|
571 |
self.placeholder_is_bg = placeholder_is_bg
|
|
|
572 |
self.num_out_embs = self.N_ID + self.N_SFX
|
573 |
self.output_dim = output_dim
|
574 |
# num_nonface_in_id_vecs should be the number of core ID embs, 16.
|
@@ -586,14 +609,18 @@ class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
|
586 |
# self.prompt2token_proj: [1, 16, 768] -> [1, 77, 768] (with paddings) or [1, 16, 768] (without paddings).
|
587 |
# If self.placeholder_is_bg: prompt2token_proj is set to None.
|
588 |
# Use an attention dropout of 0.2 to increase robustness.
|
589 |
-
|
590 |
-
self.prompt2token_proj
|
591 |
-
|
592 |
-
|
593 |
-
|
594 |
-
|
595 |
-
|
596 |
-
|
|
|
|
|
|
|
|
|
597 |
self.freeze_prompt2token_proj()
|
598 |
|
599 |
# These multipliers are relative to the original CLIPTextModel.
|
@@ -631,6 +658,9 @@ class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
|
631 |
identity_to_out=identity_to_out,
|
632 |
out_has_skip=out_has_skip)
|
633 |
|
|
|
634 |
self.output_scale = output_dim ** -0.5
|
635 |
|
636 |
'''
|
@@ -686,21 +716,20 @@ class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
|
686 |
hidden_state_layer_weights = self.hidden_state_layer_weights_grad_scaler(self.hidden_state_layer_weights)
|
687 |
|
688 |
# faceid2img_prompt_embs -> ada_id_embs: image prompt space -> text prompt space.
|
689 |
-
|
690 |
-
|
691 |
-
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
|
696 |
-
|
697 |
-
|
698 |
-
|
699 |
-
|
700 |
-
|
701 |
-
|
702 |
-
|
703 |
-
ada_id_embs = self.prompt2token_proj_grad_scaler(ada_id_embs)
|
704 |
elif raw_id_embs is not None:
|
705 |
# id_embs: [BS, 384] -> [BS, 18, 768].
|
706 |
# obj_proj_in is expected to project the DINO object features to
|
@@ -726,14 +755,15 @@ class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
|
726 |
|
727 |
adaface_out_embs = id_embs_out * self.output_scale # * 0.036
|
728 |
else:
|
729 |
-
|
|
|
730 |
# If out_id_embs_cfg_scale < 1, adaface_out_embs is a mix of adaface_out_embs and pad_embeddings.
|
731 |
if out_id_embs_cfg_scale != 1:
|
732 |
-
# pad_embeddings: [77, 768] -> [16, 768] -> [1, 16, 768].
|
733 |
# NOTE: Never do cfg on static image suffix embeddings.
|
734 |
# So we take self.N_ID embeddings, instead of self.N_ID + self.N_SFX,
|
735 |
# even if enable_static_img_suffix_embs=True.
|
736 |
-
pad_embeddings = self.pad_embeddings[4:4+self.N_ID].unsqueeze(0).to(ada_id_embs.device)
|
737 |
adaface_out_embs[:, :self.N_ID] = ada_id_embs[:, :self.N_ID] * out_id_embs_cfg_scale \
|
738 |
+ pad_embeddings * (1 - out_id_embs_cfg_scale)
|
739 |
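
The two lines above implement a CFG-style linear blend: a scale of 1 keeps the full ID signal, while a scale of 0 collapses the subject tokens to plain padding embeddings. A numeric sketch with stand-in tensors:

import torch
scale = 0.8                               # out_id_embs_cfg_scale
ada_id_embs    = torch.randn(1, 16, 768)
pad_embeddings = torch.randn(1, 16, 768)
mixed = ada_id_embs * scale + pad_embeddings * (1 - scale)
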
|
@@ -812,37 +842,37 @@ class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
|
812 |
# Only applicable to fg basis generator.
|
813 |
if self.placeholder_is_bg:
|
814 |
return
|
815 |
-
|
816 |
-
# Then we don't have to check whether it's for subj or bg.
|
817 |
-
if self.prompt2token_proj_grad_scale == 0:
|
818 |
-
frozen_components_name = 'all'
|
819 |
-
frozen_param_set = self.prompt2token_proj.named_parameters()
|
820 |
-
else:
|
821 |
-
frozen_components_name = 'token_pos_embeddings'
|
822 |
-
frozen_param_set = self.prompt2token_proj.text_model.embeddings.named_parameters()
|
823 |
-
|
824 |
if self.prompt2token_proj is not None:
|
825 |
frozen_param_names = []
|
826 |
-
for param_name, param in frozen_param_set:
|
827 |
if param.requires_grad:
|
828 |
param.requires_grad = False
|
829 |
frozen_param_names.append(param_name)
|
830 |
# If param is already frozen, then no need to freeze it again.
|
831 |
-
print(f"{
|
832 |
#print(f"Frozen parameters:\n{frozen_param_names}")
|
833 |
|
834 |
def patch_old_subj_basis_generator_ckpt(self):
|
835 |
# Fix compatibility with the previous version.
|
836 |
if not hasattr(self, 'bg_prompt_translator_has_to_out_proj'):
|
837 |
self.bg_prompt_translator_has_to_out_proj = False
|
838 |
-
if not hasattr(self, 'num_out_embs'):
|
839 |
-
self.num_out_embs = -1
|
840 |
if hasattr(self, 'num_id_vecs') and not hasattr(self, 'N_ID'):
|
841 |
self.N_ID = self.num_id_vecs
|
|
|
842 |
if not hasattr(self, 'num_nonface_in_id_vecs') and hasattr(self, 'N_ID'):
|
843 |
self.num_nonface_in_id_vecs = self.N_ID
|
844 |
if not hasattr(self, 'dtype'):
|
845 |
-
self.dtype = torch.
|
|
|
846 |
|
847 |
if self.placeholder_is_bg:
|
848 |
if not hasattr(self, 'pos_embs') or self.pos_embs is None:
|
@@ -860,6 +890,14 @@ class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
|
860 |
num_static_img_suffix_embs=self.N_SFX,
|
861 |
img_prompt_dim=self.output_dim)
|
862 |
|
|
|
|
|
|
|
863 |
def __repr__(self):
|
864 |
type_sig = 'subj' if not self.placeholder_is_bg else 'bg'
|
865 |
|
|
|
9 |
from torch import nn
|
10 |
from einops import rearrange
|
11 |
from einops.layers.torch import Rearrange
|
12 |
+
from transformers import CLIPTokenizer, CLIPTextModel
|
13 |
|
14 |
from torch import einsum
|
15 |
from adaface.util import gen_gradient_scaler
|
|
|
57 |
x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
|
58 |
x = self.norm(x)
|
59 |
return x
|
60 |
+
|
61 |
+
class LayerwiseMLPProjWithSkip(nn.Module):
|
62 |
+
def __init__(self, id_embeddings_dim=768, num_layers=16, dim_mult=2):
|
63 |
+
super().__init__()
|
64 |
+
|
65 |
+
self.proj = nn.Sequential(
|
66 |
+
nn.Linear(id_embeddings_dim, id_embeddings_dim*dim_mult*num_layers),
|
67 |
+
Rearrange('b n (l d) -> b n l d', l=num_layers, d=id_embeddings_dim*dim_mult),
|
68 |
+
nn.GELU(),
|
69 |
+
nn.Linear(id_embeddings_dim*dim_mult, id_embeddings_dim),
|
70 |
+
)
|
71 |
+
self.norm = nn.LayerNorm(id_embeddings_dim)
|
72 |
+
|
73 |
+
def forward(self, id_embeds):
|
74 |
+
# B N D -> B N L D  +  B N 1 D (broadcast over the layer dim) -> B N L D
|
75 |
+
x = self.proj(id_embeds) + id_embeds.unsqueeze(2)  # unsqueeze(2) so the skip broadcasts over the L (layer) dim
|
76 |
+
x = self.norm(x)
|
77 |
+
return x
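
A shape sketch through LayerwiseMLPProjWithSkip, assuming the broadcast fix above (one projected copy of each ID embedding per cross-attention layer):

import torch
proj = LayerwiseMLPProjWithSkip(id_embeddings_dim=768, num_layers=16, dim_mult=2)
id_embeds = torch.randn(2, 4, 768)    # [B, N, D]
out = proj(id_embeds)                 # [B, N, L, D] = [2, 4, 16, 768]
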
|
78 |
+
|
79 |
# group_dim: the tensor dimension that corresponds to the multiple groups.
|
80 |
class LearnedSoftAggregate(nn.Module):
|
81 |
def __init__(self, num_feat, group_dim, keepdim=False):
|
|
|
367 |
else:
|
368 |
return out
|
369 |
|
370 |
+
|
371 |
class ImgPrompt2TextPrompt(nn.Module):
|
372 |
+
def __init__(self, placeholder_is_bg, num_id_vecs, num_static_img_suffix_embs,
|
373 |
+
max_prompt_length=77, img_prompt_dim=768, dtype=torch.float16):
|
374 |
super().__init__()
|
375 |
self.N_ID = num_id_vecs
|
376 |
# If not placeholder_is_bg, then N_SFX will be updated in initialize_text_components().
|
377 |
self.N_SFX = 0
|
378 |
+
self.dtype = dtype
|
379 |
|
380 |
if not placeholder_is_bg:
|
381 |
+
self.initialize_static_img_suffix_embs(num_static_img_suffix_embs, img_prompt_dim)
|
382 |
+
self.initialize_text_components(max_prompt_length)
|
383 |
|
384 |
# prompt2token_proj: arc2face_models.py CLIPTextModelWrapper instance with **custom weights**.
|
385 |
# prompt2token_proj is with the same architecture as the original arc2face text encoder,
|
386 |
# but retrained to do inverse mapping.
|
387 |
# To be initialized in the subclass.
|
388 |
self.prompt2token_proj = None
|
389 |
+
|
|
|
390 |
def initialize_static_img_suffix_embs(self, num_static_img_suffix_embs, img_prompt_dim=768):
|
391 |
self.N_SFX = num_static_img_suffix_embs
|
392 |
# We always take the first num_static_img_suffix_embs embeddings out of static_img_suffix_embs.
|
|
|
397 |
print(f"static_img_suffix_embs had been initialized to be {self.static_img_suffix_embs.shape[1]} vecs ({self.N_SFX} required). Skip initialization.")
|
398 |
elif self.static_img_suffix_embs.shape[1] < self.N_SFX:
|
399 |
print(f"static_img_suffix_embs had been initialized to be {self.static_img_suffix_embs.shape[1]} vecs (< {self.N_SFX} required). Reinitialize.")
|
400 |
+
self.static_img_suffix_embs = nn.Parameter(torch.randn(1, self.N_SFX, img_prompt_dim, dtype=self.dtype))
|
401 |
elif self.N_SFX > 0:
|
402 |
# self.static_img_suffix_embs.shape[1] > self.N_SFX > 0.
|
403 |
print(f"static_img_suffix_embs had been initialized to be {self.static_img_suffix_embs.shape[1]} vecs (> {self.N_SFX} required). Truncate.")
|
404 |
+
self.static_img_suffix_embs = nn.Parameter(self.static_img_suffix_embs[:, :self.N_SFX].to(dtype=self.dtype))
|
405 |
else:
|
406 |
# self.static_img_suffix_embs.shape[1] > self.N_SFX == 0.
|
407 |
print(f"static_img_suffix_embs had been initialized to be {self.static_img_suffix_embs.shape[1]} vecs (0 required). Erase.")
|
|
|
412 |
# or it's initialized but has fewer than num_static_img_suffix_embs embeddings (this situation should be very rare,
|
413 |
# so we don't consider to reuse and extend a shorter static_img_suffix_embs).
|
414 |
# So we reinitialize it.
|
415 |
+
self.static_img_suffix_embs = nn.Parameter(torch.randn(1, self.N_SFX, img_prompt_dim, dtype=self.dtype))
|
416 |
else:
|
417 |
# If static_img_suffix_embs had been initialized, then it will be set to None, i.e., erased from the SubjBasisGenerator instance.
|
418 |
self.static_img_suffix_embs = None
|
|
|
420 |
# Implement a separate initialization function, so that it can be called from SubjBasisGenerator
|
421 |
# after the SubjBasisGenerator is initialized. This can be used to fix old SubjBasisGenerator
|
422 |
# ckpts which were not subclassed from ImgPrompt2TextPrompt.
|
423 |
+
def initialize_text_components(self, max_prompt_length=77):
|
|
|
|
|
424 |
self.max_prompt_length = max_prompt_length
|
425 |
self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
|
426 |
# clip_text_embeddings: CLIPTextEmbeddings instance.
|
|
|
435 |
# pad_embeddings is still on CPU. But should be moved to GPU automatically.
|
436 |
# Note: detach pad_embeddings from the computation graph, otherwise
|
437 |
# deepcopy() in embedding_manager.py:make_frozen_copy_of_subj_basis_generators() will fail.
|
438 |
+
self.pad_embeddings = clip_text_embeddings(pad_tokens)[0].detach().to(self.dtype)
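
The pad embeddings come straight from CLIP's own embedding table. A sketch of producing such a [77, 768] pad-embedding tensor with HuggingFace transformers (the repo builds pad_tokens elsewhere, so the tokenization step here is an assumption):

import torch
from transformers import CLIPTokenizer, CLIPTextModel

tokenizer  = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
# An empty prompt tokenizes to BOS, EOS, then pad tokens up to max_length.
pad_tokens = tokenizer("", padding="max_length", max_length=77,
                       return_tensors="pt").input_ids
# CLIPTextEmbeddings returns token + positional embeddings: [1, 77, 768].
pad_embeddings = text_model.text_model.embeddings(pad_tokens)[0].detach()
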
|
439 |
|
440 |
# image prompt space -> text prompt space.
|
441 |
# return_emb_types: a list of strings, each string is among
|
|
|
458 |
else:
|
459 |
breakpoint()
|
460 |
else:
|
461 |
+
# len(face_prompt_embs) == 1, this occurs when same_subject_in_batch == True, e.g. in do_feat_distill_on_comp_prompt.
|
462 |
# But list_extra_words always corresponds to the actual batch size. So we only take the first element.
|
463 |
list_extra_words = list_extra_words[:1]
|
464 |
|
|
|
485 |
face_prompt_embs_orig_dtype = face_prompt_embs.dtype
|
486 |
face_prompt_embs = face_prompt_embs.to(self.dtype)
|
487 |
|
488 |
+
ID_END = 4 + self.N_ID
|
489 |
PAD_BEGIN = ID_END + self.N_SFX + 2
|
490 |
|
491 |
# token_embs: [1, 77, 768]. This call is only to get the template token embeddings (the shallowest mapping).
|
|
|
564 |
class SubjBasisGenerator(ImgPrompt2TextPrompt):
|
565 |
def __init__(
|
566 |
self,
|
567 |
+
dtype=torch.float16,
|
568 |
# number of cross-attention heads of the bg prompt translator.
|
569 |
# Taken as a half of the number of heads 12 of OpenAI clip-vit-large-patch14:
|
570 |
# https://huggingface.co/openai/clip-vit-large-patch14/blob/main/config.json
|
|
|
573 |
# or number of background input identity vectors (no matter the subject is face or not).
|
574 |
# 257: 257 CLIP tokens.
|
575 |
num_nonface_in_id_vecs={ 'subj': 77, 'bg': 257 },
|
576 |
+
num_ca_layers=16,
|
577 |
num_id_vecs=16, # num_id_vecs: subj: 16. bg: 4.
|
578 |
num_static_img_suffix_embs: int = 0, # Number of extra static learnable image embeddings appended to translated ID embeddings.
|
579 |
bg_image_embedding_dim=1024, # CLIP image hidden layer feature dimension, as per config.json above.
|
580 |
obj_embedding_dim=384, # DINO object feature dimension for objects.
|
581 |
output_dim=768, # CLIP text embedding input dimension.
|
582 |
+
use_layerwise_proj: bool = False, # Whether to use layerwise projection.
|
583 |
placeholder_is_bg: bool = False, # Whether the placeholder is for the image background tokens.
|
|
|
584 |
learnable_hidden_state_weights_scheme: str = 'per-layer', # none, per-layer.
|
585 |
+
bg_prompt_translator_has_to_out_proj: bool = False, # Whether the prompt_trans_layers have a to_out projection.
|
586 |
):
|
587 |
|
588 |
# If not placeholder_is_bg, then it calls initialize_text_components() in the superclass.
|
589 |
+
super().__init__(placeholder_is_bg=placeholder_is_bg, num_id_vecs=num_id_vecs,
|
590 |
+
num_static_img_suffix_embs=num_static_img_suffix_embs,
|
591 |
+
max_prompt_length=77, img_prompt_dim=output_dim, dtype=dtype)
|
592 |
|
593 |
self.placeholder_is_bg = placeholder_is_bg
|
594 |
+
self.num_ca_layers = num_ca_layers
|
595 |
self.num_out_embs = self.N_ID + self.N_SFX
|
596 |
self.output_dim = output_dim
|
597 |
# num_nonface_in_id_vecs should be the number of core ID embs, 16.
|
|
|
609 |
# self.prompt2token_proj: [1, 16, 768] -> [1, 77, 768] (with paddings) or [1, 16, 768] (without paddings).
|
610 |
# If self.placeholder_is_bg: prompt2token_proj is set to None.
|
611 |
# Use an attention dropout of 0.2 to increase robustness.
|
612 |
+
self.prompt2token_proj = CLIPTextModelWrapper.from_pretrained('openai/clip-vit-large-patch14')
|
613 |
+
self.prompt2token_proj.to(dtype=self.dtype)
|
614 |
+
|
615 |
+
if use_layerwise_proj:
|
616 |
+
# MLPProjWithSkip: MLP with skip connection.
|
617 |
+
# [BS, 4, 768] -> [BS, 16, 4, 768]. Extra 16: 16 layers.
|
618 |
+
self.layerwise_proj = LayerwiseMLPProjWithSkip(output_dim, dim_mult=2)
|
619 |
+
else:
|
620 |
+
self.layerwise_proj = nn.Identity() #Rearrange('b n d -> b l n d', l=16)
|
621 |
+
|
622 |
+
print(f"Subj prompt2token_proj initialized.")
|
623 |
+
# Only freeze token and positional embeddings of the original CLIPTextModel.
|
624 |
self.freeze_prompt2token_proj()
|
625 |
|
626 |
# These multipliers are relative to the original CLIPTextModel.
|
|
|
658 |
identity_to_out=identity_to_out,
|
659 |
out_has_skip=out_has_skip)
|
660 |
|
661 |
+
if self.dtype == torch.float16:
|
662 |
+
self.prompt_translator = self.prompt_translator.half()
|
663 |
+
|
664 |
self.output_scale = output_dim ** -0.5
|
665 |
|
666 |
'''
|
|
|
716 |
hidden_state_layer_weights = self.hidden_state_layer_weights_grad_scaler(self.hidden_state_layer_weights)
|
717 |
|
718 |
# faceid2img_prompt_embs -> ada_id_embs: image prompt space -> text prompt space.
|
719 |
+
# inverse_img_prompt_embs() applies self.prompt2token_proj to faceid2img_prompt_embs.
|
720 |
+
# If list_extra_words is not None, then ada_id_embs: [BS, 18, 768], three leading words, the 16 identity tokens
|
721 |
+
# and (at most) two extra words in adaface_prompt_embs, without BOS and EOS.
|
722 |
+
# If list_extra_words is None, then ada_id_embs: [BS, 16, 768], the 16 identity tokens in adaface_prompt_embs.
|
723 |
+
# hidden_state_layer_weights: [[0.9163], [0.9483], [2.0762]]
|
724 |
+
# ada_id_embs: [BS, 16, 768].
|
725 |
+
# return_emb_types: a list of strings, each string is among
|
726 |
+
# ['full', 'core', 'full_pad', 'full_half_pad'].
|
727 |
+
ada_id_embs, = \
|
728 |
+
self.inverse_img_prompt_embs(faceid2img_prompt_embs,
|
729 |
+
list_extra_words=None,
|
730 |
+
return_emb_types=['core'],
|
731 |
+
hidden_state_layer_weights=hidden_state_layer_weights,
|
732 |
+
enable_static_img_suffix_embs=enable_static_img_suffix_embs)
|
|
|
733 |
elif raw_id_embs is not None:
|
734 |
# id_embs: [BS, 384] -> [BS, 18, 768].
|
735 |
# obj_proj_in is expected to project the DINO object features to
|
|
|
755 |
|
756 |
adaface_out_embs = id_embs_out * self.output_scale # * 0.036
|
757 |
else:
|
758 |
+
# [BS, 16, 768] -> [BS, layers=16, tokens=16, 768]
|
759 |
+
adaface_out_embs = self.layerwise_proj(ada_id_embs)
|
760 |
# If out_id_embs_cfg_scale < 1, adaface_out_embs is a mix of adaface_out_embs and pad_embeddings.
|
761 |
if out_id_embs_cfg_scale != 1:
|
762 |
+
# pad_embeddings: [77, 768] -> [16, 768] -> [1, 1, 16, 768].
|
763 |
# NOTE: Never do cfg on static image suffix embeddings.
|
764 |
# So we take self.N_ID embeddings, instead of self.N_ID + self.N_SFX,
|
765 |
# even if enable_static_img_suffix_embs=True.
|
766 |
+
pad_embeddings = self.pad_embeddings[4:4+self.N_ID].unsqueeze(0).unsqueeze(1).to(ada_id_embs.device)
|
767 |
adaface_out_embs[:, :self.N_ID] = ada_id_embs[:, :self.N_ID] * out_id_embs_cfg_scale \
|
768 |
+ pad_embeddings * (1 - out_id_embs_cfg_scale)
|
769 |
|
|
|
842 |
# Only applicable to fg basis generator.
|
843 |
if self.placeholder_is_bg:
|
844 |
return
|
845 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
846 |
if self.prompt2token_proj is not None:
|
847 |
frozen_param_names = []
|
848 |
+
for param_name, param in self.prompt2token_proj.text_model.embeddings.named_parameters():
|
849 |
if param.requires_grad:
|
850 |
param.requires_grad = False
|
851 |
frozen_param_names.append(param_name)
|
852 |
# If param is already frozen, then no need to freeze it again.
|
853 |
+
print(f"{len(frozen_param_names)} params of token_pos_embeddings in Subj prompt2token_proj is frozen.")
|
854 |
#print(f"Frozen parameters:\n{frozen_param_names}")
|
855 |
|
856 |
def patch_old_subj_basis_generator_ckpt(self):
|
857 |
# Fix compatability with the previous version.
|
858 |
if not hasattr(self, 'bg_prompt_translator_has_to_out_proj'):
|
859 |
self.bg_prompt_translator_has_to_out_proj = False
|
|
|
|
|
860 |
if hasattr(self, 'num_id_vecs') and not hasattr(self, 'N_ID'):
|
861 |
self.N_ID = self.num_id_vecs
|
862 |
+
# Update the number of output embeddings.
|
863 |
+
self.num_out_embs = self.N_ID + self.N_SFX
|
864 |
+
|
865 |
if not hasattr(self, 'num_nonface_in_id_vecs') and hasattr(self, 'N_ID'):
|
866 |
self.num_nonface_in_id_vecs = self.N_ID
|
867 |
if not hasattr(self, 'dtype'):
|
868 |
+
self.dtype = torch.float16
|
869 |
+
if not self.placeholder_is_bg:
|
870 |
+
self.prompt2token_proj.to(dtype=self.dtype)
|
871 |
+
else:
|
872 |
+
self.prompt_translator.half()
|
873 |
+
|
874 |
+
if not hasattr(self, 'num_ca_layers'):
|
875 |
+
self.num_ca_layers = 16
|
876 |
|
877 |
if self.placeholder_is_bg:
|
878 |
if not hasattr(self, 'pos_embs') or self.pos_embs is None:
|
|
|
890 |
num_static_img_suffix_embs=self.N_SFX,
|
891 |
img_prompt_dim=self.output_dim)
|
892 |
|
893 |
+
if not hasattr(self, 'use_layerwise_proj'):
|
894 |
+
self.use_layerwise_proj = False
|
895 |
+
if not hasattr(self, 'layerwise_proj'):
|
896 |
+
if self.use_layerwise_proj:
|
897 |
+
self.layerwise_proj = LayerwiseMLPProjWithSkip(self.output_dim, dim_mult=2)
|
898 |
+
else:
|
899 |
+
self.layerwise_proj = nn.Identity()
|
900 |
+
|
901 |
def __repr__(self):
|
902 |
type_sig = 'subj' if not self.placeholder_is_bg else 'bg'
|
903 |
|
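A note on the out_id_embs_cfg_scale mixing above: when the scale is below 1, the translated ID tokens are linearly blended with the fixed CLIP pad embeddings, which weakens the identity signal in a controlled way. Below is a minimal sketch of just that step, assuming the tensor shapes stated in the diff's comments; all tensors and names (N_ID, scale) are illustrative stand-ins, not repo code.

import torch

# Sketch of the out_id_embs_cfg_scale mixing (shapes taken from the comments
# in the diff above; the tensors here are random stand-ins).
N_ID, D = 16, 768
ada_id_embs    = torch.randn(2, N_ID, D)   # [BS, 16, 768] translated ID embeddings
pad_embeddings = torch.randn(N_ID, D)      # the [4:4+N_ID] slice of the 77 pad embeddings

scale = 0.8                                # out_id_embs_cfg_scale < 1
mixed = ada_id_embs * scale + pad_embeddings.unsqueeze(0) * (1 - scale)
print(mixed.shape)                         # torch.Size([2, 16, 768])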
adaface/unet_teachers.py CHANGED
@@ -1,6 +1,6 @@
 import torch
+from torch import nn
 import numpy as np
-import pytorch_lightning as pl
 from diffusers import UNet2DConditionModel
 from adaface.util import UNetEnsemble, create_consistentid_pipeline
 from diffusers import UNet2DConditionModel
@@ -12,9 +12,9 @@ def create_unet_teacher(teacher_type, device='cpu', **kwargs):
         teacher_type = teacher_type[0]

     if teacher_type == "arc2face":
+        teacher = Arc2FaceTeacher(**kwargs)
     elif teacher_type == "unet_ensemble":
-        # unet, extra_unet_dirpaths and
+        # unet, extra_unet_dirpaths and unet_weights_in_ensemble are passed in kwargs.
         # Even if we distill from unet_ensemble, we still need to load arc2face for generating
         # arc2face embeddings.
         # The first (optional) ctor param of UNetEnsembleTeacher is an instantiated unet,
@@ -22,20 +22,24 @@
         # However, since the __call__ method of the ddpm unet takes different formats of params,
         # for simplicity, we still use the diffusers unet.
         # unet_teacher is put on CPU first, then moved to GPU when DDPM is moved to GPU.
+        teacher = UNetEnsembleTeacher(device=device, **kwargs)
     elif teacher_type == "consistentID":
+        teacher = ConsistentIDTeacher(**kwargs)
     elif teacher_type == "simple_unet":
+        teacher = SimpleUNetTeacher(**kwargs)
     # Since we've dereferenced the list if it has only one element,
     # this holding implies the list has more than one element. Therefore it's UNetEnsembleTeacher.
     elif isinstance(teacher_type, (tuple, list, ListConfig)):
         # teacher_type is a list of teacher types. So it's UNetEnsembleTeacher.
+        teacher = UNetEnsembleTeacher(unet_types=teacher_type, device=device, **kwargs)
     else:
         raise NotImplementedError(f"Teacher type {teacher_type} not implemented.")

+    for param in teacher.parameters():
+        param.requires_grad = False
+    return teacher

-class UNetTeacher(pl.LightningModule):
+class UNetTeacher(nn.Module):
     def __init__(self, **kwargs):
         super().__init__()
         self.name = None
@@ -56,9 +60,10 @@ class UNetTeacher(pl.LightningModule):
     # to be initialized, which will unnecessarily complicate the code.
     # noise: the initial noise for the first iteration.
     # t: the initial t. We will sample additional (num_denoising_steps - 1) smaller t.
-    #
-    def forward(self, ddpm_model, x_start, noise, t, teacher_context,
-                num_denoising_steps=1,
+    # same_t_noise_across_instances: when sampling t and noise, use the same t and noise for all instances.
+    def forward(self, ddpm_model, x_start, noise, t, teacher_context, negative_context=None,
+                num_denoising_steps=1, same_t_noise_across_instances=False,
+                global_t_lb=0, global_t_ub=1000):
         assert num_denoising_steps <= 10

         if self.p_uses_cfg > 0:
@@ -71,27 +76,22 @@

             if self.uses_cfg:
                 print(f"Teacher samples CFG scale {self.cfg_scale:.1f}.")
+                if negative_context is not None:
+                    negative_context = negative_context[:1].repeat(x_start.shape[0], 1, 1)
+
+                # if negative_context is None, then teacher_context is a combination of
+                # (one or multiple if unet_ensemble) pos_context and neg_context.
+                # If negative_context is not None, then teacher_context is only pos_context.
             else:
                 self.cfg_scale = 1
                 print("Teacher does not use CFG.")

-                # If
-                for teacher_context_i in teacher_context:
-                    pos_context, neg_context = torch.chunk(teacher_context_i, 2, dim=0)
-                    if pos_context.shape[0] != x_start.shape[0]:
-                        breakpoint()
-                    teacher_pos_contexts.append(pos_context)
-                teacher_context = teacher_pos_contexts
-            else:
-                pos_context, neg_context = torch.chunk(teacher_context, 2, dim=0)
-                if pos_context.shape[0] != x_start.shape[0]:
-                    breakpoint()
-                teacher_context = pos_context
+            # If negative_context is None, then teacher_context is a combination of
+            # (one or multiple if unet_ensemble) pos_context and neg_context.
+            # Since not uses_cfg, we only need pos_context.
+            # If negative_context is not None, then teacher_context is only pos_context.
+            if negative_context is None:
+                teacher_context = self.extract_pos_context(teacher_context, x_start.shape[0])
         else:
             # p_uses_cfg = 0. Never use CFG.
             self.uses_cfg = False
@@ -102,15 +102,21 @@
             # in case someday we want to switch from CFG to non-CFG during runtime.
             self.cfg_scale = 1

+        is_context_doubled = 2 if (self.uses_cfg and negative_context is None) else 1
         if self.name == 'unet_ensemble':
             # teacher_context is a list of teacher contexts.
             for teacher_context_i in teacher_context:
-                if teacher_context_i.shape[0] != x_start.shape[0] *
+                if teacher_context_i.shape[0] != x_start.shape[0] * is_context_doubled:
                     breakpoint()
         else:
-            if teacher_context.shape[0] != x_start.shape[0] *
+            if teacher_context.shape[0] != x_start.shape[0] * is_context_doubled:
                 breakpoint()
+
+        if same_t_noise_across_instances:
+            # If same_t_noise_across_instances, we use the same t and noise for all instances.
+            t = t[0].repeat(x_start.shape[0])
+            noise = noise[:1].repeat(x_start.shape[0], 1, 1, 1)
+
         # Initially, x_starts only contains the original x_start.
         x_starts = [ x_start ]
         noises = [ noise ]
@@ -125,24 +131,35 @@
             # sqrt_alphas_cumprod[t] * x_start + sqrt_one_minus_alphas_cumprod[t] * noise
             x_noisy = ddpm_model.q_sample(x_start, t, noise)

-            if self.uses_cfg:
+            if self.uses_cfg and self.cfg_scale > 1 and negative_context is None:
                 x_noisy2 = x_noisy.repeat(2, 1, 1, 1)
                 t2 = t.repeat(2)
             else:
                 x_noisy2 = x_noisy
-                t2
+                t2 = t

             # If do_arc2face_distill, then pos_context is [BS=6, 21, 768].
             noise_pred = self.unet(sample=x_noisy2, timestep=t2, encoder_hidden_states=teacher_context,
                                    return_dict=False)[0]
             if self.uses_cfg and self.cfg_scale > 1:
+                if negative_context is None:
+                    pos_noise_pred, neg_noise_pred = torch.chunk(noise_pred, 2, dim=0)
+                else:
+                    # If negative_context is not None, then teacher_context is only pos_context.
+                    pos_noise_pred = noise_pred
+                    with torch.no_grad():
+                        if self.name == 'unet_ensemble':
+                            neg_noise_pred = self.unet.unets[0](sample=x_noisy, timestep=t,
+                                                                encoder_hidden_states=negative_context, return_dict=False)[0]
+                        else:
+                            neg_noise_pred = self.unet(sample=x_noisy, timestep=t,
+                                                       encoder_hidden_states=negative_context, return_dict=False)[0]
+
                 noise_pred = pos_noise_pred * self.cfg_scale - neg_noise_pred * (self.cfg_scale - 1)

-            # sqrt_recip_alphas_cumprod[t] * x_t - sqrt_recipm1_alphas_cumprod[t] * noise
-            pred_x0 = ddpm_model.predict_start_from_noise(x_noisy, t, noise_pred)
             noise_preds.append(noise_pred)
+            # sqrt_recip_alphas_cumprod[t] * x_t - sqrt_recipm1_alphas_cumprod[t] * noise
+            pred_x0 = ddpm_model.predict_start_from_noise(x_noisy, t, noise_pred)
             # The predicted x0 is used as the x_start for the next denoising step.
             x_starts.append(pred_x0)
@@ -157,20 +174,43 @@
             # of the current timestep.
             t_lb = t * np.power(0.5, np.power(num_denoising_steps - 1, -0.3))
             t_ub = t * np.power(0.7, np.power(num_denoising_steps - 1, -0.3))
+            t_lb = torch.clamp(t_lb, min=global_t_lb)
+            t_ub = torch.clamp(t_ub, max=global_t_ub)
             earlier_timesteps = (t_ub - t_lb) * relative_ts + t_lb
             earlier_timesteps = earlier_timesteps.long()
+            noise = torch.randn_like(pred_x0)

-            if
-                # If
+            if same_t_noise_across_instances:
+                # If same_t_noise_across_instances, we use the same earlier_timesteps and noise for all instances.
                 earlier_timesteps = earlier_timesteps[0].repeat(x_start.shape[0])
+                noise = noise[:1].repeat(x_start.shape[0], 1, 1, 1)

             # earlier_timesteps = ts[i+1] < ts[i].
             ts.append(earlier_timesteps)
-
-            noise = torch.randn_like(pred_x0)
             noises.append(noise)

         return noise_preds, x_starts, noises, ts
+
+    def extract_pos_context(self, teacher_context, BS):
+        # If p_uses_cfg > 0, we always pass both pos_context and neg_context to the teacher.
+        # But the neg_context is only used when self.uses_cfg is True and cfg_scale > 1.
+        # So we manually split the teacher_context into pos_context and neg_context, and only keep pos_context.
+        if self.name == 'unet_ensemble':
+            teacher_pos_contexts = []
+            # teacher_context is a list of teacher contexts.
+            for teacher_context_i in teacher_context:
+                pos_context, neg_context = torch.chunk(teacher_context_i, 2, dim=0)
+                if pos_context.shape[0] != BS:
+                    breakpoint()
+                teacher_pos_contexts.append(pos_context)
+            teacher_context = teacher_pos_contexts
+        else:
+            pos_context, neg_context = torch.chunk(teacher_context, 2, dim=0)
+            if pos_context.shape[0] != BS:
+                breakpoint()
+            teacher_context = pos_context
+
+        return teacher_context

 class Arc2FaceTeacher(UNetTeacher):
     def __init__(self, **kwargs):
@@ -185,11 +225,11 @@ class Arc2FaceTeacher(UNetTeacher):
         self.cfg_scale_range = [1, 1]

 class UNetEnsembleTeacher(UNetTeacher):
-    #
-    def __init__(self, unets, unet_types, extra_unet_dirpaths,
+    # unet_weights_in_ensemble are not model weights, but scalar weights for individual unets.
+    def __init__(self, unets, unet_types, extra_unet_dirpaths, unet_weights_in_ensemble=None, device='cuda', **kwargs):
         super().__init__(**kwargs)
         self.name = "unet_ensemble"
-        self.unet = UNetEnsemble(unets, unet_types, extra_unet_dirpaths,
+        self.unet = UNetEnsemble(unets, unet_types, extra_unet_dirpaths, unet_weights_in_ensemble, device)

 class ConsistentIDTeacher(UNetTeacher):
     def __init__(self, base_model_path="models/sd15-dste8-vae.safetensors", **kwargs):
@@ -199,12 +239,9 @@ class ConsistentIDTeacher(UNetTeacher):
         # In contrast to Arc2FaceTeacher or UNetEnsembleTeacher, ConsistentIDPipeline is not a torch.nn.Module.
         # We couldn't initialize the ConsistentIDPipeline to CPU first and wait it to be automatically moved to GPU.
         # Instead, we have to initialize it to GPU directly.
-        # Compatible with the UNetTeacher interface.
-        self.unet = pipe.unet
-        # Release VAE and text_encoder to save memory. UNet is still needed for denoising
+        # Release VAE and text_encoder to save memory. UNet is needed for denoising
         # (the unet is implemented in diffusers in fp16, so probably faster than the LDM unet).
+        self.unet = create_consistentid_pipeline(base_model_path, unet_only=True)

     # We use the default cfg_scale_range=[1.3, 2] for SimpleUNetTeacher.
     # Note p_uses_cfg=0.5 will also be passed in in kwargs.
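Two formulas in the teacher's forward() above are worth unpacking: the CFG combination of positive/negative noise predictions, and the annealed sampling of strictly smaller timesteps for each extra denoising step. Here is a self-contained sketch of both, with made-up tensors and values; nothing here is repo code.

import numpy as np
import torch

# 1) Teacher CFG: extrapolate away from the negative prediction.
#    pos * s - neg * (s - 1) is the same as neg + s * (pos - neg).
cfg_scale = 2.0
pos_noise_pred = torch.randn(4, 4, 64, 64)
neg_noise_pred = torch.randn(4, 4, 64, 64)
noise_pred = pos_noise_pred * cfg_scale - neg_noise_pred * (cfg_scale - 1)

# 2) Timestep annealing: each extra step samples a smaller t from
#    [t * 0.5**k, t * 0.7**k], where k = (num_denoising_steps - 1) ** -0.3,
#    optionally clamped to [global_t_lb, global_t_ub].
t = torch.tensor([800., 700., 650., 750.])
num_denoising_steps = 4
k = np.power(num_denoising_steps - 1, -0.3)
t_lb = torch.clamp(t * np.power(0.5, k), min=0)
t_ub = torch.clamp(t * np.power(0.7, k), max=1000)
relative_ts = torch.rand(4)
earlier_timesteps = ((t_ub - t_lb) * relative_ts + t_lb).long()

Since 0.7**k < 1, the sampled earlier_timesteps are always below the current t, so repeated iterations walk the teacher down the noise schedule.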
adaface/util.py CHANGED
@@ -57,7 +57,7 @@ def perturb_np_array(np_array, perturb_std, perturb_std_is_relative=True, std_dim
     ts = perturb_tensor(ts, perturb_std, perturb_std_is_relative, std_dim=std_dim)
     return ts.numpy().astype(np_array.dtype)

-def calc_stats(emb_name, embeddings, mean_dim
+def calc_stats(emb_name, embeddings, mean_dim=-1):
     print("%s:" %emb_name)
     repeat_count = [1] * embeddings.ndim
     repeat_count[mean_dim] = embeddings.shape[mean_dim]
@@ -153,13 +153,14 @@ def pad_image_obj_to_square(image_obj, new_size=-1):

 class UNetEnsemble(nn.Module):
     # The first unet is the unet already loaded in a pipeline.
-    def __init__(self, unets, unet_types, extra_unet_dirpaths,
+    def __init__(self, unets, unet_types, extra_unet_dirpaths, unet_weights_in_ensemble=None, device='cuda', torch_dtype=torch.float16):
         super().__init__()

-        self.unets = nn.ModuleList()
         if unets is not None:
+            unets = [ unet.to(device) for unet in unets ]
+        else:
+            unets = []
+
         if unet_types is not None:
             for unet_type in unet_types:
                 if unet_type == "arc2face":
@@ -169,25 +170,27 @@
                     unet = create_consistentid_pipeline(unet_only=True)
                 else:
                     breakpoint()
+                unets.append(unet.to(device=device))

         if extra_unet_dirpaths is not None:
             for unet_path in extra_unet_dirpaths:
                 unet = UNet2DConditionModel.from_pretrained(unet_path, torch_dtype=torch_dtype)
+                unets.append(unet.to(device=device))

-        if
-        elif len(
-        elif len(
+        if unet_weights_in_ensemble is None:
+            unet_weights_in_ensemble = [1.] * len(unets)
+        elif len(unets) < len(unet_weights_in_ensemble):
+            unet_weights_in_ensemble = unet_weights_in_ensemble[:len(unets)]
+        elif len(unets) > len(unet_weights_in_ensemble):
             breakpoint()

-        self.unet_weights = nn.Parameter(unet_weights, requires_grad=False)
+        unet_weights_in_ensemble = torch.tensor(unet_weights_in_ensemble, dtype=torch_dtype)
+        unet_weights_in_ensemble = unet_weights_in_ensemble / unet_weights_in_ensemble.sum()

+        self.unets = nn.ModuleList(unets)
+        # Put the weights in a Parameter so that they will be moved to the same device as the model.
+        self.unet_weights_in_ensemble = nn.Parameter(unet_weights_in_ensemble, requires_grad=False)
+        print(f"UNetEnsemble: {len(self.unets)} UNets loaded with weights: {self.unet_weights_in_ensemble.data.cpu().numpy()}")
         # Set these fields to be compatible with diffusers.
         self.dtype = self.unets[0].dtype
         self.device = self.unets[0].device
@@ -215,8 +218,8 @@
             samples.append(sample)

         samples = torch.stack(samples, dim=0)
-        sample = (samples *
+        unet_weights_in_ensemble = self.unet_weights_in_ensemble.reshape(-1, *([1] * (samples.ndim - 1)))
+        sample = (samples * unet_weights_in_ensemble).sum(dim=0)

         if not return_dict:
             return (sample,)
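UNetEnsemble's forward pass above stacks the per-UNet noise predictions and combines them with normalized scalar weights. A standalone sketch of just that weighting, with illustrative tensor shapes (not repo code):

import torch

# Two unets with relative weights 1 and 2, normalized to sum to 1.
weights = torch.tensor([1.0, 2.0])
weights = weights / weights.sum()          # -> tensor([0.3333, 0.6667])

# Stack per-unet predictions: [num_unets, BS, C, H, W].
samples = torch.stack([torch.randn(2, 4, 64, 64) for _ in weights], dim=0)

# Reshape weights to [num_unets, 1, 1, 1, 1] so they broadcast over each sample.
w = weights.reshape(-1, *([1] * (samples.ndim - 1)))
sample = (samples * w).sum(dim=0)
print(sample.shape)                        # torch.Size([2, 4, 64, 64])

Storing the normalized weights in a non-trainable nn.Parameter (as the diff does) is a convenient way to have them follow the module across .to(device) / .half() calls.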
animatediff/sd/.gitattributes DELETED
@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
-v1-5-pruned-emaonly.ckpt filter=lfs diff=lfs merge=lfs -text
-v1-5-pruned.ckpt filter=lfs diff=lfs merge=lfs -text
animatediff/sd/feature_extractor/preprocessor_config.json DELETED
@@ -1,20 +0,0 @@
-{
-  "crop_size": 224,
-  "do_center_crop": true,
-  "do_convert_rgb": true,
-  "do_normalize": true,
-  "do_resize": true,
-  "feature_extractor_type": "CLIPFeatureExtractor",
-  "image_mean": [
-    0.48145466,
-    0.4578275,
-    0.40821073
-  ],
-  "image_std": [
-    0.26862954,
-    0.26130258,
-    0.27577711
-  ],
-  "resample": 3,
-  "size": 224
-}
animatediff/sd/model_index.json DELETED
@@ -1,32 +0,0 @@
-{
-  "_class_name": "StableDiffusionPipeline",
-  "_diffusers_version": "0.6.0",
-  "feature_extractor": [
-    "transformers",
-    "CLIPImageProcessor"
-  ],
-  "safety_checker": [
-    "stable_diffusion",
-    "StableDiffusionSafetyChecker"
-  ],
-  "scheduler": [
-    "diffusers",
-    "PNDMScheduler"
-  ],
-  "text_encoder": [
-    "transformers",
-    "CLIPTextModel"
-  ],
-  "tokenizer": [
-    "transformers",
-    "CLIPTokenizer"
-  ],
-  "unet": [
-    "diffusers",
-    "UNet2DConditionModel"
-  ],
-  "vae": [
-    "diffusers",
-    "AutoencoderKL"
-  ]
-}
animatediff/sd/safety_checker/config.json DELETED
@@ -1,175 +0,0 @@
-{
-  "_commit_hash": "4bb648a606ef040e7685bde262611766a5fdd67b",
-  "_name_or_path": "CompVis/stable-diffusion-safety-checker",
-  "architectures": [
-    "StableDiffusionSafetyChecker"
-  ],
-  "initializer_factor": 1.0,
-  "logit_scale_init_value": 2.6592,
-  "model_type": "clip",
-  "projection_dim": 768,
-  "text_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
-    "architectures": null,
-    "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "bos_token_id": 0,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "dropout": 0.0,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": 2,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "hidden_act": "quick_gelu",
-    "hidden_size": 768,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "initializer_factor": 1.0,
-    "initializer_range": 0.02,
-    "intermediate_size": 3072,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "layer_norm_eps": 1e-05,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "max_position_embeddings": 77,
-    "min_length": 0,
-    "model_type": "clip_text_model",
-    "no_repeat_ngram_size": 0,
-    "num_attention_heads": 12,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_hidden_layers": 12,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": 1,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "transformers_version": "4.22.0.dev0",
-    "typical_p": 1.0,
-    "use_bfloat16": false,
-    "vocab_size": 49408
-  },
-  "text_config_dict": {
-    "hidden_size": 768,
-    "intermediate_size": 3072,
-    "num_attention_heads": 12,
-    "num_hidden_layers": 12
-  },
-  "torch_dtype": "float32",
-  "transformers_version": null,
-  "vision_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
-    "architectures": null,
-    "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "dropout": 0.0,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "hidden_act": "quick_gelu",
-    "hidden_size": 1024,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "image_size": 224,
-    "initializer_factor": 1.0,
-    "initializer_range": 0.02,
-    "intermediate_size": 4096,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "layer_norm_eps": 1e-05,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
-    "model_type": "clip_vision_model",
-    "no_repeat_ngram_size": 0,
-    "num_attention_heads": 16,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_channels": 3,
-    "num_hidden_layers": 24,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
-    "patch_size": 14,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "transformers_version": "4.22.0.dev0",
-    "typical_p": 1.0,
-    "use_bfloat16": false
-  },
-  "vision_config_dict": {
-    "hidden_size": 1024,
-    "intermediate_size": 4096,
-    "num_attention_heads": 16,
-    "num_hidden_layers": 24,
-    "patch_size": 14
-  }
-}
animatediff/sd/scheduler/scheduler_config.json DELETED
@@ -1,13 +0,0 @@
-{
-  "_class_name": "PNDMScheduler",
-  "_diffusers_version": "0.6.0",
-  "beta_end": 0.012,
-  "beta_schedule": "scaled_linear",
-  "beta_start": 0.00085,
-  "num_train_timesteps": 1000,
-  "set_alpha_to_one": false,
-  "skip_prk_steps": true,
-  "steps_offset": 1,
-  "trained_betas": null,
-  "clip_sample": false
-}
animatediff/sd/text_encoder/config.json DELETED
@@ -1,25 +0,0 @@
-{
-  "_name_or_path": "openai/clip-vit-large-patch14",
-  "architectures": [
-    "CLIPTextModel"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 0,
-  "dropout": 0.0,
-  "eos_token_id": 2,
-  "hidden_act": "quick_gelu",
-  "hidden_size": 768,
-  "initializer_factor": 1.0,
-  "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "layer_norm_eps": 1e-05,
-  "max_position_embeddings": 77,
-  "model_type": "clip_text_model",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "pad_token_id": 1,
-  "projection_dim": 768,
-  "torch_dtype": "float32",
-  "transformers_version": "4.22.0.dev0",
-  "vocab_size": 49408
-}
animatediff/sd/tokenizer/merges.txt DELETED
(The diff for this file is too large to render. See raw diff.)
animatediff/sd/tokenizer/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
-{
-  "bos_token": {
-    "content": "<|startoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": "<|endoftext|>",
-  "unk_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
-}
animatediff/sd/tokenizer/tokenizer_config.json DELETED
@@ -1,34 +0,0 @@
-{
-  "add_prefix_space": false,
-  "bos_token": {
-    "__type": "AddedToken",
-    "content": "<|startoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "do_lower_case": true,
-  "eos_token": {
-    "__type": "AddedToken",
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "errors": "replace",
-  "model_max_length": 77,
-  "name_or_path": "openai/clip-vit-large-patch14",
-  "pad_token": "<|endoftext|>",
-  "special_tokens_map_file": "./special_tokens_map.json",
-  "tokenizer_class": "CLIPTokenizer",
-  "unk_token": {
-    "__type": "AddedToken",
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
-}
animatediff/sd/tokenizer/vocab.json DELETED
(The diff for this file is too large to render. See raw diff.)
animatediff/sd/unet/config.json DELETED
@@ -1,36 +0,0 @@
-{
-  "_class_name": "UNet2DConditionModel",
-  "_diffusers_version": "0.6.0",
-  "act_fn": "silu",
-  "attention_head_dim": 8,
-  "block_out_channels": [
-    320,
-    640,
-    1280,
-    1280
-  ],
-  "center_input_sample": false,
-  "cross_attention_dim": 768,
-  "down_block_types": [
-    "CrossAttnDownBlock2D",
-    "CrossAttnDownBlock2D",
-    "CrossAttnDownBlock2D",
-    "DownBlock2D"
-  ],
-  "downsample_padding": 1,
-  "flip_sin_to_cos": true,
-  "freq_shift": 0,
-  "in_channels": 4,
-  "layers_per_block": 2,
-  "mid_block_scale_factor": 1,
-  "norm_eps": 1e-05,
-  "norm_num_groups": 32,
-  "out_channels": 4,
-  "sample_size": 64,
-  "up_block_types": [
-    "UpBlock2D",
-    "CrossAttnUpBlock2D",
-    "CrossAttnUpBlock2D",
-    "CrossAttnUpBlock2D"
-  ]
-}
animatediff/sd/v1-inference.yaml DELETED
@@ -1,70 +0,0 @@
-model:
-  base_learning_rate: 1.0e-04
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.00085
-    linear_end: 0.0120
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: "jpg"
-    cond_stage_key: "txt"
-    image_size: 64
-    channels: 4
-    cond_stage_trainable: false   # Note: different from the one we trained before
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False
-
-    scheduler_config: # 10000 warmup steps
-      target: ldm.lr_scheduler.LambdaLinearScheduler
-      params:
-        warm_up_steps: [ 10000 ]
-        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
-        f_start: [ 1.e-6 ]
-        f_max: [ 1. ]
-        f_min: [ 1. ]
-
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 32 # unused
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_heads: 8
-        use_spatial_transformer: True
-        transformer_depth: 1
-        context_dim: 768
-        use_checkpoint: True
-        legacy: False
-
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
animatediff/sd/vae/config.json DELETED
@@ -1,29 +0,0 @@
-{
-  "_class_name": "AutoencoderKL",
-  "_diffusers_version": "0.6.0",
-  "act_fn": "silu",
-  "block_out_channels": [
-    128,
-    256,
-    512,
-    512
-  ],
-  "down_block_types": [
-    "DownEncoderBlock2D",
-    "DownEncoderBlock2D",
-    "DownEncoderBlock2D",
-    "DownEncoderBlock2D"
-  ],
-  "in_channels": 3,
-  "latent_channels": 4,
-  "layers_per_block": 2,
-  "norm_num_groups": 32,
-  "out_channels": 3,
-  "sample_size": 512,
-  "up_block_types": [
-    "UpDecoderBlock2D",
-    "UpDecoderBlock2D",
-    "UpDecoderBlock2D",
-    "UpDecoderBlock2D"
-  ]
-}
animatediff/utils/convert_from_ckpt.py CHANGED
@@ -714,7 +714,7 @@ def convert_ldm_bert_checkpoint(checkpoint, config):


 def convert_ldm_clip_checkpoint(checkpoint, dtype=torch.float16):
-    text_model = CLIPTextModel.from_pretrained("animatediff/sd/text_encoder", torch_dtype=dtype)
+    text_model = CLIPTextModel.from_pretrained("models/animatediff/sd/text_encoder", torch_dtype=dtype)
     keys = list(checkpoint.keys())

     text_model_dict = {}
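The only change in this hunk is the checkpoint path: the vendored SD text encoder is now expected under models/ (consistent with the new models/ entries in .gitignore). For illustration, loading a CLIP text encoder from a local directory works the same as from the Hub, assuming the directory holds a standard transformers checkpoint (config.json plus weights); the snippet below is a hedged sketch, not part of the commit.

import torch
from transformers import CLIPTextModel

# Assumes models/animatediff/sd/text_encoder contains a standard CLIP text
# encoder checkpoint; any local directory laid out this way loads identically.
text_model = CLIPTextModel.from_pretrained("models/animatediff/sd/text_encoder",
                                           torch_dtype=torch.float16)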
app.py
CHANGED
@@ -24,13 +24,11 @@ parser = argparse.ArgumentParser()
|
|
24 |
parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["consistentID", "arc2face"],
|
25 |
choices=["arc2face", "consistentID"], help="Type(s) of the ID2Ada prompt encoders")
|
26 |
parser.add_argument('--adaface_ckpt_path', type=str,
|
27 |
-
default='models/adaface/
|
28 |
-
parser.add_argument('--model_style_type', type=str, default='
|
29 |
choices=["realistic", "anime", "photorealistic"], help="Type of the base model")
|
30 |
-
parser.add_argument("--guidance_scale", type=float, default=
|
31 |
help="The guidance scale for the diffusion model. Default: 8.0")
|
32 |
-
parser.add_argument("--do_neg_id_prompt_weight", type=float, default=0,
|
33 |
-
help="The weight of added ID prompt embeddings into the negative prompt. Default: 0, disabled.")
|
34 |
|
35 |
parser.add_argument('--gpu', type=int, default=None)
|
36 |
parser.add_argument('--ip', type=str, default="0.0.0.0")
|
@@ -41,20 +39,38 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
|
|
41 |
seed = random.randint(0, MAX_SEED)
|
42 |
return seed
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
# model = load_model()
|
45 |
# This FaceAnalysis is just to crop the face areas from the uploaded images,
|
46 |
# and is independent of the adaface FaceAnalysis apps.
|
47 |
-
app = FaceAnalysis(name="buffalo_l", root='models/insightface', providers=['
|
48 |
app.prepare(ctx_id=0, det_size=(320, 320))
|
49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
global adaface, id_animator
|
52 |
|
53 |
-
|
54 |
id_animator = load_model(model_style_type=args.model_style_type, device='cpu')
|
55 |
-
adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path=
|
56 |
adaface_encoder_types=args.adaface_encoder_types,
|
57 |
-
adaface_ckpt_paths=
|
58 |
|
59 |
basedir = os.getcwd()
|
60 |
savedir = os.path.join(basedir,'samples')
|
@@ -79,7 +95,7 @@ def get_clicked_image(data: gr.SelectData):
|
|
79 |
return data.index
|
80 |
|
81 |
@spaces.GPU
|
82 |
-
def gen_init_images(uploaded_image_paths, prompt,
|
83 |
if uploaded_image_paths is None:
|
84 |
print("No image uploaded")
|
85 |
return None, None, None
|
@@ -92,9 +108,11 @@ def gen_init_images(uploaded_image_paths, prompt, guidance_scale, do_neg_id_prom
|
|
92 |
# [('/tmp/gradio/249981e66a7c665aaaf1c7eaeb24949af4366c88/jensen huang.jpg', None)]
|
93 |
# Extract the file paths.
|
94 |
uploaded_image_paths = [path[0] for path in uploaded_image_paths]
|
95 |
-
|
96 |
-
|
97 |
-
|
|
|
|
|
98 |
|
99 |
if adaface_subj_embs is None:
|
100 |
raise gr.Error(f"Failed to detect any faces! Please try with other images")
|
@@ -102,20 +120,22 @@ def gen_init_images(uploaded_image_paths, prompt, guidance_scale, do_neg_id_prom
|
|
102 |
# Generate two images each time for the user to select from.
|
103 |
noise = torch.randn(out_image_count, 3, 512, 512)
|
104 |
|
105 |
-
|
106 |
-
if enhance_face and "face portrait" not in prompt:
|
107 |
if "portrait" in prompt:
|
108 |
# Enhance the face features by replacing "portrait" with "face portrait".
|
109 |
prompt = prompt.replace("portrait", "face portrait")
|
110 |
else:
|
111 |
prompt = "face portrait, " + prompt
|
112 |
|
|
|
|
|
113 |
# samples: A list of PIL Image instances.
|
114 |
with torch.no_grad():
|
115 |
samples = adaface(noise, prompt, placeholder_tokens_pos='append',
|
116 |
guidance_scale=guidance_scale,
|
117 |
-
|
118 |
-
|
|
|
119 |
|
120 |
face_paths = []
|
121 |
for sample in samples:
|
@@ -131,9 +151,9 @@ def gen_init_images(uploaded_image_paths, prompt, guidance_scale, do_neg_id_prom
|
|
131 |
@spaces.GPU(duration=90)
|
132 |
def generate_video(image_container, uploaded_image_paths, init_img_file_paths, init_img_selected_idx,
|
133 |
init_image_strength, init_image_final_weight,
|
134 |
-
prompt, negative_prompt, num_steps, video_length, guidance_scale,
|
135 |
seed, attn_scale, image_embed_cfg_begin_scale, image_embed_cfg_end_scale,
|
136 |
-
|
137 |
id_animator_anneal_steps, progress=gr.Progress(track_tqdm=True)):
|
138 |
|
139 |
global adaface, id_animator
|
@@ -143,10 +163,17 @@ def generate_video(image_container, uploaded_image_paths, init_img_file_paths, i
|
|
143 |
if prompt is None:
|
144 |
prompt = ""
|
145 |
|
146 |
-
prompt = prompt + " 8k uhd, high quality"
|
147 |
-
if " shot" not in prompt:
|
148 |
-
|
149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
prompt_img_lists=[]
|
151 |
for path in uploaded_image_paths:
|
152 |
img = cv2.imread(path)
|
@@ -158,16 +185,11 @@ def generate_video(image_container, uploaded_image_paths, init_img_file_paths, i
|
|
158 |
# prompt_img_lists is a list of PIL images.
|
159 |
prompt_img_lists.append(load_image(face_path).resize((224,224)))
|
160 |
|
161 |
-
if adaface is None or not is_adaface_enabled:
|
162 |
adaface_prompt_embeds, negative_prompt_embeds = None, None
|
|
|
163 |
image_embed_cfg_scales = (1, 1)
|
164 |
else:
|
165 |
-
if (adaface_ckpt_path is not None and adaface_ckpt_path.strip() != '') \
|
166 |
-
and (adaface_ckpt_path != args.adaface_ckpt_path):
|
167 |
-
args.adaface_ckpt_path = adaface_ckpt_path
|
168 |
-
# Reload the adaface model weights.
|
169 |
-
adaface.id2ada_prompt_encoder.load_adaface_ckpt(adaface_ckpt_path)
|
170 |
-
|
171 |
with torch.no_grad():
|
172 |
adaface_subj_embs = \
|
173 |
adaface.prepare_adaface_embeddings(image_paths=uploaded_image_paths, face_id_embs=None,
|
@@ -176,9 +198,10 @@ def generate_video(image_container, uploaded_image_paths, init_img_file_paths, i
|
|
176 |
# adaface_prompt_embeds: [1, 77, 768].
|
177 |
adaface_prompt_embeds, negative_prompt_embeds, _, _ = \
|
178 |
adaface.encode_prompt(prompt, placeholder_tokens_pos='append',
|
179 |
-
|
180 |
verbose=True)
|
181 |
|
|
|
182 |
image_embed_cfg_scales = (image_embed_cfg_begin_scale, image_embed_cfg_end_scale)
|
183 |
|
184 |
# init_img_file_paths is a list of image paths. If not chose, init_img_file_paths is None.
|
@@ -198,8 +221,8 @@ def generate_video(image_container, uploaded_image_paths, init_img_file_paths, i
|
|
198 |
prompt = prompt,
|
199 |
negative_prompt = negative_prompt,
|
200 |
adaface_prompt_embeds = (adaface_prompt_embeds, negative_prompt_embeds),
|
201 |
-
# adaface_power_scale is not so useful, and when it's set >= 2, weird artifacts appear.
|
202 |
-
# Here it's limited to
|
203 |
adaface_power_scale = adaface_power_scale,
|
204 |
num_inference_steps = num_steps,
|
205 |
id_animator_anneal_steps = id_animator_anneal_steps,
|
@@ -216,7 +239,7 @@ def generate_video(image_container, uploaded_image_paths, init_img_file_paths, i
|
|
216 |
save_videos_grid(sample, save_sample_path)
|
217 |
return save_sample_path
|
218 |
|
219 |
-
def check_prompt_and_model_type(prompt, model_style_type):
|
220 |
global adaface, id_animator
|
221 |
|
222 |
model_style_type = model_style_type.lower()
|
@@ -236,21 +259,20 @@ def check_prompt_and_model_type(prompt, model_style_type):
|
|
236 |
with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
|
237 |
gr.Markdown(
|
238 |
"""
|
239 |
-
# AdaFace-Animate: Zero-Shot Subject-Driven Video Generation
|
240 |
"""
|
241 |
)
|
242 |
gr.Markdown(
|
243 |
"""
|
244 |
-
<b>Official demo</b> for our working paper <b>AdaFace: A Versatile Face Encoder for
|
245 |
|
246 |
-
❗️**
|
247 |
-
- Support switching between
|
248 |
-
- If you
|
249 |
|
250 |
❗️**Tips**❗️
|
251 |
- You can upload one or more subject images for generating ID-specific video.
|
252 |
-
- If the face
|
253 |
-
- If the face loses focus, try increasing the guidance scale.
|
254 |
- If the motion is weird, e.g., the prompt is "... running", try increasing the number of sampling steps.
|
255 |
- Usage explanations and demos: [Readme](https://huggingface.co/spaces/adaface-neurips/adaface-animate/blob/main/README2.md).
|
256 |
- AdaFace Text-to-Image: <a href="https://huggingface.co/spaces/adaface-neurips/adaface" style="display: inline-flex; align-items: center;">
|
@@ -258,8 +280,6 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
|
|
258 |
<img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-yellow" alt="Hugging Face Spaces" style="margin-left: 5px;">
|
259 |
</a>
|
260 |
|
261 |
-
**TODO:**
|
262 |
-
- ControlNet integration.
|
263 |
"""
|
264 |
)
|
265 |
|
@@ -270,6 +290,7 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
270 |           file_types=["image"],
271 |           file_count="multiple"
272 |       )
273 |       image_container = gr.Image(label="image container", sources="upload", type="numpy", height=256, visible=False)
274 |       uploaded_files_gallery = gr.Gallery(label="Subject images", visible=False, columns=3, rows=2, height=300)
275 |       with gr.Column(visible=False) as clear_button_column:
@@ -280,6 +301,7 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
280 |           file_types=["image"],
281 |           file_count="multiple"
282 |       )
283 |       init_img_container = gr.Image(label="init image container", sources="upload", type="numpy", height=256, visible=False)
284 |       # Although there's only one image, we still use columns=3, to scale down the image size.
285 |       # Otherwise it will occupy the full width, and the gallery won't show the whole image.
@@ -288,41 +310,47 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
288 |       init_img_selected_idx = gr.Textbox(label="Selected init image index", value="0", visible=False)
289 |
290 |       with gr.Column(visible=True) as init_gen_button_column:
291 | -         gen_init = gr.Button(value="Generate
292 |       with gr.Column(visible=False) as init_clear_button_column:
293 |           remove_init_and_reupload = gr.ClearButton(value="Upload an old init image", components=init_img_files, size="sm")
294 |
295 |       prompt = gr.Dropdown(label="Prompt",
296 | -
297 | -
298 | -
299 | -
300 | -
301 | -         "
302 | -         "
303 | -         "
304 | -         "
305 | -         "
306 | -         "
307 | -         "playing guitar on a boat, ocean waves",
308 | -         "with a passion for reading, curled up with a book in a cozy nook near a window",
309 | -
310 | -         "
311 | -
312 | -
313 |       init_image_strength = gr.Slider(
314 |           label="Init Image Strength",
315 |           info="How much the init image should influence each frame. 0: no influence (scenes are more dynamic), 3: strongest influence (scenes are more static).",
316 |           minimum=0,
317 | -         maximum=
318 | -         step=0.
319 |           value=1,
320 |       )
321 |       init_image_final_weight = gr.Slider(
322 | -         label="Final
323 |           info="How much the init image should influence the end of the video",
324 |           minimum=0,
325 | -         maximum=
326 |           step=0.025,
327 |           value=0.1,
328 |       )
@@ -331,7 +359,7 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
331 |           label="Base Model Style Type",
332 |           info="Switching the base model type will take 10~20 seconds to reload the model",
333 |           value=args.model_style_type.capitalize(),
334 | -         choices=["Realistic", "Anime"
335 |           allow_custom_value=False,
336 |           filterable=False,
337 |       )
@@ -344,15 +372,6 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
344 |           value=args.guidance_scale,
345 |       )
346 |
347 | -     do_neg_id_prompt_weight = gr.Slider(
348 | -         label="Weight of ID prompt in the negative prompt",
349 | -         minimum=0.0,
350 | -         maximum=0.9,
351 | -         step=0.1,
352 | -         value=args.do_neg_id_prompt_weight,
353 | -         visible=True
354 | -     )
355 | -
356 |       seed = gr.Slider(
357 |           label="Seed",
358 |           minimum=0,
@@ -365,11 +384,6 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
365 |           value=True,
366 |           info="Uncheck for reproducible results")
367 |
368 | -     negative_prompt = gr.Textbox(
369 | -         label="Negative Prompt",
370 | -         placeholder="low quality",
371 | -         value="(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime), text, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, bare breasts, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, long neck, UnrealisticDream",
372 | -     )
373 |       num_steps = gr.Slider(
374 |           label="Number of sampling steps. More steps for better composition, but longer time.",
375 |           minimum=30,
@@ -394,36 +408,42 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
394 |       is_adaface_enabled = gr.Checkbox(label="Enable AdaFace",
395 |           info="Enable AdaFace for better face details. If unchecked, it falls back to ID-Animator (https://huggingface.co/spaces/ID-Animator/ID-Animator).",
396 |           value=True)
397 | -     adaface_ckpt_path = gr.Textbox(
398 | -         label="AdaFace checkpoint path",
399 | -         placeholder=args.adaface_ckpt_path,
400 | -         value=args.adaface_ckpt_path,
401 | -     )
402 |
403 |       adaface_power_scale = gr.Slider(
404 |           label="AdaFace Embedding Power Scale",
405 |           info="Increase this scale slightly only if the face is defocused or the face details are not clear",
406 |           minimum=0.8,
407 |           maximum=1.2,
408 |           step=0.1,
409 |           value=1,
410 |       )
411 |
412 |       image_embed_cfg_begin_scale = gr.Slider(
413 |           label="ID-Animator Image Embedding Initial Scale",
414 |           info="The scale of the ID-Animator image embedding (influencing coarse facial features and poses)",
415 | -         minimum=0
416 | -         maximum=1
417 |           step=0.1,
418 | -         value=
419 |       )
420 |       image_embed_cfg_end_scale = gr.Slider(
421 |           label="ID-Animator Image Embedding Final Scale",
422 |           info="The scale of the ID-Animator image embedding (influencing coarse facial features and poses)",
423 | -         minimum=0
424 | -         maximum=1
425 |           step=0.1,
426 | -         value=0.
427 |       )
428 |
429 |       id_animator_anneal_steps = gr.Slider(
@@ -431,18 +451,15 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
431 |           minimum=0,
432 |           maximum=40,
433 |           step=1,
434 | -         value=
435 |           visible=True,
436 |       )
437 |
438 | -
439 | -
440 | -
441 | -
442 | -
443 | -         step=0.1,
444 | -         value=1,
445 | -     )
446 |
447 |       with gr.Column():
448 |           result_video = gr.Video(label="Generated Animation", interactive=False)
@@ -462,8 +479,8 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
462 |           outputs=seed,
463 |           queue=False,
464 |           api_name=False,
465 | -     ).then(fn=gen_init_images, inputs=[uploaded_files_gallery, prompt,
466 | -                                        guidance_scale
467 |           outputs=[uploaded_init_img_gallery, init_img_files, init_clear_button_column])
468 |       uploaded_init_img_gallery.select(fn=get_clicked_image, inputs=None, outputs=init_img_selected_idx)
469 |
@@ -478,9 +495,10 @@ with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
478 |       fn=generate_video,
479 |       inputs=[image_container, files,
480 |               init_img_files, init_img_selected_idx, init_image_strength, init_image_final_weight,
481 | -             prompt, negative_prompt, num_steps, video_length, guidance_scale,
482 |               seed, attn_scale, image_embed_cfg_begin_scale, image_embed_cfg_end_scale,
483 | -
484 |       outputs=[result_video]
485 |       )
486 |
24 |   parser.add_argument("--adaface_encoder_types", type=str, nargs="+", default=["consistentID", "arc2face"],
25 |                       choices=["arc2face", "consistentID"], help="Type(s) of the ID2Ada prompt encoders")
26 |   parser.add_argument('--adaface_ckpt_path', type=str,
27 | +                     default='models/adaface/VGGface2_HQ_masks2025-03-06T03-31-21_zero3-ada-1000.pt')
28 | + parser.add_argument('--model_style_type', type=str, default='photorealistic',
29 |                       choices=["realistic", "anime", "photorealistic"], help="Type of the base model")
30 | + parser.add_argument("--guidance_scale", type=float, default=8.0,
31 |                       help="The guidance scale for the diffusion model. Default: 8.0")
32 |
33 |   parser.add_argument('--gpu', type=int, default=None)
34 |   parser.add_argument('--ip', type=str, default="0.0.0.0")
39 |       seed = random.randint(0, MAX_SEED)
40 |       return seed
41 |
42 | + def is_running_on_spaces():
43 | +     return os.getenv("SPACE_ID") is not None
44 | +
45 | + from huggingface_hub import snapshot_download
46 | + large_files = ["models/*", "models/**/*"]
47 | + snapshot_download(repo_id="adaface-neurips/adaface-animate-models",
48 | +                   repo_type="model", allow_patterns=large_files, local_dir=".")
49 | + os.makedirs("/tmp/gradio", exist_ok=True)
50 | +
51 |   # model = load_model()
52 |   # This FaceAnalysis is just to crop the face areas from the uploaded images,
53 |   # and is independent of the adaface FaceAnalysis apps.
54 | + app = FaceAnalysis(name="buffalo_l", root='models/insightface', providers=['CPUExecutionProvider'])
55 |   app.prepare(ctx_id=0, det_size=(320, 320))
56 | +
57 | + if is_running_on_spaces():
58 | +     device = 'cuda:0'
59 | + else:
60 | +     if args.gpu is None:
61 | +         device = "cuda"
62 | +     else:
63 | +         device = f"cuda:{args.gpu}"
64 | +
65 | + print(f"Device: {device}")
66 |
67 |   global adaface, id_animator
68 |
69 | + adaface_base_model_path = model_style_type2base_model_path["photorealistic"]
70 |   id_animator = load_model(model_style_type=args.model_style_type, device='cpu')
71 | + adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path=adaface_base_model_path,
72 |                            adaface_encoder_types=args.adaface_encoder_types,
73 | +                          adaface_ckpt_paths=args.adaface_ckpt_path, device='cpu')
74 |
75 |   basedir = os.getcwd()
76 |   savedir = os.path.join(basedir,'samples')
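The startup block above pulls only the `models/` tree from the companion weights repo and picks a CUDA device, using the `SPACE_ID` environment variable to detect Hugging Face Spaces. For readers adapting this elsewhere, a minimal standalone sketch of the same pattern (the helper names are mine; the repo id and patterns are copied from the diff):

```python
import os
from huggingface_hub import snapshot_download

def fetch_model_checkpoints(local_dir="."):
    # Download only files under models/ from the weights repo;
    # allow_patterns skips everything else in the snapshot.
    snapshot_download(repo_id="adaface-neurips/adaface-animate-models",
                      repo_type="model",
                      allow_patterns=["models/*", "models/**/*"],
                      local_dir=local_dir)

def pick_device(gpu=None):
    # On HF Spaces only one GPU is visible, so cuda:0 is always correct there.
    if os.getenv("SPACE_ID") is not None:
        return "cuda:0"
    return "cuda" if gpu is None else f"cuda:{gpu}"
```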
95 |       return data.index
96 |
97 |   @spaces.GPU
98 | + def gen_init_images(uploaded_image_paths, prompt, highlight_face, guidance_scale, out_image_count=4):
99 |       if uploaded_image_paths is None:
100 |          print("No image uploaded")
101 |          return None, None, None

108 |      # [('/tmp/gradio/249981e66a7c665aaaf1c7eaeb24949af4366c88/jensen huang.jpg', None)]
109 |      # Extract the file paths.
110 |      uploaded_image_paths = [path[0] for path in uploaded_image_paths]
111 | +
112 | +    with torch.no_grad():
113 | +        adaface_subj_embs = \
114 | +            adaface.prepare_adaface_embeddings(image_paths=uploaded_image_paths, face_id_embs=None,
115 | +                                               update_text_encoder=True)
116 |
117 |      if adaface_subj_embs is None:
118 |          raise gr.Error(f"Failed to detect any faces! Please try with other images")

120 |      # Generate a few images each time for the user to select from.
121 |      noise = torch.randn(out_image_count, 3, 512, 512)
122 |
123 | +    if highlight_face and "face portrait" not in prompt:
124 |          if "portrait" in prompt:
125 |              # Enhance the face features by replacing "portrait" with "face portrait".
126 |              prompt = prompt.replace("portrait", "face portrait")
127 |          else:
128 |              prompt = "face portrait, " + prompt
129 |
130 | +    guidance_scale = min(guidance_scale, 5)
131 | +
132 |      # samples: A list of PIL Image instances.
133 |      with torch.no_grad():
134 |          samples = adaface(noise, prompt, placeholder_tokens_pos='append',
135 |                            guidance_scale=guidance_scale,
136 | +                          out_image_count=out_image_count,
137 | +                          repeat_prompt_for_each_encoder=True,
138 | +                          verbose=True)
139 |
140 |      face_paths = []
141 |      for sample in samples:
@spaces.GPU(duration=90)
|
152 |
def generate_video(image_container, uploaded_image_paths, init_img_file_paths, init_img_selected_idx,
|
153 |
init_image_strength, init_image_final_weight,
|
154 |
+
prompt, negative_prompt, num_steps, video_length, guidance_scale,
|
155 |
seed, attn_scale, image_embed_cfg_begin_scale, image_embed_cfg_end_scale,
|
156 |
+
highlight_face, is_adaface_enabled, adaface_power_scale,
|
157 |
id_animator_anneal_steps, progress=gr.Progress(track_tqdm=True)):
|
158 |
|
159 |
global adaface, id_animator
|
|
|
163 |
if prompt is None:
|
164 |
prompt = ""
|
165 |
|
166 |
+
#prompt = prompt + " 8k uhd, high quality"
|
167 |
+
#if " shot" not in prompt:
|
168 |
+
# prompt = prompt + ", medium shot"
|
169 |
+
|
170 |
+
if highlight_face and "face portrait" not in prompt:
|
171 |
+
if "portrait" in prompt:
|
172 |
+
# Enhance the face features by replacing "portrait" with "face portrait".
|
173 |
+
prompt = prompt.replace("portrait", "face portrait")
|
174 |
+
else:
|
175 |
+
prompt = "face portrait, " + prompt
|
176 |
+
|
177 |
prompt_img_lists=[]
|
178 |
for path in uploaded_image_paths:
|
179 |
img = cv2.imread(path)
|
|
|
185 |
# prompt_img_lists is a list of PIL images.
|
186 |
prompt_img_lists.append(load_image(face_path).resize((224,224)))
|
187 |
|
188 |
+
if adaface is None or (not is_adaface_enabled):
|
189 |
adaface_prompt_embeds, negative_prompt_embeds = None, None
|
190 |
+
# ID-Animator Image Embedding Initial and End Scales
|
191 |
image_embed_cfg_scales = (1, 1)
|
192 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
with torch.no_grad():
|
194 |
adaface_subj_embs = \
|
195 |
adaface.prepare_adaface_embeddings(image_paths=uploaded_image_paths, face_id_embs=None,
|
|
|
198 |
# adaface_prompt_embeds: [1, 77, 768].
|
199 |
adaface_prompt_embeds, negative_prompt_embeds, _, _ = \
|
200 |
adaface.encode_prompt(prompt, placeholder_tokens_pos='append',
|
201 |
+
repeat_prompt_for_each_encoder=True,
|
202 |
verbose=True)
|
203 |
|
204 |
+
# ID-Animator Image Embedding Initial and End Scales
|
205 |
image_embed_cfg_scales = (image_embed_cfg_begin_scale, image_embed_cfg_end_scale)
|
206 |
|
207 |
# init_img_file_paths is a list of image paths. If not chose, init_img_file_paths is None.
|
|
|
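For readers unfamiliar with how the `(adaface_prompt_embeds, negative_prompt_embeds)` pair is consumed downstream: under classifier-free guidance, the positive and negative embeddings are batched together for a single UNet pass. A generic sketch of that batching (standard diffusion practice, not code taken verbatim from this repo):

```python
import torch

def build_cfg_embeddings(prompt_embeds, negative_prompt_embeds, num_samples):
    # prompt_embeds / negative_prompt_embeds: [1, 77, 768] text embeddings.
    # Classifier-free guidance runs the UNet on [negative; positive] pairs,
    # then combines the two noise predictions with the guidance scale.
    pos = prompt_embeds.repeat(num_samples, 1, 1)
    neg = negative_prompt_embeds.repeat(num_samples, 1, 1)
    return torch.cat([neg, pos], dim=0)  # [2 * num_samples, 77, 768]
```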
221 |           prompt = prompt,
222 |           negative_prompt = negative_prompt,
223 |           adaface_prompt_embeds = (adaface_prompt_embeds, negative_prompt_embeds),
224 | +         # adaface_power_scale is not very useful; when it's set >= 1.2, weird artifacts appear.
225 | +         # Here it's limited to 1~1.1.
226 |           adaface_power_scale = adaface_power_scale,
227 |           num_inference_steps = num_steps,
228 |           id_animator_anneal_steps = id_animator_anneal_steps,
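`image_embed_cfg_scales` carries an initial and a final scale, and `id_animator_anneal_steps` bounds the transition between them. One plausible schedule, shown here as an assumption rather than the repo's exact implementation, is a linear interpolation over the first anneal steps:

```python
def image_embed_scale_at(step, anneal_steps, begin_scale, end_scale):
    # Linearly anneal from begin_scale to end_scale over the first
    # anneal_steps sampling steps, then hold end_scale afterwards.
    if anneal_steps <= 0 or step >= anneal_steps:
        return end_scale
    t = step / anneal_steps
    return begin_scale + (end_scale - begin_scale) * t
```

With the UI defaults below (0.5 falling to 0.1 over 40 steps), this would let the ID-Animator image embedding dominate early, fixing coarse facial geometry and pose, and fade out so AdaFace governs the fine details, which matches the slider descriptions.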
239 |       save_videos_grid(sample, save_sample_path)
240 |       return save_sample_path
241 |
242 | + def check_prompt_and_model_type(prompt, model_style_type, progress=gr.Progress()):
243 |       global adaface, id_animator
244 |
245 |       model_style_type = model_style_type.lower()

259 |   with gr.Blocks(css=css, theme=gr.themes.Origin()) as demo:
260 |       gr.Markdown(
261 |           """
262 | +         # AdaFace-Animate: Zero-Shot Human Subject-Driven Video Generation
263 |           """
264 |       )
265 |       gr.Markdown(
266 |           """
267 | +         <b>Official demo</b> for our working paper <b>AdaFace: A Versatile Text-space Face Encoder for Face Synthesis and Processing</b>.<br>
268 |
269 | +         ❗️**NOTE**❗️
270 | +         - Supports switching among three model styles: **Realistic**, **Photorealistic** and **Anime**. **Realistic** is less realistic than **Photorealistic** but has better motions.
271 | +         - If you change the model style, please wait 20~30 seconds for the new model weights to load before the model begins to generate images/videos.
272 |
273 |           ❗️**Tips**❗️
274 |           - You can upload one or more subject images for generating ID-specific video.
275 | +         - If the face loses focus, try enabling "Highlight face".
276 |           - If the motion is weird, e.g., the prompt is "... running", try increasing the number of sampling steps.
277 |           - Usage explanations and demos: [Readme](https://huggingface.co/spaces/adaface-neurips/adaface-animate/blob/main/README2.md).
278 |           - AdaFace Text-to-Image: <a href="https://huggingface.co/spaces/adaface-neurips/adaface" style="display: inline-flex; align-items: center;">

280 |           <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-yellow" alt="Hugging Face Spaces" style="margin-left: 5px;">
281 |           </a>
282 |
283 |           """
284 |       )
285 |
290 |           file_types=["image"],
291 |           file_count="multiple"
292 |       )
293 | +     files.GRADIO_CACHE = "/tmp/gradio"
294 |       image_container = gr.Image(label="image container", sources="upload", type="numpy", height=256, visible=False)
295 |       uploaded_files_gallery = gr.Gallery(label="Subject images", visible=False, columns=3, rows=2, height=300)
296 |       with gr.Column(visible=False) as clear_button_column:

301 |           file_types=["image"],
302 |           file_count="multiple"
303 |       )
304 | +     init_img_files.GRADIO_CACHE = "/tmp/gradio"
305 |       init_img_container = gr.Image(label="init image container", sources="upload", type="numpy", height=256, visible=False)
306 |       # Although there's only one image, we still use columns=3, to scale down the image size.
307 |       # Otherwise it will occupy the full width, and the gallery won't show the whole image.
310 |       init_img_selected_idx = gr.Textbox(label="Selected init image index", value="0", visible=False)
311 |
312 |       with gr.Column(visible=True) as init_gen_button_column:
313 | +         gen_init = gr.Button(value="Generate 4 new init images")
314 |       with gr.Column(visible=False) as init_clear_button_column:
315 |           remove_init_and_reupload = gr.ClearButton(value="Upload an old init image", components=init_img_files, size="sm")
316 |
317 |       prompt = gr.Dropdown(label="Prompt",
318 | +         info="Try something like 'walking on the beach'.",
319 | +         value="highlighted hair, futuristic silver armor suit, confident stance, living room, smiling, head tilted, perfect smooth skin",
320 | +         allow_custom_value=True,
321 | +         choices=[
322 | +             "portrait, highlighted hair, futuristic silver armor suit, confident stance, living room, smiling, head tilted, perfect smooth skin",
323 | +             "portrait, walking on the beach, sunset",
324 | +             "portrait, in a white apron and chef hat, garnishing a gourmet dish",
325 | +             "portrait, dancing pose among folks in a park, waving hands",
326 | +             "portrait, in iron man costume, the sky ablaze with hues of orange and purple",
327 | +             "portrait, jedi wielding a lightsaber, star wars",
328 | +             "portrait, night view of tokyo street, neon light",
329 | +             "portrait, playing guitar on a boat, ocean waves",
330 | +             "portrait, with a passion for reading, curled up with a book in a cozy nook near a window",
331 | +             "portrait, celebrating new year, fireworks",
332 | +             "portrait, running pose in a park",
333 | +             "portrait, in space suit, space helmet, walking on mars",
334 | +             "portrait, in superman costume, the sky ablaze with hues of orange and purple"
335 | +         ])
336 | +
337 | +     highlight_face = gr.Checkbox(label="Highlight face", value=False,
338 | +         info="Enhance the facial features by prepending 'face portrait' to the prompt",
339 | +         visible=True)
340 | +
init_image_strength = gr.Slider(
|
342 |
label="Init Image Strength",
|
343 |
info="How much the init image should influence each frame. 0: no influence (scenes are more dynamic), 3: strongest influence (scenes are more static).",
|
344 |
minimum=0,
|
345 |
+
maximum=3,
|
346 |
+
step=0.1,
|
347 |
value=1,
|
348 |
)
|
349 |
init_image_final_weight = gr.Slider(
|
350 |
+
label="Final Strength of the Init Image",
|
351 |
info="How much the init image should influence the end of the video",
|
352 |
minimum=0,
|
353 |
+
maximum=2,
|
354 |
step=0.025,
|
355 |
value=0.1,
|
356 |
)
|
|
|
359 |
label="Base Model Style Type",
|
360 |
info="Switching the base model type will take 10~20 seconds to reload the model",
|
361 |
value=args.model_style_type.capitalize(),
|
362 |
+
choices=["Realistic", "Anime", "Photorealistic"],
|
363 |
allow_custom_value=False,
|
364 |
filterable=False,
|
365 |
)
|
|
|
372 |
value=args.guidance_scale,
|
373 |
)
|
374 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
375 |
seed = gr.Slider(
|
376 |
label="Seed",
|
377 |
minimum=0,
|
|
|
384 |
value=True,
|
385 |
info="Uncheck for reproducible results")
|
386 |
|
|
|
|
|
|
|
|
|
|
|
387 |
num_steps = gr.Slider(
|
388 |
label="Number of sampling steps. More steps for better composition, but longer time.",
|
389 |
minimum=30,
|
|
|
408 |       is_adaface_enabled = gr.Checkbox(label="Enable AdaFace",
409 |           info="Enable AdaFace for better face details. If unchecked, it falls back to ID-Animator (https://huggingface.co/spaces/ID-Animator/ID-Animator).",
410 |           value=True)
411 |
412 |       adaface_power_scale = gr.Slider(
413 |           label="AdaFace Embedding Power Scale",
414 |           info="Increase this scale slightly only if the face is defocused or the face details are not clear",
415 |           minimum=0.8,
416 |           maximum=1.2,
417 | +         step=0.05,
418 | +         value=1.1,
419 | +         visible=True,
420 | +     )
421 | +
422 | +     attn_scale = gr.Slider(
423 | +         label="Attention Processor Scale",
424 | +         info="The scale of the ID embeddings on the attention (the higher, the more focus on the face, less on the background)",
425 | +         minimum=0.5,
426 | +         maximum=2,
427 |           step=0.1,
428 |           value=1,
429 | +         visible=True
430 |       )
431 |
432 |       image_embed_cfg_begin_scale = gr.Slider(
433 |           label="ID-Animator Image Embedding Initial Scale",
434 |           info="The scale of the ID-Animator image embedding (influencing coarse facial features and poses)",
435 | +         minimum=0,
436 | +         maximum=1,
437 |           step=0.1,
438 | +         value=0.5,
439 |       )
440 |       image_embed_cfg_end_scale = gr.Slider(
441 |           label="ID-Animator Image Embedding Final Scale",
442 |           info="The scale of the ID-Animator image embedding (influencing coarse facial features and poses)",
443 | +         minimum=0,
444 | +         maximum=1,
445 |           step=0.1,
446 | +         value=0.1,
447 |       )
448 |
449 |       id_animator_anneal_steps = gr.Slider(

451 |           minimum=0,
452 |           maximum=40,
453 |           step=1,
454 | +         value=40,
455 |           visible=True,
456 |       )
457 |
458 | +     negative_prompt = gr.Textbox(
459 | +         label="Negative Prompt",
460 | +         placeholder="low quality",
461 | +         value="deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, text, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, bare breasts, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, long neck, UnrealisticDream, nude, naked, nsfw, topless, bare breasts",
462 | +     )
463 |
464 |       with gr.Column():
465 |           result_video = gr.Video(label="Generated Animation", interactive=False)
479 |           outputs=seed,
480 |           queue=False,
481 |           api_name=False,
482 | +     ).then(fn=gen_init_images, inputs=[uploaded_files_gallery, prompt, highlight_face,
483 | +                                        guidance_scale],
484 |           outputs=[uploaded_init_img_gallery, init_img_files, init_clear_button_column])
485 |       uploaded_init_img_gallery.select(fn=get_clicked_image, inputs=None, outputs=init_img_selected_idx)
486 |
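The `.then()` chain above guarantees ordering: the seed is randomized first, and `gen_init_images` only runs once that update has landed. A minimal self-contained illustration of the same Gradio pattern (component and function names are mine):

```python
import random
import gradio as gr

def new_seed():
    return random.randint(0, 2**31 - 1)

def render(seed):
    return f"generated with seed {seed}"

with gr.Blocks() as demo:
    seed_box = gr.Number(label="Seed")
    result = gr.Textbox(label="Result")
    btn = gr.Button("Generate")
    # .then() schedules the second callback strictly after the first
    # finishes, so render() always sees the freshly randomized seed.
    btn.click(fn=new_seed, inputs=None, outputs=seed_box,
              queue=False).then(fn=render, inputs=seed_box, outputs=result)

# demo.launch()
```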
495 |       fn=generate_video,
496 |       inputs=[image_container, files,
497 |               init_img_files, init_img_selected_idx, init_image_strength, init_image_final_weight,
498 | +             prompt, negative_prompt, num_steps, video_length, guidance_scale,
499 |               seed, attn_scale, image_embed_cfg_begin_scale, image_embed_cfg_end_scale,
500 | +             highlight_face, is_adaface_enabled,
501 | +             adaface_power_scale, id_animator_anneal_steps],
502 |       outputs=[result_video]
503 |       )
504 |
faceadapter/face_adapter.py
CHANGED
@@ -315,10 +315,10 @@ class FaceAdapterPlusForVideoLora(FaceAdapterLora):
315 |   negative_prompt_embeds0 = negative_prompt_embeds_
316 |   adaface_prompt_embeds, negative_prompt_embeds_ = adaface_prompt_embeds
317 |   # self.torch_type == torch.float16. adaface_prompt_embeds is torch.float32.
318 | - prompt_embeds_
319 |   negative_prompt_embeds_ = negative_prompt_embeds_.repeat(num_samples, 1, 1).to(dtype=self.torch_type)
320 |   if adaface_power_scale != 1.0:
321 | -     prompt_embeds_ = prompt_embeds_ * adaface_power_scale
322 |
323 |   # Note to balance image_prompt_embeds with uncond_image_prompt_embeds after scaling.
324 |   image_prompt_embeds_begin = image_prompt_embeds * image_embed_cfg_scales[0] + uncond_image_prompt_embeds * (1 - image_embed_cfg_scales[0])

315 |   negative_prompt_embeds0 = negative_prompt_embeds_
316 |   adaface_prompt_embeds, negative_prompt_embeds_ = adaface_prompt_embeds
317 |   # self.torch_type == torch.float16. adaface_prompt_embeds is torch.float32.
318 | + prompt_embeds_ = adaface_prompt_embeds.repeat(num_samples, 1, 1).to(dtype=self.torch_type)
319 |   negative_prompt_embeds_ = negative_prompt_embeds_.repeat(num_samples, 1, 1).to(dtype=self.torch_type)
320 |   if adaface_power_scale != 1.0:
321 | +     prompt_embeds_ = prompt_embeds_ * adaface_power_scale + negative_prompt_embeds0 * (1 - adaface_power_scale)
322 |
323 |   # Note to balance image_prompt_embeds with uncond_image_prompt_embeds after scaling.
324 |   image_prompt_embeds_begin = image_prompt_embeds * image_embed_cfg_scales[0] + uncond_image_prompt_embeds * (1 - image_embed_cfg_scales[0])
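The change above swaps a bare multiplicative boost for an interpolation against the negative embeddings, which keeps the scaled embedding from drifting in magnitude. A sketch of the new rule in isolation (the function name is mine; the formula is taken from the diff):

```python
import torch

def scale_adaface_embeddings(prompt_embeds, negative_prompt_embeds, power_scale=1.1):
    # Old behavior: prompt_embeds * power_scale (inflates the embedding norm).
    # New behavior: extrapolate away from the negative embeddings instead:
    #   e' = e * s + e_neg * (1 - s)
    # With s slightly above 1 this pushes e away from e_neg, sharpening face
    # details, while the two terms roughly balance the overall magnitude.
    # Per the app.py comment, s >= 1.2 produces visible artifacts.
    return prompt_embeds * power_scale + negative_prompt_embeds * (1.0 - power_scale)
```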
infer.py
CHANGED
@@ -17,7 +17,7 @@ model_style_type2base_model_path = {
17 |
18 |   def load_model(model_style_type="realistic", device="cuda"):
19 |       inference_config = "inference-v2.yaml"
20 | -     sd_version = "animatediff/sd"
21 |       id_ckpt = "models/animator.ckpt"
22 |       image_encoder_path = "models/image_encoder"
23 |
@@ -73,7 +73,7 @@ def load_model(model_style_type="realistic", device="cuda"):
73 |
74 |       converted_vae_checkpoint = convert_ldm_vae_checkpoint(dreambooth_state_dict, pipeline.vae.config)
75 |       # print(vae)
76 | -     #vae ->to_q,to_k,to_v
77 |       # print(converted_vae_checkpoint)
78 |       convert_vae_keys = list(converted_vae_checkpoint.keys())
79 |       for key in convert_vae_keys:
@@ -93,7 +93,8 @@ def load_model(model_style_type="realistic", device="cuda"):
93 |       pipeline.vae.load_state_dict(converted_vae_checkpoint)
94 |
95 |       converted_unet_checkpoint = convert_ldm_unet_checkpoint(dreambooth_state_dict, pipeline.unet.config)
96 | -     pipeline.unet.load_state_dict(converted_unet_checkpoint, strict=False)
97 |
98 |       pipeline.text_encoder = convert_ldm_clip_checkpoint(dreambooth_state_dict, dtype=torch.float16).to(device=device)
99 |

17 |
18 |   def load_model(model_style_type="realistic", device="cuda"):
19 |       inference_config = "inference-v2.yaml"
20 | +     sd_version = "models/animatediff/sd"
21 |       id_ckpt = "models/animator.ckpt"
22 |       image_encoder_path = "models/image_encoder"
23 |

73 |
74 |       converted_vae_checkpoint = convert_ldm_vae_checkpoint(dreambooth_state_dict, pipeline.vae.config)
75 |       # print(vae)
76 | +     # vae ->to_q, to_k, to_v
77 |       # print(converted_vae_checkpoint)
78 |       convert_vae_keys = list(converted_vae_checkpoint.keys())
79 |       for key in convert_vae_keys:

93 |       pipeline.vae.load_state_dict(converted_vae_checkpoint)
94 |
95 |       converted_unet_checkpoint = convert_ldm_unet_checkpoint(dreambooth_state_dict, pipeline.unet.config)
96 | +     m, u = pipeline.unet.load_state_dict(converted_unet_checkpoint, strict=False)
97 | +     print(f"### custom unet missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
98 |
99 |       pipeline.text_encoder = convert_ldm_clip_checkpoint(dreambooth_state_dict, dtype=torch.float16).to(device=device)
100 |
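The infer.py change surfaces what `load_state_dict(strict=False)` silently tolerates: instead of raising on mismatched keys, PyTorch returns the lists of missing and unexpected keys. A small self-contained demonstration of that return value:

```python
import torch
import torch.nn as nn

net = nn.Linear(4, 2)
# A checkpoint with one key missing ("bias") and one extra key.
ckpt = {"weight": torch.zeros(2, 4), "stale_param": torch.zeros(1)}

# strict=False skips mismatches and reports them instead of raising.
result = net.load_state_dict(ckpt, strict=False)
print(f"missing keys: {result.missing_keys}")        # ['bias']
print(f"unexpected keys: {result.unexpected_keys}")  # ['stale_param']
```

Logging the counts, as the diff now does, makes it obvious when a converted checkpoint fails to cover the UNet rather than failing silently.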
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
1 |   diffusers==0.29.2
2 | - torch
3 |   torchvision
4 |   imageio
5 |   imageio-ffmpeg

1 |   diffusers==0.29.2
2 | + torch==2.4.1
3 |   torchvision
4 |   imageio
5 |   imageio-ffmpeg