Chaerin5 committed on
Commit 49f816b · 1 Parent(s): 76e0b86
README.md CHANGED
@@ -1,13 +1 @@
- ---
- title: FoundHand
- emoji: 🏆
- colorFrom: gray
- colorTo: purple
- sdk: gradio
- sdk_version: 5.9.1
- app_file: app.py
- pinned: false
- short_description: FoundHand
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ FoundHand: Large-Scale Domain-Specific Learning for Controllable Hand Image Generation
app.py ADDED
@@ -0,0 +1,1581 @@
1
+ import torch
2
+ from dataclasses import dataclass
3
+ import gradio as gr
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ import cv2
7
+ import mediapipe as mp
8
+ from torchvision.transforms import Compose, Resize, ToTensor, Normalize
9
+ import vqvae
10
+ import vit
11
+ from typing import Literal
12
+ from diffusion import create_diffusion
13
+ from utils import scale_keypoint, keypoint_heatmap, check_keypoints_validity
14
+ from segment_hoi import init_sam
15
+ from io import BytesIO
16
+ from PIL import Image
17
+ import random
18
+ from copy import deepcopy
19
+ from typing import Optional
+ from huggingface_hub import hf_hub_download  # required by the checkpoint download below
20
+
21
+ MAX_N = 6
22
+ FIX_MAX_N = 6
23
+
24
+ placeholder = cv2.cvtColor(cv2.imread("placeholder.png"), cv2.COLOR_BGR2RGB)
25
+ NEW_MODEL = True
26
+ MODEL_EPOCH = 6
27
+ REF_POSE_MASK = True
28
+
29
+ def set_seed(seed):
30
+ seed = int(seed)
31
+ torch.manual_seed(seed)
32
+ np.random.seed(seed)
33
+ torch.cuda.manual_seed_all(seed)
34
+ random.seed(seed)
35
+
36
+
37
+ def remove_prefix(text, prefix):
38
+ if text.startswith(prefix):
39
+ return text[len(prefix) :]
40
+ return text
41
+
42
+
43
+ def unnormalize(x):
44
+ return (((x + 1) / 2) * 255).astype(np.uint8)
45
+
46
+
47
+ def visualize_hand(all_joints, img, side=["right", "left"], n_avail_joints=21):
48
+ # Define the connections between joints for drawing lines and their corresponding colors
49
+ connections = [
50
+ ((0, 1), "red"),
51
+ ((1, 2), "green"),
52
+ ((2, 3), "blue"),
53
+ ((3, 4), "purple"),
54
+ ((0, 5), "orange"),
55
+ ((5, 6), "pink"),
56
+ ((6, 7), "brown"),
57
+ ((7, 8), "cyan"),
58
+ ((0, 9), "yellow"),
59
+ ((9, 10), "magenta"),
60
+ ((10, 11), "lime"),
61
+ ((11, 12), "indigo"),
62
+ ((0, 13), "olive"),
63
+ ((13, 14), "teal"),
64
+ ((14, 15), "navy"),
65
+ ((15, 16), "gray"),
66
+ ((0, 17), "lavender"),
67
+ ((17, 18), "silver"),
68
+ ((18, 19), "maroon"),
69
+ ((19, 20), "fuchsia"),
70
+ ]
71
+ H, W, C = img.shape
72
+
73
+ # Create a figure and axis
74
+ plt.figure()
75
+ ax = plt.gca()
76
+ # Plot joints as points
77
+ ax.imshow(img)
78
+ start_is = []
79
+ if "right" in side:
80
+ start_is.append(0)
81
+ if "left" in side:
82
+ start_is.append(21)
83
+ for start_i in start_is:
84
+ joints = all_joints[start_i : start_i + n_avail_joints]
85
+ if len(joints) == 1:
86
+ ax.scatter(joints[0][0], joints[0][1], color="red", s=10)
87
+ else:
88
+ for connection, color in connections[: len(joints) - 1]:
89
+ joint1 = joints[connection[0]]
90
+ joint2 = joints[connection[1]]
91
+ ax.plot([joint1[0], joint2[0]], [joint1[1], joint2[1]], color=color)
92
+
93
+ ax.set_xlim([0, W])
94
+ ax.set_ylim([0, H])
95
+ ax.grid(False)
96
+ ax.set_axis_off()
97
+ ax.invert_yaxis()
98
+ # plt.subplots_adjust(wspace=0.01)
99
+ # plt.show()
100
+ buf = BytesIO()
101
+ plt.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
102
+ plt.close()
103
+
104
+ # Convert BytesIO object to numpy array
105
+ buf.seek(0)
106
+ img_pil = Image.open(buf)
107
+ img_pil = img_pil.resize((H, W))
108
+ numpy_img = np.array(img_pil)
109
+
110
+ return numpy_img
111
+
112
+
113
+ def mask_image(image, mask, color=[0, 0, 0], alpha=0.6, transparent=True):
114
+ """Overlay mask on image for visualization purpose.
115
+ Args:
116
+ image (H, W, 3) or (H, W): input image
117
+ mask (H, W): mask to be overlaid
118
+ color: the color of overlaid mask
119
+ alpha: the transparency of the mask
120
+ """
121
+ out = deepcopy(image)
122
+ img = deepcopy(image)
123
+ img[mask == 1] = color
124
+ if transparent:
125
+ out = cv2.addWeighted(img, alpha, out, 1 - alpha, 0, out)
126
+ else:
127
+ out = img
128
+ return out
129
+
130
+
131
+ def scale_keypoint(keypoint, original_size, target_size):
132
+ """Scale a keypoint based on the resizing of the image."""
133
+ keypoint_copy = keypoint.copy()
134
+ keypoint_copy[:, 0] *= target_size[0] / original_size[0]
135
+ keypoint_copy[:, 1] *= target_size[1] / original_size[1]
136
+ return keypoint_copy
137
+
138
+
139
+ print("Configure...")
140
+
141
+
142
+ @dataclass
143
+ class HandDiffOpts:
144
+ run_name: str = "ViT_256_handmask_heatmap_nvs_b25_lr1e-5"
145
+ sd_path: str = "/users/kchen157/scratch/weights/SD/sd-v1-4.ckpt"
146
+ log_dir: str = "/users/kchen157/scratch/log"
147
+ data_root: str = "/users/kchen157/data/users/kchen157/dataset/handdiff"
148
+ image_size: tuple = (256, 256)
149
+ latent_size: tuple = (32, 32)
150
+ latent_dim: int = 4
151
+ mask_bg: bool = False
152
+ kpts_form: str = "heatmap"
153
+ n_keypoints: int = 42
154
+ n_mask: int = 1
155
+ noise_steps: int = 1000
156
+ test_sampling_steps: int = 250
157
+ ddim_steps: int = 100
158
+ ddim_discretize: str = "uniform"
159
+ ddim_eta: float = 0.0
160
+ beta_start: float = 8.5e-4
161
+ beta_end: float = 0.012
162
+ latent_scaling_factor: float = 0.18215
163
+ cfg_pose: float = 5.0
164
+ cfg_appearance: float = 3.5
165
+ batch_size: int = 25
166
+ lr: float = 1e-5
167
+ max_epochs: int = 500
168
+ log_every_n_steps: int = 100
169
+ limit_val_batches: int = 1
170
+ n_gpu: int = 8
171
+ num_nodes: int = 1
172
+ precision: str = "16-mixed"
173
+ profiler: str = "simple"
174
+ swa_epoch_start: int = 10
175
+ swa_lrs: float = 1e-3
176
+ num_workers: int = 10
177
+ n_val_samples: int = 4
178
+
179
+ if not torch.cuda.is_available():
180
+ raise ValueError("No GPU")
181
+
182
+ # load models
183
+ if NEW_MODEL:
184
+ opts = HandDiffOpts()
185
+ if MODEL_EPOCH == 7:
186
+ model_path = './DINO_EMA_11M_b50_lr1e-5_epoch7_step380k.ckpt'
187
+ elif MODEL_EPOCH == 6:
188
+ # model_path = "./DINO_EMA_11M_b50_lr1e-5_epoch6_step320k.ckpt"
189
+ model_path = hf_hub_download(repo_id="Chaerin5/FoundHand-weights", filename="DINO_EMA_11M_b50_lr1e-5_epoch6_step320k.ckpt")
190
+ elif MODEL_EPOCH == 4:
191
+ model_path = "./DINO_EMA_11M_b50_lr1e-5_epoch4_step210k.ckpt"
192
+ elif MODEL_EPOCH == 10:
193
+ model_path = "./DINO_EMA_11M_b50_lr1e-5_epoch10_step550k.ckpt"
194
+ else:
195
+ raise ValueError(f"new model epoch should be either 6 or 7, got {MODEL_EPOCH}")
196
+ vae_path = './vae-ft-mse-840000-ema-pruned.ckpt'
197
+ # sd_path = './sd-v1-4.ckpt'
198
+ print('Load diffusion model...')
199
+ diffusion = create_diffusion(str(opts.test_sampling_steps))
200
+ model = vit.DiT_XL_2(
201
+ input_size=opts.latent_size[0],
202
+ latent_dim=opts.latent_dim,
203
+ in_channels=opts.latent_dim+opts.n_keypoints+opts.n_mask,
204
+ learn_sigma=True,
205
+ ).cuda()
206
+ # ckpt_state_dict = torch.load(model_path)['model_state_dict']
207
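+ # inference loads the EMA (exponential moving average) weights stored in the checkpoint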
+ ckpt_state_dict = torch.load(model_path, map_location=torch.device('cuda'))['ema_state_dict']
208
+ missing_keys, extra_keys = model.load_state_dict(ckpt_state_dict, strict=False)
209
+ model.eval()
210
+ print(missing_keys, extra_keys)
211
+ assert len(missing_keys) == 0
212
+ vae_state_dict = torch.load(vae_path)['state_dict']
213
+ autoencoder = vqvae.create_model(3, 3, opts.latent_dim).eval().requires_grad_(False).cuda()
214
+ missing_keys, extra_keys = autoencoder.load_state_dict(vae_state_dict, strict=False)
215
+ autoencoder.eval()
216
+ assert len(missing_keys) == 0
217
+ else:
218
+ opts = HandDiffOpts()
219
+ model_path = './finetune_epoch=5-step=130000.ckpt'
220
+ sd_path = './sd-v1-4.ckpt'
221
+ print('Load diffusion model...')
222
+ diffusion = create_diffusion(str(opts.test_sampling_steps))
223
+ model = vit.DiT_XL_2(
224
+ input_size=opts.latent_size[0],
225
+ latent_dim=opts.latent_dim,
226
+ in_channels=opts.latent_dim+opts.n_keypoints+opts.n_mask,
227
+ learn_sigma=True,
228
+ ).cuda()
229
+ ckpt_state_dict = torch.load(model_path)['state_dict']
230
+ dit_state_dict = {remove_prefix(k, 'diffusion_backbone.'): v for k, v in ckpt_state_dict.items() if k.startswith('diffusion_backbone')}
231
+ vae_state_dict = {remove_prefix(k, 'autoencoder.'): v for k, v in ckpt_state_dict.items() if k.startswith('autoencoder')}
232
+ missing_keys, extra_keys = model.load_state_dict(dit_state_dict, strict=False)
233
+ model.eval()
234
+ assert len(missing_keys) == 0 and len(extra_keys) == 0
235
+ autoencoder = vqvae.create_model(3, 3, opts.latent_dim).eval().requires_grad_(False).cuda()
236
+ missing_keys, extra_keys = autoencoder.load_state_dict(vae_state_dict, strict=False)
237
+ autoencoder.eval()
238
+ assert len(missing_keys) == 0 and len(extra_keys) == 0
239
+ sam_predictor = init_sam(ckpt_path="./sam_vit_h_4b8939.pth")
240
+
241
+
242
+ print("Mediapipe hand detector and SAM ready...")
243
+ mp_hands = mp.solutions.hands
244
+ hands = mp_hands.Hands(
245
+ static_image_mode=True, # Use False if image is part of a video stream
246
+ max_num_hands=2, # Maximum number of hands to detect
247
+ min_detection_confidence=0.1,
248
+ )
249
+
250
+
251
+ def get_ref_anno(ref):
252
+ if ref is None:
253
+ # return exactly three values to match the click outputs [ref_img, ref_pose, ref_cond]
+ return None, None, None
260
+ img = ref["composite"][..., :3]
261
+ img = cv2.resize(img, opts.image_size, interpolation=cv2.INTER_AREA)
262
+ keypts = np.zeros((42, 2))
263
+ if REF_POSE_MASK:
264
+ mp_pose = hands.process(img)
265
+ detected = np.array([0, 0])
266
+ start_idx = 0
267
+ if mp_pose.multi_hand_landmarks:
268
+ # handedness is flipped assuming the input image is mirrored in MediaPipe
269
+ for hand_landmarks, handedness in zip(
270
+ mp_pose.multi_hand_landmarks, mp_pose.multi_handedness
271
+ ):
272
+ # actually right hand
273
+ if handedness.classification[0].label == "Left":
274
+ start_idx = 0
275
+ detected[0] = 1
276
+ # actually left hand
277
+ elif handedness.classification[0].label == "Right":
278
+ start_idx = 21
279
+ detected[1] = 1
280
+ for i, landmark in enumerate(hand_landmarks.landmark):
281
+ keypts[start_idx + i] = [
282
+ landmark.x * opts.image_size[1],
283
+ landmark.y * opts.image_size[0],
284
+ ]
285
+
286
+ sam_predictor.set_image(img)
287
+ l = keypts[:21].shape[0]
288
+ if keypts[0].sum() != 0 and keypts[21].sum() != 0:
289
+ input_point = np.array([keypts[0], keypts[21]])
290
+ input_label = np.array([1, 1])
291
+ elif keypts[0].sum() != 0:
292
+ input_point = np.array(keypts[:1])
293
+ input_label = np.array([1])
294
+ elif keypts[21].sum() != 0:
295
+ input_point = np.array(keypts[21:22])
296
+ input_label = np.array([1])
297
+ masks, _, _ = sam_predictor.predict(
298
+ point_coords=input_point,
299
+ point_labels=input_label,
300
+ multimask_output=False,
301
+ )
302
+ hand_mask = masks[0]
303
+ masked_img = img * hand_mask[..., None] + 255 * (1 - hand_mask[..., None])
304
+ ref_pose = visualize_hand(keypts, masked_img)
305
+ else:
306
+ raise gr.Error("No hands detected in the reference image.")
307
+ else:
308
+ hand_mask = np.zeros_like(img[:,:, 0])
309
+ ref_pose = np.zeros_like(img)
310
+
311
+ def make_ref_cond(
312
+ img,
313
+ keypts,
314
+ hand_mask,
315
+ device="cuda",
316
+ target_size=(256, 256),
317
+ latent_size=(32, 32),
318
+ ):
319
+ image_transform = Compose(
320
+ [
321
+ ToTensor(),
322
+ Resize(target_size),
323
+ Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
324
+ ]
325
+ )
326
+ image = image_transform(img).to(device)
327
+ kpts_valid = check_keypoints_validity(keypts, target_size)
328
+ heatmaps = torch.tensor(
329
+ keypoint_heatmap(
330
+ scale_keypoint(keypts, target_size, latent_size), latent_size, var=1.0
331
+ )
332
+ * kpts_valid[:, None, None],
333
+ dtype=torch.float,
334
+ device=device,
335
+ )[None, ...]
336
+ mask = torch.tensor(
337
+ cv2.resize(
338
+ hand_mask.astype(int),
339
+ dsize=latent_size,
340
+ interpolation=cv2.INTER_NEAREST,
341
+ ),
342
+ dtype=torch.float,
343
+ device=device,
344
+ ).unsqueeze(0)[None, ...]
345
+ return image[None, ...], heatmaps, mask
346
+
347
+ image, heatmaps, mask = make_ref_cond(
348
+ img,
349
+ keypts,
350
+ hand_mask,
351
+ device="cuda",
352
+ target_size=opts.image_size,
353
+ latent_size=opts.latent_size,
354
+ )
355
+ latent = opts.latent_scaling_factor * autoencoder.encode(image).sample()
356
+ if not REF_POSE_MASK:
357
+ heatmaps = torch.zeros_like(heatmaps)
358
+ mask = torch.zeros_like(mask)
359
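+ # reference condition: 4-channel image latent + 42 keypoint heatmaps + 1 hand-mask channel,
+ # matching in_channels = latent_dim + n_keypoints + n_mask of the DiT model built above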
+ ref_cond = torch.cat([latent, heatmaps, mask], 1)
360
+
361
+ return img, ref_pose, ref_cond
362
+
363
+
364
+ def get_target_anno(target):
365
+ if target is None:
366
+ # the per-component update() classmethods were removed in recent Gradio; plain None values suffice
+ return None, None, None, None
372
+ pose_img = target["composite"][..., :3]
373
+ pose_img = cv2.resize(pose_img, opts.image_size, interpolation=cv2.INTER_AREA)
374
+ # detect keypoints
375
+ mp_pose = hands.process(pose_img)
376
+ target_keypts = np.zeros((42, 2))
377
+ detected = np.array([0, 0])
378
+ start_idx = 0
379
+ if mp_pose.multi_hand_landmarks:
380
+ # handedness is flipped assuming the input image is mirrored in MediaPipe
381
+ for hand_landmarks, handedness in zip(
382
+ mp_pose.multi_hand_landmarks, mp_pose.multi_handedness
383
+ ):
384
+ # actually right hand
385
+ if handedness.classification[0].label == "Left":
386
+ start_idx = 0
387
+ detected[0] = 1
388
+ # actually left hand
389
+ elif handedness.classification[0].label == "Right":
390
+ start_idx = 21
391
+ detected[1] = 1
392
+ for i, landmark in enumerate(hand_landmarks.landmark):
393
+ target_keypts[start_idx + i] = [
394
+ landmark.x * opts.image_size[1],
395
+ landmark.y * opts.image_size[0],
396
+ ]
397
+
398
+ target_pose = visualize_hand(target_keypts, pose_img)
399
+ kpts_valid = check_keypoints_validity(target_keypts, opts.image_size)
400
+ target_heatmaps = torch.tensor(
401
+ keypoint_heatmap(
402
+ scale_keypoint(target_keypts, opts.image_size, opts.latent_size),
403
+ opts.latent_size,
404
+ var=1.0,
405
+ )
406
+ * kpts_valid[:, None, None],
407
+ dtype=torch.float,
408
+ device="cuda",
409
+ )[None, ...]
410
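+ # target condition: 42 keypoint heatmaps plus one zero channel in place of the hand mask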
+ target_cond = torch.cat(
411
+ [target_heatmaps, torch.zeros_like(target_heatmaps)[:, :1]], 1
412
+ )
413
+ else:
414
+ raise gr.Error("No hands detected in the target image.")
415
+
416
+ return pose_img, target_pose, target_cond, target_keypts
417
+
418
+
419
+ # def draw_grid(ref):
420
+ # if ref is None or ref["composite"] is None: # or len(ref["layers"])==0:
421
+ # return ref
422
+
423
+ # # if len(ref["layers"]) == 1:
424
+ # # need_draw = True
425
+ # # # elif ref["composite"].shape[0] != size_memory[0] or ref["composite"].shape[1] != size_memory[1]:
426
+ # # # need_draw = True
427
+ # # else:
428
+ # # need_draw = False
429
+
430
+ # # size_memory = ref["composite"].shape[0], ref["composite"].shape[1]
431
+ # # if not need_draw:
432
+ # # return size_memory, ref
433
+
434
+ # h, w = ref["composite"].shape[:2]
435
+ # grid_h, grid_w = h // 32, w // 32
436
+ # # grid = np.zeros((h, w, 4), dtype=np.uint8)
437
+ # for i in range(1, grid_h):
438
+ # ref["composite"][i * 32, :, :3] = 255 # 0.5 * ref["composite"][i * 32, :, :3] +
439
+ # for i in range(1, grid_w):
440
+ # ref["composite"][:, i * 32, :3] = 255 # 0.5 * ref["composite"][:, i * 32, :3] +
441
+ # # if len(ref["layers"]) == 1:
442
+ # # ref["layers"].append(grid)
443
+ # # else:
444
+ # # ref["layers"][1] = grid
445
+ # return ref["composite"]
446
+
447
+
448
+ def get_mask_inpaint(ref):
449
+ inpaint_mask = np.array(ref["layers"][0])[..., -1]
450
+ inpaint_mask = cv2.resize(
451
+ inpaint_mask, opts.image_size, interpolation=cv2.INTER_AREA
452
+ )
453
+ inpaint_mask = (inpaint_mask >= 128).astype(np.uint8)
454
+ return inpaint_mask
455
+
456
+
457
+ def visualize_ref(crop, brush):
458
+ if crop is None or brush is None:
459
+ return None
460
+ inpainted = brush["layers"][0][..., -1]
461
+ img = crop["background"][..., :3]
462
+ img = cv2.resize(img, inpainted.shape[::-1], interpolation=cv2.INTER_AREA)
463
+ mask = inpainted < 128
464
+ # img = img.astype(np.int32)
465
+ # img[mask, :] = img[mask, :] - 50
466
+ # img[np.any(img<0, axis=-1)]=0
467
+ # img = img.astype(np.uint8)
468
+ img = mask_image(img, mask)
469
+ return img
470
+
471
+
472
+ def get_kps(img, keypoints, side: Literal["right", "left"], evt: gr.SelectData):
473
+ if keypoints is None:
474
+ keypoints = [[], []]
475
+ kps = np.zeros((42, 2))
476
+ if side == "right":
477
+ if len(keypoints[0]) == 21:
478
+ gr.Info("21 keypoints for right hand already selected. Try reset if something looks wrong.")
479
+ else:
480
+ keypoints[0].append(list(evt.index))
481
+ len_kps = len(keypoints[0])
482
+ kps[:len_kps] = np.array(keypoints[0])
483
+ elif side == "left":
484
+ if len(keypoints[1]) == 21:
485
+ gr.Info("21 keypoints for left hand already selected. Try reset if something looks wrong.")
486
+ else:
487
+ keypoints[1].append(list(evt.index))
488
+ len_kps = len(keypoints[1])
489
+ kps[21 : 21 + len_kps] = np.array(keypoints[1])
490
+ vis_hand = visualize_hand(kps, img, side, len_kps)
491
+ return vis_hand, keypoints
492
+
493
+
494
+ def undo_kps(img, keypoints, side: Literal["right", "left"]):
495
+ if keypoints is None:
496
+ return img, None
497
+ kps = np.zeros((42, 2))
498
+ if side == "right":
499
+ if len(keypoints[0]) == 0:
500
+ return img, keypoints
501
+ keypoints[0].pop()
502
+ len_kps = len(keypoints[0])
503
+ kps[:len_kps] = np.array(keypoints[0])
504
+ elif side == "left":
505
+ if len(keypoints[1]) == 0:
506
+ return img, keypoints
507
+ keypoints[1].pop()
508
+ len_kps = len(keypoints[1])
509
+ kps[21 : 21 + len_kps] = np.array(keypoints[1])
510
+ vis_hand = visualize_hand(kps, img, side, len_kps)
511
+ return vis_hand, keypoints
512
+
513
+
514
+ def reset_kps(img, keypoints, side: Literal["right", "left"]):
515
+ if keypoints is None:
516
+ return img, None
517
+ if side == "right":
518
+ keypoints[0] = []
519
+ elif side == "left":
520
+ keypoints[1] = []
521
+ return img, keypoints
522
+
523
+
524
+ def sample_diff(ref_cond, target_cond, target_keypts, num_gen, seed, cfg):
525
+ set_seed(seed)
526
+ z = torch.randn(
527
+ (num_gen, opts.latent_dim, opts.latent_size[0], opts.latent_size[1]),
528
+ device="cuda",
529
+ )
530
+ target_cond = target_cond.repeat(num_gen, 1, 1, 1)
531
+ ref_cond = ref_cond.repeat(num_gen, 1, 1, 1)
532
+ # novel view synthesis mode = off
533
+ nvs = torch.zeros(num_gen, dtype=torch.int, device="cuda")
534
+ z = torch.cat([z, z], 0)
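+ # The noise batch is doubled and paired below with real vs. zeroed (null) conditions,
+ # so forward_with_cfg applies classifier-free guidance with scale `cfg` in a single pass.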
535
+ model_kwargs = dict(
536
+ target_cond=torch.cat([target_cond, torch.zeros_like(target_cond)]),
537
+ ref_cond=torch.cat([ref_cond, torch.zeros_like(ref_cond)]),
538
+ nvs=torch.cat([nvs, 2 * torch.ones_like(nvs)]),
539
+ cfg_scale=cfg,
540
+ )
541
+
542
+ samples, _ = diffusion.p_sample_loop(
543
+ model.forward_with_cfg,
544
+ z.shape,
545
+ z,
546
+ clip_denoised=False,
547
+ model_kwargs=model_kwargs,
548
+ progress=True,
549
+ device="cuda",
550
+ ).chunk(2)
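+ # chunk(2) keeps the first (conditioned) half of the doubled batch; the latents are then
+ # rescaled by 1/latent_scaling_factor and decoded back to RGB images in [-1, 1]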
551
+ sampled_images = autoencoder.decode(samples / opts.latent_scaling_factor)
552
+ sampled_images = torch.clamp(sampled_images, min=-1.0, max=1.0)
553
+ sampled_images = unnormalize(sampled_images.permute(0, 2, 3, 1).cpu().numpy())
554
+
555
+ results = []
556
+ results_pose = []
557
+ for i in range(MAX_N):
558
+ if i < num_gen:
559
+ results.append(sampled_images[i])
560
+ results_pose.append(visualize_hand(target_keypts, sampled_images[i]))
561
+ else:
562
+ results.append(placeholder)
563
+ results_pose.append(placeholder)
564
+ return results, results_pose
565
+
566
+
567
+ def ready_sample(img_ori, inpaint_mask, keypts):
568
+ img = cv2.resize(img_ori[..., :3], opts.image_size, interpolation=cv2.INTER_AREA)
569
+ sam_predictor.set_image(img)
570
+ if len(keypts[0]) == 0:
571
+ keypts[0] = np.zeros((21, 2))
572
+ elif len(keypts[0]) == 21:
573
+ keypts[0] = np.array(keypts[0], dtype=np.float32)
574
+ else:
575
+ gr.Info("Number of right hand keypoints should be either 0 or 21.")
576
+ return None, None
577
+
578
+ if len(keypts[1]) == 0:
579
+ keypts[1] = np.zeros((21, 2))
580
+ elif len(keypts[1]) == 21:
581
+ keypts[1] = np.array(keypts[1], dtype=np.float32)
582
+ else:
583
+ gr.Info("Number of left hand keypoints should be either 0 or 21.")
584
+ return None, None
585
+
586
+ keypts = np.concatenate(keypts, axis=0)
587
+ keypts = scale_keypoint(keypts, (LENGTH, LENGTH), opts.image_size)
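+ # clicked keypoints are recorded in the LENGTH x LENGTH (480 x 480) editor space and
+ # rescaled here to the 256 x 256 model input resolution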
588
+ # if keypts[0].sum() != 0 and keypts[21].sum() != 0:
589
+ # input_point = np.array([keypts[0], keypts[21]])
590
+ # # input_point = keypts
591
+ # input_label = np.array([1, 1])
592
+ # # input_label = np.ones_like(input_point[:, 0])
593
+ # elif keypts[0].sum() != 0:
594
+ # input_point = np.array(keypts[:1])
595
+ # # input_point = keypts[:21]
596
+ # input_label = np.array([1])
597
+ # # input_label = np.ones_like(input_point[:21, 0])
598
+ # elif keypts[21].sum() != 0:
599
+ # input_point = np.array(keypts[21:22])
600
+ # # input_point = keypts[21:]
601
+ # input_label = np.array([1])
602
+ # # input_label = np.ones_like(input_point[21:, 0])
603
+
604
+ box_shift_ratio = 0.5
605
+ box_size_factor = 1.2
606
+
607
+ if keypts[0].sum() != 0 and keypts[21].sum() != 0:
608
+ input_point = np.array(keypts)
609
+ input_box = np.stack([keypts.min(axis=0), keypts.max(axis=0)])
610
+ elif keypts[0].sum() != 0:
611
+ input_point = np.array(keypts[:21])
612
+ input_box = np.stack([keypts[:21].min(axis=0), keypts[:21].max(axis=0)])
613
+ elif keypts[21].sum() != 0:
614
+ input_point = np.array(keypts[21:])
615
+ input_box = np.stack([keypts[21:].min(axis=0), keypts[21:].max(axis=0)])
616
+ else:
617
+ raise ValueError(
618
+ "Something wrong. If no hand detected, it should not reach here."
619
+ )
620
+
621
+ input_label = np.ones_like(input_point[:, 0]).astype(np.int32)
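+ # grow the keypoint bounding box by box_size_factor around its midpoint and pass it,
+ # together with the keypoints, to SAM as the segmentation prompt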
622
+ box_trans = input_box[0] * box_shift_ratio + input_box[1] * (1 - box_shift_ratio)
623
+ input_box = ((input_box - box_trans) * box_size_factor + box_trans).reshape(-1)
624
+
625
+ masks, _, _ = sam_predictor.predict(
626
+ point_coords=input_point,
627
+ point_labels=input_label,
628
+ box=input_box[None, :],
629
+ multimask_output=False,
630
+ )
631
+ hand_mask = masks[0]
632
+
633
+ inpaint_latent_mask = torch.tensor(
634
+ cv2.resize(
635
+ inpaint_mask, dsize=opts.latent_size, interpolation=cv2.INTER_NEAREST
636
+ ),
637
+ dtype=torch.float,
638
+ device="cuda",
639
+ ).unsqueeze(0)[None, ...]
640
+
641
+ def make_ref_cond(
642
+ img,
643
+ keypts,
644
+ hand_mask,
645
+ device="cuda",
646
+ target_size=(256, 256),
647
+ latent_size=(32, 32),
648
+ ):
649
+ image_transform = Compose(
650
+ [
651
+ ToTensor(),
652
+ Resize(target_size),
653
+ Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
654
+ ]
655
+ )
656
+ image = image_transform(img).to(device)
657
+ kpts_valid = check_keypoints_validity(keypts, target_size)
658
+ heatmaps = torch.tensor(
659
+ keypoint_heatmap(
660
+ scale_keypoint(keypts, target_size, latent_size), latent_size, var=1.0
661
+ )
662
+ * kpts_valid[:, None, None],
663
+ dtype=torch.float,
664
+ device=device,
665
+ )[None, ...]
666
+ mask = torch.tensor(
667
+ cv2.resize(
668
+ hand_mask.astype(int),
669
+ dsize=latent_size,
670
+ interpolation=cv2.INTER_NEAREST,
671
+ ),
672
+ dtype=torch.float,
673
+ device=device,
674
+ ).unsqueeze(0)[None, ...]
675
+ return image[None, ...], heatmaps, mask
676
+
677
+ image, heatmaps, mask = make_ref_cond(
678
+ img,
679
+ keypts,
680
+ hand_mask * (1 - inpaint_mask),
681
+ device="cuda",
682
+ target_size=opts.image_size,
683
+ latent_size=opts.latent_size,
684
+ )
685
+ latent = opts.latent_scaling_factor * autoencoder.encode(image).sample()
686
+ target_cond = torch.cat([heatmaps, torch.zeros_like(mask)], 1)
687
+ ref_cond = torch.cat([latent, heatmaps, mask], 1)
688
+ ref_cond = torch.zeros_like(ref_cond)
689
+
690
+ img32 = cv2.resize(img, opts.latent_size, interpolation=cv2.INTER_NEAREST)
691
+ assert mask.max() == 1
692
+ vis_mask32 = mask_image(
693
+ img32, inpaint_latent_mask[0,0].cpu().numpy(), (255,255,255), transparent=False
694
+ ).astype(np.uint8) # 1.0 - mask[0, 0].cpu().numpy()
695
+
696
+ assert np.unique(inpaint_mask).shape[0] <= 2
697
+ assert hand_mask.dtype == bool
698
+ mask256 = inpaint_mask # hand_mask * (1 - inpaint_mask)
699
+ vis_mask256 = mask_image(img, mask256, (255,255,255), transparent=False).astype(
700
+ np.uint8
701
+ ) # 1 - mask256
702
+
703
+ return (
704
+ ref_cond,
705
+ target_cond,
706
+ latent,
707
+ inpaint_latent_mask,
708
+ keypts,
709
+ vis_mask32,
710
+ vis_mask256,
711
+ )
712
+
713
+
714
+ def switch_mask_size(radio):
715
+ if radio == "256x256":
716
+ out = (gr.update(visible=False), gr.update(visible=True))
717
+ elif radio == "latent size (32x32)":
718
+ out = (gr.update(visible=True), gr.update(visible=False))
719
+ return out
720
+
721
+
722
+ def sample_inpaint(
723
+ ref_cond,
724
+ target_cond,
725
+ latent,
726
+ inpaint_latent_mask,
727
+ keypts,
728
+ num_gen,
729
+ seed,
730
+ cfg,
731
+ quality,
732
+ ):
733
+ set_seed(seed)
734
+ N = num_gen
735
+ jump_length = 10
736
+ jump_n_sample = quality
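+ # jump_length / jump_n_sample set the RePaint-style resampling schedule used by
+ # inpaint_p_sample_loop; the "Quality" slider maps directly to jump_n_sample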
737
+ cfg_scale = cfg
738
+ z = torch.randn(
739
+ (N, opts.latent_dim, opts.latent_size[0], opts.latent_size[1]), device="cuda"
740
+ )
741
+ target_cond_N = target_cond.repeat(N, 1, 1, 1)
742
+ ref_cond_N = ref_cond.repeat(N, 1, 1, 1)
743
+ # novel view synthesis mode = off
744
+ nvs = torch.zeros(N, dtype=torch.int, device="cuda")
745
+ z = torch.cat([z, z], 0)
746
+ model_kwargs = dict(
747
+ target_cond=torch.cat([target_cond_N, torch.zeros_like(target_cond_N)]),
748
+ ref_cond=torch.cat([ref_cond_N, torch.zeros_like(ref_cond_N)]),
749
+ nvs=torch.cat([nvs, 2 * torch.ones_like(nvs)]),
750
+ cfg_scale=cfg_scale,
751
+ )
752
+
753
+ samples, _ = diffusion.inpaint_p_sample_loop(
754
+ model.forward_with_cfg,
755
+ z.shape,
756
+ latent,
757
+ inpaint_latent_mask,
758
+ z,
759
+ clip_denoised=False,
760
+ model_kwargs=model_kwargs,
761
+ progress=True,
762
+ device="cuda",
763
+ jump_length=jump_length,
764
+ jump_n_sample=jump_n_sample,
765
+ ).chunk(2)
766
+ sampled_images = autoencoder.decode(samples / opts.latent_scaling_factor)
767
+ sampled_images = torch.clamp(sampled_images, min=-1.0, max=1.0)
768
+ sampled_images = unnormalize(sampled_images.permute(0, 2, 3, 1).cpu().numpy())
769
+
770
+ # visualize
771
+ results = []
772
+ results_pose = []
773
+ for i in range(FIX_MAX_N):
774
+ if i < num_gen:
775
+ results.append(sampled_images[i])
776
+ results_pose.append(visualize_hand(keypts, sampled_images[i]))
777
+ else:
778
+ results.append(placeholder)
779
+ results_pose.append(placeholder)
780
+ return results, results_pose
781
+
782
+
783
+ def flip_hand(
784
+ img, pose_img, cond: Optional[torch.Tensor], keypts: Optional[torch.Tensor] = None
785
+ ):
786
+ if cond is None: # clear clicked
787
+ return None, None, None, None
788
+ img["composite"] = img["composite"][:, ::-1, :]
789
+ img["background"] = img["background"][:, ::-1, :]
790
+ img["layers"] = [layer[:, ::-1, :] for layer in img["layers"]]
791
+ pose_img = pose_img[:, ::-1, :]
792
+ cond = cond.flip(-1)
793
+ if keypts is not None: # cond is target_cond
794
+ if keypts[:21, :].sum() != 0:
795
+ keypts[:21, 0] = opts.image_size[1] - keypts[:21, 0]
796
+ # keypts[:21, 1] = opts.image_size[0] - keypts[:21, 1]
797
+ if keypts[21:, :].sum() != 0:
798
+ keypts[21:, 0] = opts.image_size[1] - keypts[21:, 0]
799
+ # keypts[21:, 1] = opts.image_size[0] - keypts[21:, 1]
800
+ return img, pose_img, cond, keypts
801
+
802
+
803
+ def resize_to_full(img):
804
+ img["background"] = cv2.resize(img["background"], (LENGTH, LENGTH))
805
+ img["composite"] = cv2.resize(img["composite"], (LENGTH, LENGTH))
806
+ img["layers"] = [cv2.resize(layer, (LENGTH, LENGTH)) for layer in img["layers"]]
807
+ return img
808
+
809
+
810
+ def clear_all():
811
+ return (
812
+ None,
813
+ None,
814
+ False,
815
+ None,
816
+ None,
817
+ False,
818
+ None,
819
+ None,
820
+ None,
821
+ None,
822
+ None,
823
+ None,
824
+ None,
825
+ 1,
826
+ 42,
827
+ 3.0,
828
+ )
829
+
830
+
831
+ def fix_clear_all():
832
+ return (
833
+ None,
834
+ None,
835
+ None,
836
+ None,
837
+ None,
838
+ None,
839
+ None,
840
+ None,
841
+ None,
842
+ None,
843
+ None,
844
+ None,
845
+ None,
846
+ None,
847
+ None,
848
+ None,
849
+ None,
850
+ 1,
851
+ # (0,0),
852
+ 42,
853
+ 3.0,
854
+ 10,
855
+ )
856
+
857
+
858
+ def enable_component(image1, image2):
859
+ if image1 is None or image2 is None:
860
+ return gr.update(interactive=False)
861
+ if "background" in image1 and "layers" in image1 and "composite" in image1:
862
+ if (
863
+ image1["background"].sum() == 0
864
+ and (sum([im.sum() for im in image1["layers"]]) == 0)
865
+ and image1["composite"].sum() == 0
866
+ ):
867
+ return gr.update(interactive=False)
868
+ if "background" in image2 and "layers" in image2 and "composite" in image2:
869
+ if (
870
+ image2["background"].sum() == 0
871
+ and (sum([im.sum() for im in image2["layers"]]) == 0)
872
+ and image2["composite"].sum() == 0
873
+ ):
874
+ return gr.update(interactive=False)
875
+ return gr.update(interactive=True)
876
+
877
+
878
+ def set_visible(checkbox, kpts, img_clean, img_pose_right, img_pose_left):
879
+ if kpts is None:
880
+ kpts = [[], []]
881
+ if "Right hand" not in checkbox:
882
+ kpts[0] = []
883
+ vis_right = img_clean
884
+ update_right = gr.update(visible=False)
885
+ update_r_info = gr.update(visible=False)
886
+ else:
887
+ vis_right = img_pose_right
888
+ update_right = gr.update(visible=True)
889
+ update_r_info = gr.update(visible=True)
890
+
891
+ if "Left hand" not in checkbox:
892
+ kpts[1] = []
893
+ vis_left = img_clean
894
+ update_left = gr.update(visible=False)
895
+ update_l_info = gr.update(visible=False)
896
+ else:
897
+ vis_left = img_pose_left
898
+ update_left = gr.update(visible=True)
899
+ update_l_info = gr.update(visible=True)
900
+
901
+ return (
902
+ kpts,
903
+ vis_right,
904
+ vis_left,
905
+ update_right,
906
+ update_right,
907
+ update_right,
908
+ update_left,
909
+ update_left,
910
+ update_left,
911
+ update_r_info,
912
+ update_l_info,
913
+ )
914
+
915
+
916
+ # def parse_fix_example(ex_img, ex_masked):
917
+ # original_img = ex_img
918
+ # # ex_img = cv2.resize(ex_img, (LENGTH, LENGTH), interpolation=cv2.INTER_AREA)
919
+ # # ex_masked = cv2.resize(ex_masked, (LENGTH, LENGTH), interpolation=cv2.INTER_AREA)
920
+ # inpaint_mask = np.all(ex_masked > 250, axis=-1).astype(np.uint8)
921
+ # layer = np.ones_like(ex_img) * 255
922
+ # layer = np.concatenate([layer, np.zeros_like(ex_img[..., 0:1])], axis=-1)
923
+ # layer[inpaint_mask == 1, 3] = 255
924
+ # ref_value = {
925
+ # "composite": ex_masked,
926
+ # "background": ex_img,
927
+ # "layers": [layer],
928
+ # }
929
+ # inpaint_mask = cv2.resize(
930
+ # inpaint_mask, opts.image_size, interpolation=cv2.INTER_AREA
931
+ # )
932
+ # kp_img = visualize_ref(ref_value)
933
+ # return (
934
+ # original_img,
935
+ # gr.update(value=ref_value),
936
+ # kp_img,
937
+ # inpaint_mask,
938
+ # )
939
+
940
+
941
+ LENGTH = 480
942
+
943
+ example_imgs = [
944
+ [
945
+ "sample_images/sample1.jpg",
946
+ ],
947
+ [
948
+ "sample_images/sample2.jpg",
949
+ ],
950
+ [
951
+ "sample_images/sample3.jpg",
952
+ ],
953
+ [
954
+ "sample_images/sample4.jpg",
955
+ ],
956
+ [
957
+ "sample_images/sample5.jpg",
958
+ ],
959
+ [
960
+ "sample_images/sample6.jpg",
961
+ ],
962
+ [
963
+ "sample_images/sample7.jpg",
964
+ ],
965
+ [
966
+ "sample_images/sample8.jpg",
967
+ ],
968
+ [
969
+ "sample_images/sample9.jpg",
970
+ ],
971
+ [
972
+ "sample_images/sample10.jpg",
973
+ ],
974
+ [
975
+ "sample_images/sample11.jpg",
976
+ ],
977
+ ["pose_images/pose1.jpg"],
978
+ ["pose_images/pose2.jpg"],
979
+ ["pose_images/pose3.jpg"],
980
+ ["pose_images/pose4.jpg"],
981
+ ["pose_images/pose5.jpg"],
982
+ ["pose_images/pose6.jpg"],
983
+ ["pose_images/pose7.jpg"],
984
+ ["pose_images/pose8.jpg"],
985
+ ]
986
+
987
+ fix_example_imgs = [
988
+ ["bad_hands/1.jpg"], # "bad_hands/1_mask.jpg"],
989
+ ["bad_hands/2.jpg"], # "bad_hands/2_mask.jpg"],
990
+ ["bad_hands/3.jpg"], # "bad_hands/3_mask.jpg"],
991
+ ["bad_hands/4.jpg"], # "bad_hands/4_mask.jpg"],
992
+ ["bad_hands/5.jpg"], # "bad_hands/5_mask.jpg"],
993
+ ["bad_hands/6.jpg"], # "bad_hands/6_mask.jpg"],
994
+ ["bad_hands/7.jpg"], # "bad_hands/7_mask.jpg"],
995
+ ["bad_hands/8.jpg"], # "bad_hands/8_mask.jpg"],
996
+ ["bad_hands/9.jpg"], # "bad_hands/9_mask.jpg"],
997
+ ["bad_hands/10.jpg"], # "bad_hands/10_mask.jpg"],
998
+ ["bad_hands/11.jpg"], # "bad_hands/11_mask.jpg"],
999
+ ["bad_hands/12.jpg"], # "bad_hands/12_mask.jpg"],
1000
+ ["bad_hands/13.jpg"], # "bad_hands/13_mask.jpg"],
1001
+ ]
1002
+ custom_css = """
1003
+ .gradio-container .examples img {
1004
+ width: 240px !important;
1005
+ height: 240px !important;
1006
+ }
1007
+ """
1008
+
1009
+
1010
+ with gr.Blocks(css=custom_css) as demo:
1011
+ with gr.Tab("Edit Hand Poses"):
1012
+ ref_img = gr.State(value=None)
1013
+ ref_cond = gr.State(value=None)
1014
+ keypts = gr.State(value=None)
1015
+ target_img = gr.State(value=None)
1016
+ target_cond = gr.State(value=None)
1017
+ target_keypts = gr.State(value=None)
1018
+ dump = gr.State(value=None)
1019
+ with gr.Row():
1020
+ with gr.Column():
1021
+ gr.Markdown(
1022
+ """<p style="text-align: center; font-size: 25px; font-weight: bold; ">1. Reference</p>"""
1023
+ )
1024
+ gr.Markdown("""<p style="text-align: center;"><br></p>""")
1025
+ ref = gr.ImageEditor(
1026
+ type="numpy",
1027
+ label="Reference",
1028
+ show_label=True,
1029
+ height=LENGTH,
1030
+ width=LENGTH,
1031
+ brush=False,
1032
+ layers=False,
1033
+ crop_size="1:1",
1034
+ )
1035
+ ref_finish_crop = gr.Button(value="Finish Cropping", interactive=False)
1036
+ ref_pose = gr.Image(
1037
+ type="numpy",
1038
+ label="Reference Pose",
1039
+ show_label=True,
1040
+ height=LENGTH,
1041
+ width=LENGTH,
1042
+ interactive=False,
1043
+ )
1044
+ ref_flip = gr.Checkbox(
1045
+ value=False, label="Flip Handedness (Reference)", interactive=False
1046
+ )
1047
+ with gr.Column():
1048
+ gr.Markdown(
1049
+ """<p style="text-align: center; font-size: 25px; font-weight: bold;">2. Target</p>"""
1050
+ )
1051
+ target = gr.ImageEditor(
1052
+ type="numpy",
1053
+ label="Target",
1054
+ show_label=True,
1055
+ height=LENGTH,
1056
+ width=LENGTH,
1057
+ brush=False,
1058
+ layers=False,
1059
+ crop_size="1:1",
1060
+ )
1061
+ target_finish_crop = gr.Button(
1062
+ value="Finish Cropping", interactive=False
1063
+ )
1064
+ target_pose = gr.Image(
1065
+ type="numpy",
1066
+ label="Target Pose",
1067
+ show_label=True,
1068
+ height=LENGTH,
1069
+ width=LENGTH,
1070
+ interactive=False,
1071
+ )
1072
+ target_flip = gr.Checkbox(
1073
+ value=False, label="Flip Handedness (Target)", interactive=False
1074
+ )
1075
+ with gr.Column():
1076
+ gr.Markdown(
1077
+ """<p style="text-align: center; font-size: 25px; font-weight: bold;">3. Result</p>"""
1078
+ )
1079
+ gr.Markdown(
1080
+ """<p style="text-align: center;">Run is enabled after the images have been processed</p>"""
1081
+ )
1082
+ run = gr.Button(value="Run", interactive=False)
1083
+ gr.Markdown(
1084
+ """<p style="text-align: center;">~20s per generation. <br>(For example, if you set Number of generations as 2, it would take around 40s)</p>"""
1085
+ )
1086
+ results = gr.Gallery(
1087
+ type="numpy",
1088
+ label="Results",
1089
+ show_label=True,
1090
+ height=LENGTH,
1091
+ min_width=LENGTH,
1092
+ columns=MAX_N,
1093
+ interactive=False,
1094
+ preview=True,
1095
+ )
1096
+ results_pose = gr.Gallery(
1097
+ type="numpy",
1098
+ label="Results Pose",
1099
+ show_label=True,
1100
+ height=LENGTH,
1101
+ min_width=LENGTH,
1102
+ columns=MAX_N,
1103
+ interactive=False,
1104
+ preview=True,
1105
+ )
1106
+ clear = gr.ClearButton()
1107
+
1108
+ with gr.Row():
1109
+ n_generation = gr.Slider(
1110
+ label="Number of generations",
1111
+ value=1,
1112
+ minimum=1,
1113
+ maximum=MAX_N,
1114
+ step=1,
1115
+ randomize=False,
1116
+ interactive=True,
1117
+ )
1118
+ seed = gr.Slider(
1119
+ label="Seed",
1120
+ value=42,
1121
+ minimum=0,
1122
+ maximum=10000,
1123
+ step=1,
1124
+ randomize=False,
1125
+ interactive=True,
1126
+ )
1127
+ cfg = gr.Slider(
1128
+ label="Classifier free guidance scale",
1129
+ value=2.5,
1130
+ minimum=0.0,
1131
+ maximum=10.0,
1132
+ step=0.1,
1133
+ randomize=False,
1134
+ interactive=True,
1135
+ )
1136
+
1137
+ ref.change(enable_component, [ref, ref], ref_finish_crop)
1138
+ ref_finish_crop.click(get_ref_anno, [ref], [ref_img, ref_pose, ref_cond])
1139
+ ref_pose.change(enable_component, [ref_img, ref_pose], ref_flip)
1140
+ ref_flip.select(
1141
+ flip_hand, [ref, ref_pose, ref_cond], [ref, ref_pose, ref_cond, dump]
1142
+ )
1143
+ target.change(enable_component, [target, target], target_finish_crop)
1144
+ target_finish_crop.click(
1145
+ get_target_anno,
1146
+ [target],
1147
+ [target_img, target_pose, target_cond, target_keypts],
1148
+ )
1149
+ target_pose.change(enable_component, [target_img, target_pose], target_flip)
1150
+ target_flip.select(
1151
+ flip_hand,
1152
+ [target, target_pose, target_cond, target_keypts],
1153
+ [target, target_pose, target_cond, target_keypts],
1154
+ )
1155
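+ # the Run button is enabled only once both a reference pose and a target pose exist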
+ ref_pose.change(enable_component, [ref_pose, target_pose], run)
1156
+ target_pose.change(enable_component, [ref_pose, target_pose], run)
1157
+ run.click(
1158
+ sample_diff,
1159
+ [ref_cond, target_cond, target_keypts, n_generation, seed, cfg],
1160
+ [results, results_pose],
1161
+ )
1162
+ clear.click(
1163
+ clear_all,
1164
+ [],
1165
+ [
1166
+ ref,
1167
+ ref_pose,
1168
+ ref_flip,
1169
+ target,
1170
+ target_pose,
1171
+ target_flip,
1172
+ results,
1173
+ results_pose,
1174
+ ref_img,
1175
+ ref_cond,
1176
+ # mask,
1177
+ target_img,
1178
+ target_cond,
1179
+ target_keypts,
1180
+ n_generation,
1181
+ seed,
1182
+ cfg,
1183
+ ],
1184
+ )
1185
+
1186
+ gr.Markdown("""<p style="font-size: 25px; font-weight: bold;">Examples</p>""")
1187
+ with gr.Tab("Reference"):
1188
+ with gr.Row():
1189
+ gr.Examples(example_imgs, [ref], examples_per_page=20)
1190
+ with gr.Tab("Target"):
1191
+ with gr.Row():
1192
+ gr.Examples(example_imgs, [target], examples_per_page=20)
1193
+ with gr.Tab("Fix Hands"):
1194
+ fix_inpaint_mask = gr.State(value=None)
1195
+ fix_original = gr.State(value=None)
1196
+ fix_img = gr.State(value=None)
1197
+ fix_kpts = gr.State(value=None)
1198
+ fix_kpts_np = gr.State(value=None)
1199
+ fix_ref_cond = gr.State(value=None)
1200
+ fix_target_cond = gr.State(value=None)
1201
+ fix_latent = gr.State(value=None)
1202
+ fix_inpaint_latent = gr.State(value=None)
1203
+ # fix_size_memory = gr.State(value=(0, 0))
1204
+ with gr.Row():
1205
+ with gr.Column():
1206
+ gr.Markdown(
1207
+ """<p style="text-align: center; font-size: 25px; font-weight: bold; ">1. Image Cropping & Brushing</p>"""
1208
+ )
1209
+ gr.Markdown(
1210
+ """<p style="text-align: center;">Crop the image around the hand.<br>Then, brush area (e.g., wrong finger) that needs to be fixed.</p>"""
1211
+ )
1212
+ gr.Markdown(
1213
+ """<p style="text-align: center; font-size: 20px; font-weight: bold; ">A. Crop</p>"""
1214
+ )
1215
+ fix_crop = gr.ImageEditor(
1216
+ type="numpy",
1217
+ sources=["upload", "webcam", "clipboard"],
1218
+ label="Image crop",
1219
+ show_label=True,
1220
+ height=LENGTH,
1221
+ width=LENGTH,
1222
+ layers=False,
1223
+ crop_size="1:1",
1224
+ brush=False,
1225
+ image_mode="RGBA",
1226
+ container=False,
1227
+ )
1228
+ gr.Markdown(
1229
+ """<p style="text-align: center; font-size: 20px; font-weight: bold; ">B. Brush</p>"""
1230
+ )
1231
+ fix_ref = gr.ImageEditor(
1232
+ type="numpy",
1233
+ label="Image brush",
1234
+ sources=(),
1235
+ show_label=True,
1236
+ height=LENGTH,
1237
+ width=LENGTH,
1238
+ layers=False,
1239
+ transforms=("brush"),
1240
+ brush=gr.Brush(
1241
+ colors=["rgb(255, 255, 255)"], default_size=20
1242
+ ), # 204, 50, 50
1243
+ image_mode="RGBA",
1244
+ container=False,
1245
+ interactive=False,
1246
+ )
1247
+ fix_finish_crop = gr.Button(
1248
+ value="Finish Croping & Brushing", interactive=False
1249
+ )
1250
+ gr.Markdown(
1251
+ """<p style="text-align: left; font-size: 20px; font-weight: bold; ">OpenPose keypoints convention</p>"""
1252
+ )
1253
+ fix_openpose = gr.Image(
1254
+ value="openpose.png",
1255
+ type="numpy",
1256
+ label="OpenPose keypoints convention",
1257
+ show_label=True,
1258
+ height=LENGTH // 3 * 2,
1259
+ width=LENGTH // 3 * 2,
1260
+ interactive=False,
1261
+ )
1262
+ with gr.Column():
1263
+ gr.Markdown(
1264
+ """<p style="text-align: center; font-size: 25px; font-weight: bold; ">2. Keypoint Selection</p>"""
1265
+ )
1266
+ gr.Markdown(
1267
+ """<p style="text-align: center;">On the hand, select 21 keypoints that you hope the output to be. <br>Please see the \"OpenPose keypoints convention\" on the bottom left.</p>"""
1268
+ )
1269
+ fix_checkbox = gr.CheckboxGroup(
1270
+ ["Right hand", "Left hand"],
1271
+ # value=["Right hand", "Left hand"],
1272
+ label="Hand side",
1273
+ info="Which side this hand is? Could be both.",
1274
+ interactive=False,
1275
+ )
1276
+ fix_kp_r_info = gr.Markdown(
1277
+ """<p style="text-align: center; font-size: 20px; font-weight: bold; ">Select right only</p>""",
1278
+ visible=False,
1279
+ )
1280
+ fix_kp_right = gr.Image(
1281
+ type="numpy",
1282
+ label="Keypoint Selection (right hand)",
1283
+ show_label=True,
1284
+ height=LENGTH,
1285
+ width=LENGTH,
1286
+ interactive=False,
1287
+ visible=False,
1288
+ sources=[],
1289
+ )
1290
+ with gr.Row():
1291
+ fix_undo_right = gr.Button(
1292
+ value="Undo", interactive=False, visible=False
1293
+ )
1294
+ fix_reset_right = gr.Button(
1295
+ value="Reset", interactive=False, visible=False
1296
+ )
1297
+ fix_kp_l_info = gr.Markdown(
1298
+ """<p style="text-align: center; font-size: 20px; font-weight: bold; ">Select left only</p>""",
1299
+ visible=False
1300
+ )
1301
+ fix_kp_left = gr.Image(
1302
+ type="numpy",
1303
+ label="Keypoint Selection (left hand)",
1304
+ show_label=True,
1305
+ height=LENGTH,
1306
+ width=LENGTH,
1307
+ interactive=False,
1308
+ visible=False,
1309
+ sources=[],
1310
+ )
1311
+ with gr.Row():
1312
+ fix_undo_left = gr.Button(
1313
+ value="Undo", interactive=False, visible=False
1314
+ )
1315
+ fix_reset_left = gr.Button(
1316
+ value="Reset", interactive=False, visible=False
1317
+ )
1318
+ with gr.Column():
1319
+ gr.Markdown(
1320
+ """<p style="text-align: center; font-size: 25px; font-weight: bold; ">3. Prepare Mask</p>"""
1321
+ )
1322
+ gr.Markdown(
1323
+ """<p style="text-align: center;">In Fix Hands, not segmentation mask, but only inpaint mask is used.</p>"""
1324
+ )
1325
+ fix_ready = gr.Button(value="Ready", interactive=False)
1326
+ fix_mask_size = gr.Radio(
1327
+ ["256x256", "latent size (32x32)"],
1328
+ label="Visualized inpaint mask size",
1329
+ interactive=False,
1330
+ value="256x256",
1331
+ )
1332
+ gr.Markdown(
1333
+ """<p style="text-align: center; font-size: 20px; font-weight: bold; ">Visualized inpaint masks</p>"""
1334
+ )
1335
+ fix_vis_mask32 = gr.Image(
1336
+ type="numpy",
1337
+ label=f"Visualized {opts.latent_size} Inpaint Mask",
1338
+ show_label=True,
1339
+ height=opts.latent_size,
1340
+ width=opts.latent_size,
1341
+ interactive=False,
1342
+ visible=False,
1343
+ )
1344
+ fix_vis_mask256 = gr.Image(
1345
+ type="numpy",
1346
+ label=f"Visualized {opts.image_size} Inpaint Mask",
1347
+ visible=True,
1348
+ show_label=True,
1349
+ height=opts.image_size,
1350
+ width=opts.image_size,
1351
+ interactive=False,
1352
+ )
1353
+ with gr.Column():
1354
+ gr.Markdown(
1355
+ """<p style="text-align: center; font-size: 25px; font-weight: bold; ">4. Results</p>"""
1356
+ )
1357
+ fix_run = gr.Button(value="Run", interactive=False)
1358
+ gr.Markdown(
1359
+ """<p style="text-align: center;">>3min and ~24GB per generation</p>"""
1360
+ )
1361
+ fix_result = gr.Gallery(
1362
+ type="numpy",
1363
+ label="Results",
1364
+ show_label=True,
1365
+ height=LENGTH,
1366
+ min_width=LENGTH,
1367
+ columns=FIX_MAX_N,
1368
+ interactive=False,
1369
+ preview=True,
1370
+ )
1371
+ fix_result_pose = gr.Gallery(
1372
+ type="numpy",
1373
+ label="Results Pose",
1374
+ show_label=True,
1375
+ height=LENGTH,
1376
+ min_width=LENGTH,
1377
+ columns=FIX_MAX_N,
1378
+ interactive=False,
1379
+ preview=True,
1380
+ )
1381
+ fix_clear = gr.ClearButton()
1382
+ gr.Markdown(
1383
+ "[NOTE] Currently, Number of generation > 1 could lead to out-of-memory"
1384
+ )
1385
+ with gr.Row():
1386
+ fix_n_generation = gr.Slider(
1387
+ label="Number of generations",
1388
+ value=1,
1389
+ minimum=1,
1390
+ maximum=FIX_MAX_N,
1391
+ step=1,
1392
+ randomize=False,
1393
+ interactive=True,
1394
+ )
1395
+ fix_seed = gr.Slider(
1396
+ label="Seed",
1397
+ value=42,
1398
+ minimum=0,
1399
+ maximum=10000,
1400
+ step=1,
1401
+ randomize=False,
1402
+ interactive=True,
1403
+ )
1404
+ fix_cfg = gr.Slider(
1405
+ label="Classifier free guidance scale",
1406
+ value=3.0,
1407
+ minimum=0.0,
1408
+ maximum=10.0,
1409
+ step=0.1,
1410
+ randomize=False,
1411
+ interactive=True,
1412
+ )
1413
+ fix_quality = gr.Slider(
1414
+ label="Quality",
1415
+ value=10,
1416
+ minimum=1,
1417
+ maximum=10,
1418
+ step=1,
1419
+ randomize=False,
1420
+ interactive=True,
1421
+ )
1422
+ fix_crop.change(enable_component, [fix_crop, fix_crop], fix_ref)
1423
+ fix_crop.change(resize_to_full, fix_crop, fix_ref)
1424
+ fix_ref.change(enable_component, [fix_ref, fix_ref], fix_finish_crop)
1425
+ fix_finish_crop.click(get_mask_inpaint, [fix_ref], [fix_inpaint_mask])
1426
+ # fix_finish_crop.click(lambda x: x["background"], [fix_ref], [fix_kp_right])
1427
+ # fix_finish_crop.click(lambda x: x["background"], [fix_ref], [fix_kp_left])
1428
+ fix_finish_crop.click(lambda x: x["background"], [fix_crop], [fix_original])
1429
+ fix_finish_crop.click(visualize_ref, [fix_crop, fix_ref], [fix_img])
1430
+ fix_img.change(lambda x: x, [fix_img], [fix_kp_right])
1431
+ fix_img.change(lambda x: x, [fix_img], [fix_kp_left])
1432
+ fix_inpaint_mask.change(
1433
+ enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_checkbox
1434
+ )
1435
+ fix_inpaint_mask.change(
1436
+ enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_kp_right
1437
+ )
1438
+ fix_inpaint_mask.change(
1439
+ enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_undo_right
1440
+ )
1441
+ fix_inpaint_mask.change(
1442
+ enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_reset_right
1443
+ )
1444
+ fix_inpaint_mask.change(
1445
+ enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_kp_left
1446
+ )
1447
+ fix_inpaint_mask.change(
1448
+ enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_undo_left
1449
+ )
1450
+ fix_inpaint_mask.change(
1451
+ enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_reset_left
1452
+ )
1453
+ fix_inpaint_mask.change(
1454
+ enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_ready
1455
+ )
1456
+ # fix_inpaint_mask.change(
1457
+ # enable_component, [fix_inpaint_mask, fix_inpaint_mask], fix_run
1458
+ # )
1459
+ fix_checkbox.select(
1460
+ set_visible,
1461
+ [fix_checkbox, fix_kpts, fix_img, fix_kp_right, fix_kp_left],
1462
+ [
1463
+ fix_kpts,
1464
+ fix_kp_right,
1465
+ fix_kp_left,
1466
+ fix_kp_right,
1467
+ fix_undo_right,
1468
+ fix_reset_right,
1469
+ fix_kp_left,
1470
+ fix_undo_left,
1471
+ fix_reset_left,
1472
+ fix_kp_r_info,
1473
+ fix_kp_l_info,
1474
+ ],
1475
+ )
1476
+ fix_kp_right.select(
1477
+ get_kps, [fix_img, fix_kpts, gr.State("right")], [fix_kp_right, fix_kpts]
1478
+ )
1479
+ fix_undo_right.click(
1480
+ undo_kps, [fix_img, fix_kpts, gr.State("right")], [fix_kp_right, fix_kpts]
1481
+ )
1482
+ fix_reset_right.click(
1483
+ reset_kps, [fix_img, fix_kpts, gr.State("right")], [fix_kp_right, fix_kpts]
1484
+ )
1485
+ fix_kp_left.select(
1486
+ get_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts]
1487
+ )
1488
+ fix_undo_left.click(
1489
+ undo_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts]
1490
+ )
1491
+ fix_reset_left.click(
1492
+ reset_kps, [fix_img, fix_kpts, gr.State("left")], [fix_kp_left, fix_kpts]
1493
+ )
1494
+ # fix_kpts.change(check_keypoints, [fix_kpts], [fix_kp_right, fix_kp_left, fix_run])
1495
+ # fix_run.click(lambda x:gr.update(value=None), [], [fix_result, fix_result_pose])
1496
+ fix_vis_mask32.change(
1497
+ enable_component, [fix_vis_mask32, fix_vis_mask256], fix_run
1498
+ )
1499
+ fix_vis_mask32.change(
1500
+ enable_component, [fix_vis_mask32, fix_vis_mask256], fix_mask_size
1501
+ )
1502
+ fix_ready.click(
1503
+ ready_sample,
1504
+ [fix_original, fix_inpaint_mask, fix_kpts],
1505
+ [
1506
+ fix_ref_cond,
1507
+ fix_target_cond,
1508
+ fix_latent,
1509
+ fix_inpaint_latent,
1510
+ fix_kpts_np,
1511
+ fix_vis_mask32,
1512
+ fix_vis_mask256,
1513
+ ],
1514
+ )
1515
+ fix_mask_size.select(
1516
+ switch_mask_size, [fix_mask_size], [fix_vis_mask32, fix_vis_mask256]
1517
+ )
1518
+ fix_run.click(
1519
+ sample_inpaint,
1520
+ [
1521
+ fix_ref_cond,
1522
+ fix_target_cond,
1523
+ fix_latent,
1524
+ fix_inpaint_latent,
1525
+ fix_kpts_np,
1526
+ fix_n_generation,
1527
+ fix_seed,
1528
+ fix_cfg,
1529
+ fix_quality,
1530
+ ],
1531
+ [fix_result, fix_result_pose],
1532
+ )
1533
+ fix_clear.click(
1534
+ fix_clear_all,
1535
+ [],
1536
+ [
1537
+ fix_crop,
1538
+ fix_ref,
1539
+ fix_kp_right,
1540
+ fix_kp_left,
1541
+ fix_result,
1542
+ fix_result_pose,
1543
+ fix_inpaint_mask,
1544
+ fix_original,
1545
+ fix_img,
1546
+ fix_vis_mask32,
1547
+ fix_vis_mask256,
1548
+ fix_kpts,
1549
+ fix_kpts_np,
1550
+ fix_ref_cond,
1551
+ fix_target_cond,
1552
+ fix_latent,
1553
+ fix_inpaint_latent,
1554
+ fix_n_generation,
1555
+ # fix_size_memory,
1556
+ fix_seed,
1557
+ fix_cfg,
1558
+ fix_quality,
1559
+ ],
1560
+ )
1561
+
1562
+ gr.Markdown("""<p style="font-size: 25px; font-weight: bold;">Examples</p>""")
1563
+ fix_dump_ex = gr.Image(value=None, label="Original Image", visible=False)
1564
+ fix_dump_ex_masked = gr.Image(value=None, label="After Brushing", visible=False)
1565
+ with gr.Column():
1566
+ fix_example = gr.Examples(
1567
+ fix_example_imgs,
1568
+ # run_on_click=True,
1569
+ # fn=parse_fix_example,
1570
+ # inputs=[fix_dump_ex, fix_dump_ex_masked],
1571
+ # outputs=[fix_original, fix_ref, fix_img, fix_inpaint_mask],
1572
+ inputs=[fix_crop],
1573
+ examples_per_page=20,
1574
+ )
1575
+
1576
+
1577
+ print("Ready to launch..")
1578
+ _, _, shared_url = demo.queue().launch(
1579
+ share=True, server_name="0.0.0.0", server_port=7739
1580
+ )
1581
+ # launch() blocks by default, so no extra call is needed to keep the app running
diffusion/__init__.py ADDED
@@ -0,0 +1,46 @@
1
+ # Modified from OpenAI's diffusion repos
2
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
3
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
4
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
5
+
6
+ from . import gaussian_diffusion as gd
7
+ from .respace import SpacedDiffusion, space_timesteps
8
+
9
+
10
+ def create_diffusion(
11
+ timestep_respacing,
12
+ noise_schedule="linear",
13
+ use_kl=False,
14
+ sigma_small=False,
15
+ predict_xstart=False,
16
+ learn_sigma=True,
17
+ rescale_learned_sigmas=False,
18
+ diffusion_steps=1000
19
+ ):
20
+ betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)
21
+ if use_kl:
22
+ loss_type = gd.LossType.RESCALED_KL
23
+ elif rescale_learned_sigmas:
24
+ loss_type = gd.LossType.RESCALED_MSE
25
+ else:
26
+ loss_type = gd.LossType.MSE
27
+ if timestep_respacing is None or timestep_respacing == "":
28
+ timestep_respacing = [diffusion_steps]
29
+ return SpacedDiffusion(
30
+ use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
31
+ betas=betas,
32
+ model_mean_type=(
33
+ gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X
34
+ ),
35
+ model_var_type=(
36
+ (
37
+ gd.ModelVarType.FIXED_LARGE
38
+ if not sigma_small
39
+ else gd.ModelVarType.FIXED_SMALL
40
+ )
41
+ if not learn_sigma
42
+ else gd.ModelVarType.LEARNED_RANGE
43
+ ),
44
+ loss_type=loss_type
45
+ # rescale_timesteps=rescale_timesteps,
46
+ )
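+ # Minimal usage sketch, mirroring app.py (which passes str(opts.test_sampling_steps)):
+ #   diffusion = create_diffusion(timestep_respacing="250")
+ #   samples, _ = diffusion.p_sample_loop(model.forward_with_cfg, z.shape, z,
+ #       clip_denoised=False, model_kwargs=model_kwargs, device="cuda").chunk(2)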
diffusion/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (986 Bytes)
diffusion/__pycache__/diffusion_utils.cpython-38.pyc ADDED
Binary file (2.86 kB)
diffusion/__pycache__/gaussian_diffusion.cpython-38.pyc ADDED
Binary file (27.6 kB)
diffusion/__pycache__/respace.cpython-38.pyc ADDED
Binary file (5.04 kB)
diffusion/__pycache__/scheduler.cpython-38.pyc ADDED
Binary file (3.99 kB)
diffusion/diffusion_utils.py ADDED
@@ -0,0 +1,88 @@
+ # Modified from OpenAI's diffusion repos
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+
+ import torch as th
+ import numpy as np
+
+
+ def normal_kl(mean1, logvar1, mean2, logvar2):
+     """
+     Compute the KL divergence between two gaussians.
+     Shapes are automatically broadcasted, so batches can be compared to
+     scalars, among other use cases.
+     """
+     tensor = None
+     for obj in (mean1, logvar1, mean2, logvar2):
+         if isinstance(obj, th.Tensor):
+             tensor = obj
+             break
+     assert tensor is not None, "at least one argument must be a Tensor"
+
+     # Force variances to be Tensors. Broadcasting helps convert scalars to
+     # Tensors, but it does not work for th.exp().
+     logvar1, logvar2 = [
+         x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor)
+         for x in (logvar1, logvar2)
+     ]
+
+     return 0.5 * (
+         -1.0
+         + logvar2
+         - logvar1
+         + th.exp(logvar1 - logvar2)
+         + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
+     )
+
+
+ def approx_standard_normal_cdf(x):
+     """
+     A fast approximation of the cumulative distribution function of the
+     standard normal.
+     """
+     return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))
+
+
+ def continuous_gaussian_log_likelihood(x, *, means, log_scales):
+     """
+     Compute the log-likelihood of a continuous Gaussian distribution.
+     :param x: the targets
+     :param means: the Gaussian mean Tensor.
+     :param log_scales: the Gaussian log stddev Tensor.
+     :return: a tensor like x of log probabilities (in nats).
+     """
+     centered_x = x - means
+     inv_stdv = th.exp(-log_scales)
+     normalized_x = centered_x * inv_stdv
+     log_probs = th.distributions.Normal(th.zeros_like(x), th.ones_like(x)).log_prob(normalized_x)
+     return log_probs
+
+
+ def discretized_gaussian_log_likelihood(x, *, means, log_scales):
+     """
+     Compute the log-likelihood of a Gaussian distribution discretizing to a
+     given image.
+     :param x: the target images. It is assumed that this was uint8 values,
+               rescaled to the range [-1, 1].
+     :param means: the Gaussian mean Tensor.
+     :param log_scales: the Gaussian log stddev Tensor.
+     :return: a tensor like x of log probabilities (in nats).
+     """
+     assert x.shape == means.shape == log_scales.shape
+     centered_x = x - means
+     inv_stdv = th.exp(-log_scales)
+     plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
+     cdf_plus = approx_standard_normal_cdf(plus_in)
+     min_in = inv_stdv * (centered_x - 1.0 / 255.0)
+     cdf_min = approx_standard_normal_cdf(min_in)
+     log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
+     log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
+     cdf_delta = cdf_plus - cdf_min
+     log_probs = th.where(
+         x < -0.999,
+         log_cdf_plus,
+         th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
+     )
+     assert log_probs.shape == x.shape
+     return log_probs
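A quick sanity check of the two likelihood helpers above (hedged: the `diffusion.diffusion_utils` import path assumes this commit's layout). The KL of a Gaussian against itself is zero, and the discretized likelihood expects images already scaled to [-1, 1]:

```python
import torch as th

# Assumes the file above is importable as diffusion.diffusion_utils.
from diffusion.diffusion_utils import normal_kl, discretized_gaussian_log_likelihood

mean = th.zeros(2, 3, 8, 8)
logvar = th.zeros(2, 3, 8, 8)  # log-variance 0, i.e. unit variance

# KL(N(0, 1) || N(0, 1)) is exactly zero, elementwise.
kl = normal_kl(mean, logvar, mean, logvar)
assert th.allclose(kl, th.zeros_like(kl))

# Fake uint8 images rescaled to [-1, 1], scored under a unit-variance decoder.
x = th.randint(0, 256, (2, 3, 8, 8)).float() / 127.5 - 1.0
ll = discretized_gaussian_log_likelihood(x, means=mean, log_scales=0.5 * logvar)
print(ll.shape)  # torch.Size([2, 3, 8, 8]); entries are log-probabilities in nats
```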
diffusion/gaussian_diffusion.py ADDED
@@ -0,0 +1,1118 @@
1
+ # Modified from OpenAI's diffusion repos
2
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
3
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
4
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
5
+
6
+
7
+ import math
8
+
9
+ import numpy as np
10
+ import torch as th
11
+ import enum
12
+ from collections import defaultdict
13
+ from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl
14
+ from .scheduler import get_schedule_jump
15
+
16
+
17
+ def mean_flat(tensor):
18
+ """
19
+ Take the mean over all non-batch dimensions.
20
+ """
21
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
22
+
23
+
24
+ class ModelMeanType(enum.Enum):
25
+ """
26
+ Which type of output the model predicts.
27
+ """
28
+
29
+ PREVIOUS_X = enum.auto() # the model predicts x_{t-1}
30
+ START_X = enum.auto() # the model predicts x_0
31
+ EPSILON = enum.auto() # the model predicts epsilon
32
+
33
+
34
+ class ModelVarType(enum.Enum):
35
+ """
36
+ What is used as the model's output variance.
37
+ The LEARNED_RANGE option has been added to allow the model to predict
38
+ values between FIXED_SMALL and FIXED_LARGE, making its job easier.
39
+ """
40
+
41
+ LEARNED = enum.auto()
42
+ FIXED_SMALL = enum.auto()
43
+ FIXED_LARGE = enum.auto()
44
+ LEARNED_RANGE = enum.auto()
45
+
46
+
47
+ class LossType(enum.Enum):
48
+ MSE = enum.auto() # use raw MSE loss (and KL when learning variances)
49
+ RESCALED_MSE = (
50
+ enum.auto()
51
+ ) # use raw MSE loss (with RESCALED_KL when learning variances)
52
+ KL = enum.auto() # use the variational lower-bound
53
+ RESCALED_KL = enum.auto() # like KL, but rescale to estimate the full VLB
54
+
55
+ def is_vb(self):
56
+ return self == LossType.KL or self == LossType.RESCALED_KL
57
+
58
+
59
+ def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
60
+ betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
61
+ warmup_time = int(num_diffusion_timesteps * warmup_frac)
62
+ betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
63
+ return betas
64
+
65
+
66
+ def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
67
+ """
68
+ This is the deprecated API for creating beta schedules.
69
+ See get_named_beta_schedule() for the new library of schedules.
70
+ """
71
+ if beta_schedule == "quad":
72
+ betas = (
73
+ np.linspace(
74
+ beta_start ** 0.5,
75
+ beta_end ** 0.5,
76
+ num_diffusion_timesteps,
77
+ dtype=np.float64,
78
+ )
79
+ ** 2
80
+ )
81
+ elif beta_schedule == "linear":
82
+ betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
83
+ elif beta_schedule == "warmup10":
84
+ betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.1)
85
+ elif beta_schedule == "warmup50":
86
+ betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.5)
87
+ elif beta_schedule == "const":
88
+ betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
89
+ elif beta_schedule == "jsd": # 1/T, 1/(T-1), 1/(T-2), ..., 1
90
+ betas = 1.0 / np.linspace(
91
+ num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64
92
+ )
93
+ else:
94
+ raise NotImplementedError(beta_schedule)
95
+ assert betas.shape == (num_diffusion_timesteps,)
96
+ return betas
97
+
98
+
99
+ def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
100
+ """
101
+ Get a pre-defined beta schedule for the given name.
102
+ The beta schedule library consists of beta schedules which remain similar
103
+ in the limit of num_diffusion_timesteps.
104
+ Beta schedules may be added, but should not be removed or changed once
105
+ they are committed to maintain backwards compatibility.
106
+ """
107
+ if schedule_name == "linear":
108
+ # Linear schedule from Ho et al, extended to work for any number of
109
+ # diffusion steps.
110
+ scale = 1000 / num_diffusion_timesteps
111
+ return get_beta_schedule(
112
+ "linear",
113
+ beta_start=scale * 0.0001,
114
+ beta_end=scale * 0.02,
115
+ num_diffusion_timesteps=num_diffusion_timesteps,
116
+ )
117
+ elif schedule_name == "squaredcos_cap_v2":
118
+ return betas_for_alpha_bar(
119
+ num_diffusion_timesteps,
120
+ lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
121
+ )
122
+ else:
123
+ raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
124
+
125
+
126
+ def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
127
+ """
128
+ Create a beta schedule that discretizes the given alpha_t_bar function,
129
+ which defines the cumulative product of (1-beta) over time from t = [0,1].
130
+ :param num_diffusion_timesteps: the number of betas to produce.
131
+ :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
132
+ produces the cumulative product of (1-beta) up to that
133
+ part of the diffusion process.
134
+ :param max_beta: the maximum beta to use; use values lower than 1 to
135
+ prevent singularities.
136
+ """
137
+ betas = []
138
+ for i in range(num_diffusion_timesteps):
139
+ t1 = i / num_diffusion_timesteps
140
+ t2 = (i + 1) / num_diffusion_timesteps
141
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
142
+ return np.array(betas)
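A small sketch comparing the two named schedule families defined above (hedged: the `diffusion.gaussian_diffusion` import path is an assumption about this commit's layout); both keep the cumulative alpha product strictly decreasing:

```python
import numpy as np

# Assumes this file is importable as diffusion.gaussian_diffusion.
from diffusion.gaussian_diffusion import get_named_beta_schedule

linear = get_named_beta_schedule("linear", 1000)
cosine = get_named_beta_schedule("squaredcos_cap_v2", 1000)
assert linear.shape == cosine.shape == (1000,)

# The cumulative product of (1 - beta) decays monotonically under both schedules.
assert np.all(np.diff(np.cumprod(1.0 - linear)) < 0)
assert np.all(np.diff(np.cumprod(1.0 - cosine)) < 0)
```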
143
+
144
+
145
+ class GaussianDiffusion:
146
+ """
147
+ Utilities for training and sampling diffusion models.
148
+ Original ported from this codebase:
149
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
150
+ :param betas: a 1-D numpy array of betas for each diffusion timestep,
151
+ starting at T and going to 1.
152
+ """
153
+
154
+ def __init__(
155
+ self,
156
+ *,
157
+ betas,
158
+ model_mean_type,
159
+ model_var_type,
160
+ loss_type
161
+ ):
162
+
163
+ self.model_mean_type = model_mean_type
164
+ self.model_var_type = model_var_type
165
+ self.loss_type = loss_type
166
+
167
+ # Use float64 for accuracy.
168
+ betas = np.array(betas, dtype=np.float64)
169
+ self.betas = betas
170
+ assert len(betas.shape) == 1, "betas must be 1-D"
171
+ assert (betas > 0).all() and (betas <= 1).all()
172
+
173
+ self.num_timesteps = int(betas.shape[0])
174
+
175
+ alphas = 1.0 - betas
176
+ self.alphas_cumprod = np.cumprod(alphas, axis=0)
177
+ self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
178
+ self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
179
+ assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
180
+
181
+ # calculations for diffusion q(x_t | x_{t-1}) and others
182
+ self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
183
+ self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
184
+ self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
185
+ self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
186
+ self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
187
+
188
+ # calculations for posterior q(x_{t-1} | x_t, x_0)
189
+ self.posterior_variance = (
190
+ betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
191
+ )
192
+ # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
193
+ self.posterior_log_variance_clipped = np.log(
194
+ np.append(self.posterior_variance[1], self.posterior_variance[1:])
195
+ ) if len(self.posterior_variance) > 1 else np.array([])
196
+
197
+ self.posterior_mean_coef1 = (
198
+ betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
199
+ )
200
+ self.posterior_mean_coef2 = (
201
+ (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
202
+ )
203
+
204
+ def q_mean_variance(self, x_start, t):
205
+ """
206
+ Get the distribution q(x_t | x_0).
207
+ :param x_start: the [N x C x ...] tensor of noiseless inputs.
208
+ :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
209
+ :return: A tuple (mean, variance, log_variance), all of x_start's shape.
210
+ """
211
+ mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
212
+ variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
213
+ log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
214
+ return mean, variance, log_variance
215
+
216
+ def q_sample(self, x_start, t, noise=None):
217
+ """
218
+ Diffuse the data for a given number of diffusion steps.
219
+ In other words, sample from q(x_t | x_0).
220
+ :param x_start: the initial data batch.
221
+ :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
222
+ :param noise: if specified, the split-out normal noise.
223
+ :return: A noisy version of x_start.
224
+ """
225
+ if noise is None:
226
+ noise = th.randn_like(x_start)
227
+ assert noise.shape == x_start.shape
228
+ return (
229
+ _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
230
+ + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
231
+ )
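q_sample implements the closed form x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps. A standalone restatement (a sketch with an assumed linear schedule, independent of the class) makes the scale behaviour easy to check:

```python
import torch as th

# Assumed linear schedule: 1000 steps with beta in [1e-4, 0.02], matching
# get_named_beta_schedule("linear", 1000) above.
betas = th.linspace(1e-4, 0.02, 1000, dtype=th.float64)
alphas_cumprod = th.cumprod(1.0 - betas, dim=0)

x0 = th.randn(4, 3, 64, 64, dtype=th.float64)
t = th.tensor([0, 250, 500, 999])
eps = th.randn_like(x0)

a_bar = alphas_cumprod[t].view(-1, 1, 1, 1)
x_t = a_bar.sqrt() * x0 + (1.0 - a_bar).sqrt() * eps

# Early timesteps stay close to the data; late ones approach pure noise.
print([round((x_t[i] - x0[i]).abs().mean().item(), 3) for i in range(4)])
```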
232
+
233
+ def q_posterior_mean_variance(self, x_start, x_t, t):
234
+ """
235
+ Compute the mean and variance of the diffusion posterior:
236
+ q(x_{t-1} | x_t, x_0)
237
+ """
238
+ assert x_start.shape == x_t.shape
239
+ posterior_mean = (
240
+ _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
241
+ + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
242
+ )
243
+ posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
244
+ posterior_log_variance_clipped = _extract_into_tensor(
245
+ self.posterior_log_variance_clipped, t, x_t.shape
246
+ )
247
+ assert (
248
+ posterior_mean.shape[0]
249
+ == posterior_variance.shape[0]
250
+ == posterior_log_variance_clipped.shape[0]
251
+ == x_start.shape[0]
252
+ )
253
+ return posterior_mean, posterior_variance, posterior_log_variance_clipped
254
+
255
+ def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None):
256
+ """
257
+ Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
258
+ the initial x, x_0.
259
+ :param model: the model, which takes a signal and a batch of timesteps
260
+ as input.
261
+ :param x: the [N x C x ...] tensor at time t.
262
+ :param t: a 1-D Tensor of timesteps.
263
+ :param clip_denoised: if True, clip the denoised signal into [-1, 1].
264
+ :param denoised_fn: if not None, a function which applies to the
265
+ x_start prediction before it is used to sample. Applies before
266
+ clip_denoised.
267
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
268
+ pass to the model. This can be used for conditioning.
269
+ :return: a dict with the following keys:
270
+ - 'mean': the model mean output.
271
+ - 'variance': the model variance output.
272
+ - 'log_variance': the log of 'variance'.
273
+ - 'pred_xstart': the prediction for x_0.
274
+ """
275
+ if model_kwargs is None:
276
+ model_kwargs = {}
277
+
278
+ B, C = x.shape[:2]
279
+ assert t.shape == (B,)
280
+ model_output = model(x, t, **model_kwargs)
281
+ if isinstance(model_output, tuple):
282
+ model_output, extra = model_output
283
+ else:
284
+ extra = None
285
+
286
+ if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
287
+ assert model_output.shape == (B, C * 2, *x.shape[2:])
288
+ model_output, model_var_values = th.split(model_output, C, dim=1)
289
+ min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
290
+ max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
291
+ # The model_var_values is [-1, 1] for [min_var, max_var].
292
+ frac = (model_var_values + 1) / 2
293
+ model_log_variance = frac * max_log + (1 - frac) * min_log
294
+ model_variance = th.exp(model_log_variance)
295
+ else:
296
+ model_variance, model_log_variance = {
297
+ # for fixedlarge, we set the initial (log-)variance like so
298
+ # to get a better decoder log likelihood.
299
+ ModelVarType.FIXED_LARGE: (
300
+ np.append(self.posterior_variance[1], self.betas[1:]),
301
+ np.log(np.append(self.posterior_variance[1], self.betas[1:])),
302
+ ),
303
+ ModelVarType.FIXED_SMALL: (
304
+ self.posterior_variance,
305
+ self.posterior_log_variance_clipped,
306
+ ),
307
+ }[self.model_var_type]
308
+ model_variance = _extract_into_tensor(model_variance, t, x.shape)
309
+ model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
310
+
311
+ def process_xstart(x):
312
+ if denoised_fn is not None:
313
+ x = denoised_fn(x)
314
+ if clip_denoised:
315
+ return x.clamp(-1, 1)
316
+ return x
317
+
318
+ if self.model_mean_type == ModelMeanType.START_X:
319
+ pred_xstart = process_xstart(model_output)
320
+ else:
321
+ pred_xstart = process_xstart(
322
+ self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
323
+ )
324
+ model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)
325
+
326
+ assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
327
+ return {
328
+ "mean": model_mean,
329
+ "variance": model_variance,
330
+ "log_variance": model_log_variance,
331
+ "pred_xstart": pred_xstart,
332
+ "extra": extra,
333
+ }
334
+
335
+ def _predict_xstart_from_eps(self, x_t, t, eps):
336
+ assert x_t.shape == eps.shape
337
+ return (
338
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
339
+ - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
340
+ )
341
+
342
+ def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
343
+ return (
344
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart
345
+ ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
346
+
347
+ def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
348
+ """
349
+ Compute the mean for the previous step, given a function cond_fn that
350
+ computes the gradient of a conditional log probability with respect to
351
+ x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
352
+ condition on y.
353
+ This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
354
+ """
355
+ gradient = cond_fn(x, t, **model_kwargs)
356
+ new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
357
+ return new_mean
358
+
359
+ def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
360
+ """
361
+ Compute what the p_mean_variance output would have been, should the
362
+ model's score function be conditioned by cond_fn.
363
+ See condition_mean() for details on cond_fn.
364
+ Unlike condition_mean(), this instead uses the conditioning strategy
365
+ from Song et al (2020).
366
+ """
367
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
368
+
369
+ eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
370
+ eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)
371
+
372
+ out = p_mean_var.copy()
373
+ out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
374
+ out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
375
+ return out
376
+
377
+ def p_sample(
378
+ self,
379
+ model,
380
+ x,
381
+ t,
382
+ clip_denoised=True,
383
+ denoised_fn=None,
384
+ cond_fn=None,
385
+ model_kwargs=None,
386
+ ):
387
+ """
388
+ Sample x_{t-1} from the model at the given timestep.
389
+ :param model: the model to sample from.
390
+ :param x: the current tensor at x_{t-1}.
391
+ :param t: the value of t, starting at 0 for the first diffusion step.
392
+ :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
393
+ :param denoised_fn: if not None, a function which applies to the
394
+ x_start prediction before it is used to sample.
395
+ :param cond_fn: if not None, this is a gradient function that acts
396
+ similarly to the model.
397
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
398
+ pass to the model. This can be used for conditioning.
399
+ :return: a dict containing the following keys:
400
+ - 'sample': a random sample from the model.
401
+ - 'pred_xstart': a prediction of x_0.
402
+ """
403
+ out = self.p_mean_variance(
404
+ model,
405
+ x,
406
+ t,
407
+ clip_denoised=clip_denoised,
408
+ denoised_fn=denoised_fn,
409
+ model_kwargs=model_kwargs,
410
+ )
411
+ noise = th.randn_like(x)
412
+ nonzero_mask = (
413
+ (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
414
+ ) # no noise when t == 0
415
+ if cond_fn is not None:
416
+ out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs)
417
+ sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
418
+ return {"sample": sample, "pred_xstart": out["pred_xstart"]}
419
+
420
+ def p_sample_loop(
421
+ self,
422
+ model,
423
+ shape,
424
+ noise=None,
425
+ clip_denoised=True,
426
+ denoised_fn=None,
427
+ cond_fn=None,
428
+ model_kwargs=None,
429
+ device=None,
430
+ progress=False,
431
+ ):
432
+ """
433
+ Generate samples from the model.
434
+ :param model: the model module.
435
+ :param shape: the shape of the samples, (N, C, H, W).
436
+ :param noise: if specified, the noise from the encoder to sample.
437
+ Should be of the same shape as `shape`.
438
+ :param clip_denoised: if True, clip x_start predictions to [-1, 1].
439
+ :param denoised_fn: if not None, a function which applies to the
440
+ x_start prediction before it is used to sample.
441
+ :param cond_fn: if not None, this is a gradient function that acts
442
+ similarly to the model.
443
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
444
+ pass to the model. This can be used for conditioning.
445
+ :param device: if specified, the device to create the samples on.
446
+ If not specified, use a model parameter's device.
447
+ :param progress: if True, show a tqdm progress bar.
448
+ :return: a non-differentiable batch of samples.
449
+ """
450
+ final = None
451
+ for sample in self.p_sample_loop_progressive(
452
+ model,
453
+ shape,
454
+ noise=noise,
455
+ clip_denoised=clip_denoised,
456
+ denoised_fn=denoised_fn,
457
+ cond_fn=cond_fn,
458
+ model_kwargs=model_kwargs,
459
+ device=device,
460
+ progress=progress,
461
+ ):
462
+ final = sample
463
+ return final["sample"]
464
+
465
+ def inpaint_p_sample_loop(
466
+ self,
467
+ model,
468
+ shape,
469
+ x0,
470
+ mask,
471
+ noise=None,
472
+ clip_denoised=True,
473
+ denoised_fn=None,
474
+ cond_fn=None,
475
+ model_kwargs=None,
476
+ device=None,
477
+ progress=False,
478
+ jump_length=10,
479
+ jump_n_sample=10,
480
+ ):
481
+ """
482
+ Generate samples from the model.
483
+ :param model: the model module.
484
+ :param shape: the shape of the samples, (N, C, H, W).
485
+ :param noise: if specified, the noise from the encoder to sample.
486
+ Should be of the same shape as `shape`.
487
+ :param clip_denoised: if True, clip x_start predictions to [-1, 1].
488
+ :param denoised_fn: if not None, a function which applies to the
489
+ x_start prediction before it is used to sample.
490
+ :param cond_fn: if not None, this is a gradient function that acts
491
+ similarly to the model.
492
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
493
+ pass to the model. This can be used for conditioning.
494
+ :param device: if specified, the device to create the samples on.
495
+ If not specified, use a model parameter's device.
496
+ :param progress: if True, show a tqdm progress bar.
497
+ :return: a non-differentiable batch of samples.
498
+ """
499
+ final = None
500
+ for sample in self.inpaint_p_sample_loop_progressive(
501
+ model,
502
+ shape,
503
+ x0,
504
+ mask,
505
+ noise=noise,
506
+ clip_denoised=clip_denoised,
507
+ denoised_fn=denoised_fn,
508
+ cond_fn=cond_fn,
509
+ model_kwargs=model_kwargs,
510
+ device=device,
511
+ progress=progress,
512
+ jump_length=jump_length,
513
+ jump_n_sample=jump_n_sample,
514
+ ):
515
+ final = sample
516
+ return final["sample"]
517
+
518
+ def inpaint_p_sample_loop_progressive(
519
+ self,
520
+ model,
521
+ shape,
522
+ x0,
523
+ mask,
524
+ noise=None,
525
+ clip_denoised=True,
526
+ denoised_fn=None,
527
+ cond_fn=None,
528
+ model_kwargs=None,
529
+ device=None,
530
+ progress=False,
531
+ jump_length=10,
532
+ jump_n_sample=10,
533
+ ):
534
+ """
535
+ Generate samples from the model and yield intermediate samples from
536
+ each timestep of diffusion.
537
+
538
+ Arguments are the same as p_sample_loop().
539
+ Returns a generator over dicts, where each dict is the return value of
540
+ p_sample().
541
+ """
542
+ # if device is None:
543
+ # device = next(model.parameters()).device
544
+ # assert isinstance(shape, (tuple, list))
545
+ # if noise is not None:
546
+ # img = noise
547
+ # else:
548
+ # img = th.randn(*shape, device=device)
549
+ # indices = list(range(self.num_timesteps))[::-1]
550
+
551
+ # if progress:
552
+ # # Lazy import so that we don't depend on tqdm.
553
+ # from tqdm.auto import tqdm
554
+
555
+ # indices = tqdm(indices)
556
+ # pred_xstart = None
557
+ # for i in indices:
558
+ # t = th.tensor([i] * shape[0], device=device)
559
+ # with th.no_grad():
560
+ # out = self.inpaint_p_sample(
561
+ # model,
562
+ # img,
563
+ # t,
564
+ # x0,
565
+ # mask,
566
+ # clip_denoised=clip_denoised,
567
+ # denoised_fn=denoised_fn,
568
+ # cond_fn=cond_fn,
569
+ # model_kwargs=model_kwargs,
570
+ # pred_xstart=pred_xstart,
571
+ # )
572
+ # yield out
573
+ # img = out["sample"]
574
+ # pred_xstart = out["pred_xstart"]
575
+
576
+ if device is None:
577
+ device = next(model.parameters()).device
578
+ assert isinstance(shape, (tuple, list))
579
+ if noise is not None:
580
+ image_after_step = noise
581
+ else:
582
+ image_after_step = th.randn(*shape, device=device)
583
+
584
+ self.gt_noises = None # reset for next image
585
+
586
+
587
+ pred_xstart = None
588
+
589
+ idx_wall = -1
590
+ sample_idxs = defaultdict(lambda: 0)
591
+
592
+ times = get_schedule_jump(t_T=250, n_sample=1, jump_length=jump_length, jump_n_sample=jump_n_sample)
593
+ time_pairs = list(zip(times[:-1], times[1:]))
594
+
595
+ if progress:
596
+ from tqdm.auto import tqdm
597
+ time_pairs = tqdm(time_pairs)
598
+
599
+ for t_last, t_cur in time_pairs:
600
+ idx_wall += 1
601
+ t_last_t = th.tensor([t_last] * shape[0], # pylint: disable=not-callable
602
+ device=device)
603
+
604
+ if t_cur < t_last: # reverse
605
+ with th.no_grad():
606
+ image_before_step = image_after_step.clone()
607
+ out = self.inpaint_p_sample(
608
+ model,
609
+ image_after_step,
610
+ t_last_t,
611
+ x0,
612
+ mask,
613
+ clip_denoised=clip_denoised,
614
+ denoised_fn=denoised_fn,
615
+ cond_fn=cond_fn,
616
+ model_kwargs=model_kwargs,
617
+ pred_xstart=pred_xstart
618
+ )
619
+ image_after_step = out["sample"]
620
+ pred_xstart = out["pred_xstart"]
621
+
622
+ sample_idxs[t_cur] += 1
623
+
624
+ yield out
625
+
626
+ else:
627
+ t_shift = 1
628
+ image_before_step = image_after_step.clone()
629
+ image_after_step = self.undo(
630
+ image_before_step, image_after_step,
631
+ est_x_0=out['pred_xstart'], t=t_last_t+t_shift, debug=False)
632
+ pred_xstart = out["pred_xstart"]
633
+
634
+ def inpaint_p_sample(
635
+ self,
636
+ model,
637
+ x,
638
+ t,
639
+ x0,
640
+ mask,
641
+ clip_denoised=True,
642
+ denoised_fn=None,
643
+ cond_fn=None,
644
+ model_kwargs=None,
645
+ pred_xstart=None,
646
+ ):
647
+ """
648
+ Sample x_{t-1} from the model at the given timestep.
649
+ :param model: the model to sample from.
650
+ :param x: the current tensor at x_{t-1}.
651
+ :param t: the value of t, starting at 0 for the first diffusion step.
652
+ :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
653
+ :param denoised_fn: if not None, a function which applies to the
654
+ x_start prediction before it is used to sample.
655
+ :param cond_fn: if not None, this is a gradient function that acts
656
+ similarly to the model.
657
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
658
+ pass to the model. This can be used for conditioning.
659
+ :return: a dict containing the following keys:
660
+ - 'sample': a random sample from the model.
661
+ - 'pred_xstart': a prediction of x_0.
662
+ """
663
+ noise = th.randn_like(x)
664
+
665
+ if pred_xstart is not None:
666
+ alpha_cumprod = _extract_into_tensor(
667
+ self.alphas_cumprod, t, x.shape)
668
+ weighed_gt = th.sqrt(alpha_cumprod) * x0 + th.sqrt((1 - alpha_cumprod)) * th.randn_like(x)
669
+
670
+ x = (1 - mask) * weighed_gt + mask * x
671
+
672
+ out = self.p_mean_variance(
673
+ model,
674
+ x,
675
+ t,
676
+ clip_denoised=clip_denoised,
677
+ denoised_fn=denoised_fn,
678
+ model_kwargs=model_kwargs,
679
+ )
680
+
681
+ nonzero_mask = (
682
+ (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
683
+ )
684
+
685
+ if cond_fn is not None:
686
+ out["mean"] = self.condition_mean(
687
+ cond_fn, out, x, t, model_kwargs=model_kwargs
688
+ )
689
+
690
+ sample = out["mean"] + nonzero_mask * \
691
+ th.exp(0.5 * out["log_variance"]) * noise
692
+
693
+ result = {"sample": sample,
694
+ "pred_xstart": out["pred_xstart"], 'gt': model_kwargs.get('gt')}
695
+
696
+ return result
697
+
698
+ def undo(self, image_before_step, img_after_model, est_x_0, t, debug=False):
699
+ return self._undo(img_after_model, t)
700
+
701
+ def _undo(self, img_out, t):
702
+ beta = _extract_into_tensor(self.betas, t, img_out.shape)
703
+
704
+ img_in_est = th.sqrt(1 - beta) * img_out + \
705
+ th.sqrt(beta) * th.randn_like(img_out)
706
+
707
+ return img_in_est
708
+
709
+ def p_sample_loop_progressive(
710
+ self,
711
+ model,
712
+ shape,
713
+ noise=None,
714
+ clip_denoised=True,
715
+ denoised_fn=None,
716
+ cond_fn=None,
717
+ model_kwargs=None,
718
+ device=None,
719
+ progress=False,
720
+ ):
721
+ """
722
+ Generate samples from the model and yield intermediate samples from
723
+ each timestep of diffusion.
724
+ Arguments are the same as p_sample_loop().
725
+ Returns a generator over dicts, where each dict is the return value of
726
+ p_sample().
727
+ """
728
+ if device is None:
729
+ device = next(model.parameters()).device
730
+ assert isinstance(shape, (tuple, list))
731
+ if noise is not None:
732
+ img = noise
733
+ else:
734
+ img = th.randn(*shape, device=device)
735
+ indices = list(range(self.num_timesteps))[::-1]
736
+
737
+ if progress:
738
+ # Lazy import so that we don't depend on tqdm.
739
+ from tqdm.auto import tqdm
740
+
741
+ indices = tqdm(indices)
742
+
743
+ for i in indices:
744
+ t = th.tensor([i] * shape[0], device=device)
745
+ with th.no_grad():
746
+ out = self.p_sample(
747
+ model,
748
+ img,
749
+ t,
750
+ clip_denoised=clip_denoised,
751
+ denoised_fn=denoised_fn,
752
+ cond_fn=cond_fn,
753
+ model_kwargs=model_kwargs,
754
+ )
755
+ yield out
756
+ img = out["sample"]
757
+
758
+ def ddim_sample(
759
+ self,
760
+ model,
761
+ x,
762
+ t,
763
+ clip_denoised=True,
764
+ denoised_fn=None,
765
+ cond_fn=None,
766
+ model_kwargs=None,
767
+ eta=0.0,
768
+ ):
769
+ """
770
+ Sample x_{t-1} from the model using DDIM.
771
+ Same usage as p_sample().
772
+ """
773
+ out = self.p_mean_variance(
774
+ model,
775
+ x,
776
+ t,
777
+ clip_denoised=clip_denoised,
778
+ denoised_fn=denoised_fn,
779
+ model_kwargs=model_kwargs,
780
+ )
781
+ if cond_fn is not None:
782
+ out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
783
+
784
+ # Usually our model outputs epsilon, but we re-derive it
785
+ # in case we used x_start or x_prev prediction.
786
+ eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
787
+
788
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
789
+ alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
790
+ sigma = (
791
+ eta
792
+ * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
793
+ * th.sqrt(1 - alpha_bar / alpha_bar_prev)
794
+ )
795
+ # Equation 12.
796
+ noise = th.randn_like(x)
797
+ mean_pred = (
798
+ out["pred_xstart"] * th.sqrt(alpha_bar_prev)
799
+ + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
800
+ )
801
+ nonzero_mask = (
802
+ (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
803
+ ) # no noise when t == 0
804
+ sample = mean_pred + nonzero_mask * sigma * noise
805
+ return {"sample": sample, "pred_xstart": out["pred_xstart"]}
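For reference, the step this method computes ("Equation 12" in the comment refers to the DDIM paper) can be written compactly; eta = 0 gives the deterministic DDIM update, while larger eta re-injects noise:

```latex
\sigma_t = \eta \sqrt{\tfrac{1-\bar\alpha_{t-1}}{1-\bar\alpha_t}}
           \sqrt{1-\tfrac{\bar\alpha_t}{\bar\alpha_{t-1}}}, \qquad
x_{t-1} = \sqrt{\bar\alpha_{t-1}}\,\hat x_0
          + \sqrt{1-\bar\alpha_{t-1}-\sigma_t^2}\,\hat\epsilon_\theta(x_t, t)
          + \sigma_t z, \quad z \sim \mathcal{N}(0, I)
```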
806
+
807
+ def ddim_reverse_sample(
808
+ self,
809
+ model,
810
+ x,
811
+ t,
812
+ clip_denoised=True,
813
+ denoised_fn=None,
814
+ cond_fn=None,
815
+ model_kwargs=None,
816
+ eta=0.0,
817
+ ):
818
+ """
819
+ Sample x_{t+1} from the model using DDIM reverse ODE.
820
+ """
821
+ assert eta == 0.0, "Reverse ODE only for deterministic path"
822
+ out = self.p_mean_variance(
823
+ model,
824
+ x,
825
+ t,
826
+ clip_denoised=clip_denoised,
827
+ denoised_fn=denoised_fn,
828
+ model_kwargs=model_kwargs,
829
+ )
830
+ if cond_fn is not None:
831
+ out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
832
+ # Usually our model outputs epsilon, but we re-derive it
833
+ # in case we used x_start or x_prev prediction.
834
+ eps = (
835
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
836
+ - out["pred_xstart"]
837
+ ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
838
+ alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
839
+
840
+ # Equation 12. reversed
841
+ mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps
842
+
843
+ return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
844
+
845
+ def ddim_sample_loop(
846
+ self,
847
+ model,
848
+ shape,
849
+ noise=None,
850
+ clip_denoised=True,
851
+ denoised_fn=None,
852
+ cond_fn=None,
853
+ model_kwargs=None,
854
+ device=None,
855
+ progress=False,
856
+ eta=0.0,
857
+ ):
858
+ """
859
+ Generate samples from the model using DDIM.
860
+ Same usage as p_sample_loop().
861
+ """
862
+ final = None
863
+ for sample in self.ddim_sample_loop_progressive(
864
+ model,
865
+ shape,
866
+ noise=noise,
867
+ clip_denoised=clip_denoised,
868
+ denoised_fn=denoised_fn,
869
+ cond_fn=cond_fn,
870
+ model_kwargs=model_kwargs,
871
+ device=device,
872
+ progress=progress,
873
+ eta=eta,
874
+ ):
875
+ final = sample
876
+ return final["sample"]
877
+
878
+ def ddim_sample_loop_progressive(
879
+ self,
880
+ model,
881
+ shape,
882
+ noise=None,
883
+ clip_denoised=True,
884
+ denoised_fn=None,
885
+ cond_fn=None,
886
+ model_kwargs=None,
887
+ device=None,
888
+ progress=False,
889
+ eta=0.0,
890
+ ):
891
+ """
892
+ Use DDIM to sample from the model and yield intermediate samples from
893
+ each timestep of DDIM.
894
+ Same usage as p_sample_loop_progressive().
895
+ """
896
+ if device is None:
897
+ device = next(model.parameters()).device
898
+ assert isinstance(shape, (tuple, list))
899
+ if noise is not None:
900
+ img = noise
901
+ else:
902
+ img = th.randn(*shape, device=device)
903
+ indices = list(range(self.num_timesteps))[::-1]
904
+
905
+ if progress:
906
+ # Lazy import so that we don't depend on tqdm.
907
+ from tqdm.auto import tqdm
908
+
909
+ indices = tqdm(indices)
910
+
911
+ for i in indices:
912
+ t = th.tensor([i] * shape[0], device=device)
913
+ with th.no_grad():
914
+ out = self.ddim_sample(
915
+ model,
916
+ img,
917
+ t,
918
+ clip_denoised=clip_denoised,
919
+ denoised_fn=denoised_fn,
920
+ cond_fn=cond_fn,
921
+ model_kwargs=model_kwargs,
922
+ eta=eta,
923
+ )
924
+ yield out
925
+ img = out["sample"]
926
+
927
+ def _vb_terms_bpd(
928
+ self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
929
+ ):
930
+ """
931
+ Get a term for the variational lower-bound.
932
+ The resulting units are bits (rather than nats, as one might expect).
933
+ This allows for comparison to other papers.
934
+ :return: a dict with the following keys:
935
+ - 'output': a shape [N] tensor of NLLs or KLs.
936
+ - 'pred_xstart': the x_0 predictions.
937
+ """
938
+ true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
939
+ x_start=x_start, x_t=x_t, t=t
940
+ )
941
+ out = self.p_mean_variance(
942
+ model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
943
+ )
944
+ kl = normal_kl(
945
+ true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
946
+ )
947
+ kl = mean_flat(kl) / np.log(2.0)
948
+
949
+ decoder_nll = -discretized_gaussian_log_likelihood(
950
+ x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
951
+ )
952
+ assert decoder_nll.shape == x_start.shape
953
+ decoder_nll = mean_flat(decoder_nll) / np.log(2.0)
954
+
955
+ # At the first timestep return the decoder NLL,
956
+ # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
957
+ output = th.where((t == 0), decoder_nll, kl)
958
+ return {"output": output, "pred_xstart": out["pred_xstart"]}
959
+
960
+ def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
961
+ """
962
+ Compute training losses for a single timestep.
963
+ :param model: the model to evaluate loss on.
964
+ :param x_start: the [N x C x ...] tensor of inputs.
965
+ :param t: a batch of timestep indices.
966
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
967
+ pass to the model. This can be used for conditioning.
968
+ :param noise: if specified, the specific Gaussian noise to try to remove.
969
+ :return: a dict with the key "loss" containing a tensor of shape [N].
970
+ Some mean or variance settings may also have other keys.
971
+ """
972
+ if model_kwargs is None:
973
+ model_kwargs = {}
974
+ if noise is None:
975
+ noise = th.randn_like(x_start)
976
+ x_t = self.q_sample(x_start, t, noise=noise)
977
+
978
+ terms = {}
979
+
980
+ if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
981
+ terms["loss"] = self._vb_terms_bpd(
982
+ model=model,
983
+ x_start=x_start,
984
+ x_t=x_t,
985
+ t=t,
986
+ clip_denoised=False,
987
+ model_kwargs=model_kwargs,
988
+ )["output"]
989
+ if self.loss_type == LossType.RESCALED_KL:
990
+ terms["loss"] *= self.num_timesteps
991
+ elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
992
+ model_output = model(x_t, t, **model_kwargs)
993
+
994
+ if self.model_var_type in [
995
+ ModelVarType.LEARNED,
996
+ ModelVarType.LEARNED_RANGE,
997
+ ]:
998
+ B, C = x_t.shape[:2]
999
+ assert model_output.shape == (B, C * 2, *x_t.shape[2:])
1000
+ model_output, model_var_values = th.split(model_output, C, dim=1)
1001
+ # Learn the variance using the variational bound, but don't let
1002
+ # it affect our mean prediction.
1003
+ frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
1004
+ terms["vb"] = self._vb_terms_bpd(
1005
+ model=lambda *args, r=frozen_out: r,
1006
+ x_start=x_start,
1007
+ x_t=x_t,
1008
+ t=t,
1009
+ clip_denoised=False,
1010
+ )["output"]
1011
+ if self.loss_type == LossType.RESCALED_MSE:
1012
+ # Divide by 1000 for equivalence with initial implementation.
1013
+ # Without a factor of 1/1000, the VB term hurts the MSE term.
1014
+ terms["vb"] *= self.num_timesteps / 1000.0
1015
+
1016
+ target = {
1017
+ ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
1018
+ x_start=x_start, x_t=x_t, t=t
1019
+ )[0],
1020
+ ModelMeanType.START_X: x_start,
1021
+ ModelMeanType.EPSILON: noise,
1022
+ }[self.model_mean_type]
1023
+ assert model_output.shape == target.shape == x_start.shape
1024
+ terms["mse"] = mean_flat((target - model_output) ** 2)
1025
+ if "vb" in terms:
1026
+ terms["loss"] = terms["mse"] + terms["vb"]
1027
+ else:
1028
+ terms["loss"] = terms["mse"]
1029
+ else:
1030
+ raise NotImplementedError(self.loss_type)
1031
+
1032
+ return terms
1033
+
1034
+ def _prior_bpd(self, x_start):
1035
+ """
1036
+ Get the prior KL term for the variational lower-bound, measured in
1037
+ bits-per-dim.
1038
+ This term can't be optimized, as it only depends on the encoder.
1039
+ :param x_start: the [N x C x ...] tensor of inputs.
1040
+ :return: a batch of [N] KL values (in bits), one per batch element.
1041
+ """
1042
+ batch_size = x_start.shape[0]
1043
+ t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
1044
+ qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
1045
+ kl_prior = normal_kl(
1046
+ mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
1047
+ )
1048
+ return mean_flat(kl_prior) / np.log(2.0)
1049
+
1050
+ def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
1051
+ """
1052
+ Compute the entire variational lower-bound, measured in bits-per-dim,
1053
+ as well as other related quantities.
1054
+ :param model: the model to evaluate loss on.
1055
+ :param x_start: the [N x C x ...] tensor of inputs.
1056
+ :param clip_denoised: if True, clip denoised samples.
1057
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
1058
+ pass to the model. This can be used for conditioning.
1059
+ :return: a dict containing the following keys:
1060
+ - total_bpd: the total variational lower-bound, per batch element.
1061
+ - prior_bpd: the prior term in the lower-bound.
1062
+ - vb: an [N x T] tensor of terms in the lower-bound.
1063
+ - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
1064
+ - mse: an [N x T] tensor of epsilon MSEs for each timestep.
1065
+ """
1066
+ device = x_start.device
1067
+ batch_size = x_start.shape[0]
1068
+
1069
+ vb = []
1070
+ xstart_mse = []
1071
+ mse = []
1072
+ for t in list(range(self.num_timesteps))[::-1]:
1073
+ t_batch = th.tensor([t] * batch_size, device=device)
1074
+ noise = th.randn_like(x_start)
1075
+ x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
1076
+ # Calculate VLB term at the current timestep
1077
+ with th.no_grad():
1078
+ out = self._vb_terms_bpd(
1079
+ model,
1080
+ x_start=x_start,
1081
+ x_t=x_t,
1082
+ t=t_batch,
1083
+ clip_denoised=clip_denoised,
1084
+ model_kwargs=model_kwargs,
1085
+ )
1086
+ vb.append(out["output"])
1087
+ xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
1088
+ eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
1089
+ mse.append(mean_flat((eps - noise) ** 2))
1090
+
1091
+ vb = th.stack(vb, dim=1)
1092
+ xstart_mse = th.stack(xstart_mse, dim=1)
1093
+ mse = th.stack(mse, dim=1)
1094
+
1095
+ prior_bpd = self._prior_bpd(x_start)
1096
+ total_bpd = vb.sum(dim=1) + prior_bpd
1097
+ return {
1098
+ "total_bpd": total_bpd,
1099
+ "prior_bpd": prior_bpd,
1100
+ "vb": vb,
1101
+ "xstart_mse": xstart_mse,
1102
+ "mse": mse,
1103
+ }
1104
+
1105
+
1106
+ def _extract_into_tensor(arr, timesteps, broadcast_shape):
1107
+ """
1108
+ Extract values from a 1-D numpy array for a batch of indices.
1109
+ :param arr: the 1-D numpy array.
1110
+ :param timesteps: a tensor of indices into the array to extract.
1111
+ :param broadcast_shape: a larger shape of K dimensions with the batch
1112
+ dimension equal to the length of timesteps.
1113
+ :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
1114
+ """
1115
+ res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
1116
+ while len(res.shape) < len(broadcast_shape):
1117
+ res = res[..., None]
1118
+ return res + th.zeros(broadcast_shape, device=timesteps.device)
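The _extract_into_tensor helper above is the glue used throughout this file: it gathers one scalar per batch element from a 1-D schedule table and broadcasts it against an image-shaped tensor. A hedged sketch (the import path is an assumption, and the function is module-private upstream):

```python
import numpy as np
import torch as th

# Assumes this file is importable as diffusion.gaussian_diffusion.
from diffusion.gaussian_diffusion import _extract_into_tensor

table = np.linspace(0.0, 1.0, 1000)      # e.g. an alphas_cumprod-style lookup table
t = th.tensor([0, 499, 999])             # one timestep per batch element
out = _extract_into_tensor(table, t, (3, 3, 64, 64))

print(out.shape)          # torch.Size([3, 3, 64, 64])
print(out[:, 0, 0, 0])    # tensor([0.0000, 0.4995, 1.0000]) -- broadcast per sample
```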
diffusion/respace.py ADDED
@@ -0,0 +1,129 @@
1
+ # Modified from OpenAI's diffusion repos
2
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
3
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
4
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
5
+
6
+ import numpy as np
7
+ import torch as th
8
+
9
+ from .gaussian_diffusion import GaussianDiffusion
10
+
11
+
12
+ def space_timesteps(num_timesteps, section_counts):
13
+ """
14
+ Create a list of timesteps to use from an original diffusion process,
15
+ given the number of timesteps we want to take from equally-sized portions
16
+ of the original process.
17
+ For example, if there's 300 timesteps and the section counts are [10,15,20]
18
+ then the first 100 timesteps are strided to be 10 timesteps, the second 100
19
+ are strided to be 15 timesteps, and the final 100 are strided to be 20.
20
+ If the stride is a string starting with "ddim", then the fixed striding
21
+ from the DDIM paper is used, and only one section is allowed.
22
+ :param num_timesteps: the number of diffusion steps in the original
23
+ process to divide up.
24
+ :param section_counts: either a list of numbers, or a string containing
25
+ comma-separated numbers, indicating the step count
26
+ per section. As a special case, use "ddimN" where N
27
+ is a number of steps to use the striding from the
28
+ DDIM paper.
29
+ :return: a set of diffusion steps from the original process to use.
30
+ """
31
+ if isinstance(section_counts, str):
32
+ if section_counts.startswith("ddim"):
33
+ desired_count = int(section_counts[len("ddim") :])
34
+ for i in range(1, num_timesteps):
35
+ if len(range(0, num_timesteps, i)) == desired_count:
36
+ return set(range(0, num_timesteps, i))
37
+ raise ValueError(
38
+ f"cannot create exactly {num_timesteps} steps with an integer stride"
39
+ )
40
+ section_counts = [int(x) for x in section_counts.split(",")]
41
+ size_per = num_timesteps // len(section_counts)
42
+ extra = num_timesteps % len(section_counts)
43
+ start_idx = 0
44
+ all_steps = []
45
+ for i, section_count in enumerate(section_counts):
46
+ size = size_per + (1 if i < extra else 0)
47
+ if size < section_count:
48
+ raise ValueError(
49
+ f"cannot divide section of {size} steps into {section_count}"
50
+ )
51
+ if section_count <= 1:
52
+ frac_stride = 1
53
+ else:
54
+ frac_stride = (size - 1) / (section_count - 1)
55
+ cur_idx = 0.0
56
+ taken_steps = []
57
+ for _ in range(section_count):
58
+ taken_steps.append(start_idx + round(cur_idx))
59
+ cur_idx += frac_stride
60
+ all_steps += taken_steps
61
+ start_idx += size
62
+ return set(all_steps)
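Two concrete uses of space_timesteps (a sketch; the diffusion.respace import path is an assumption about this commit's layout), matching the string forms described in the docstring above:

```python
# Assumes this file is importable as diffusion.respace.
from diffusion.respace import space_timesteps

# Retain 250 evenly spaced steps out of an original 1000-step process.
assert len(space_timesteps(1000, "250")) == 250

# "ddimN" uses a fixed integer stride; 1000 steps with stride 40 gives 25 steps.
assert space_timesteps(1000, "ddim25") == set(range(0, 1000, 40))

# Three sections of the original process, strided to 10, 15 and 20 steps each.
assert len(space_timesteps(1000, "10,15,20")) == 45
```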
63
+
64
+
65
+ class SpacedDiffusion(GaussianDiffusion):
66
+ """
67
+ A diffusion process which can skip steps in a base diffusion process.
68
+ :param use_timesteps: a collection (sequence or set) of timesteps from the
69
+ original diffusion process to retain.
70
+ :param kwargs: the kwargs to create the base diffusion process.
71
+ """
72
+
73
+ def __init__(self, use_timesteps, **kwargs):
74
+ self.use_timesteps = set(use_timesteps)
75
+ self.timestep_map = []
76
+ self.original_num_steps = len(kwargs["betas"])
77
+
78
+ base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa
79
+ last_alpha_cumprod = 1.0
80
+ new_betas = []
81
+ for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
82
+ if i in self.use_timesteps:
83
+ new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
84
+ last_alpha_cumprod = alpha_cumprod
85
+ self.timestep_map.append(i)
86
+ kwargs["betas"] = np.array(new_betas)
87
+ super().__init__(**kwargs)
88
+
89
+ def p_mean_variance(
90
+ self, model, *args, **kwargs
91
+ ): # pylint: disable=signature-differs
92
+ return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)
93
+
94
+ def training_losses(
95
+ self, model, *args, **kwargs
96
+ ): # pylint: disable=signature-differs
97
+ return super().training_losses(self._wrap_model(model), *args, **kwargs)
98
+
99
+ def condition_mean(self, cond_fn, *args, **kwargs):
100
+ return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)
101
+
102
+ def condition_score(self, cond_fn, *args, **kwargs):
103
+ return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)
104
+
105
+ def _wrap_model(self, model):
106
+ if isinstance(model, _WrappedModel):
107
+ return model
108
+ return _WrappedModel(
109
+ model, self.timestep_map, self.original_num_steps
110
+ )
111
+
112
+ def _scale_timesteps(self, t):
113
+ # Scaling is done by the wrapped model.
114
+ return t
115
+
116
+
117
+ class _WrappedModel:
118
+ def __init__(self, model, timestep_map, original_num_steps):
119
+ self.model = model
120
+ self.timestep_map = timestep_map
121
+ # self.rescale_timesteps = rescale_timesteps
122
+ self.original_num_steps = original_num_steps
123
+
124
+ def __call__(self, x, ts, **kwargs):
125
+ map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
126
+ new_ts = map_tensor[ts]
127
+ # if self.rescale_timesteps:
128
+ # new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
129
+ return self.model(x, new_ts, **kwargs)
diffusion/scheduler.py ADDED
@@ -0,0 +1,224 @@
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ def get_schedule(t_T, t_0, n_sample, n_steplength, debug=0):
18
+ if n_steplength > 1:
19
+ if not n_sample > 1:
20
+ raise RuntimeError('n_steplength has no effect if n_sample=1')
21
+
22
+ t = t_T
23
+ times = [t]
24
+ while t >= 0:
25
+ t = t - 1
26
+ times.append(t)
27
+ n_steplength_cur = min(n_steplength, t_T - t)
28
+
29
+ for _ in range(n_sample - 1):
30
+
31
+ for _ in range(n_steplength_cur):
32
+ t = t + 1
33
+ times.append(t)
34
+ for _ in range(n_steplength_cur):
35
+ t = t - 1
36
+ times.append(t)
37
+
38
+ _check_times(times, t_0, t_T)
39
+
40
+ if debug == 2:
41
+ for x in [list(range(0, 50)), list(range(-1, -50, -1))]:
42
+ _plot_times(x=x, times=[times[i] for i in x])
43
+
44
+ return times
45
+
46
+
47
+ def _check_times(times, t_0, t_T):
48
+ # Check end
49
+ assert times[0] > times[1], (times[0], times[1])
50
+
51
+ # Check beginning
52
+ assert times[-1] == -1, times[-1]
53
+
54
+ # Steplength = 1
55
+ for t_last, t_cur in zip(times[:-1], times[1:]):
56
+ assert abs(t_last - t_cur) == 1, (t_last, t_cur)
57
+
58
+ # Value range
59
+ for t in times:
60
+ assert t >= t_0, (t, t_0)
61
+ assert t <= t_T, (t, t_T)
62
+
63
+
64
+ def _plot_times(x, times):
65
+ import matplotlib.pyplot as plt
66
+ plt.plot(x, times)
67
+ plt.show()
68
+
69
+
70
+ def get_schedule_jump(t_T, n_sample, jump_length, jump_n_sample,
71
+ jump2_length=1, jump2_n_sample=1,
72
+ jump3_length=1, jump3_n_sample=1,
73
+ start_resampling=100000000):
74
+
75
+ jumps = {}
76
+ for j in range(0, t_T - jump_length, jump_length):
77
+ jumps[j] = jump_n_sample - 1
78
+
79
+ jumps2 = {}
80
+ for j in range(0, t_T - jump2_length, jump2_length):
81
+ jumps2[j] = jump2_n_sample - 1
82
+
83
+ jumps3 = {}
84
+ for j in range(0, t_T - jump3_length, jump3_length):
85
+ jumps3[j] = jump3_n_sample - 1
86
+
87
+ t = t_T
88
+ ts = []
89
+
90
+ while t >= 1:
91
+ t = t-1
92
+ ts.append(t)
93
+
94
+ if (
95
+ t + 1 < t_T - 1 and
96
+ t <= start_resampling
97
+ ):
98
+ for _ in range(n_sample - 1):
99
+ t = t + 1
100
+ ts.append(t)
101
+
102
+ if t >= 0:
103
+ t = t - 1
104
+ ts.append(t)
105
+
106
+ if (
107
+ jumps3.get(t, 0) > 0 and
108
+ t <= start_resampling - jump3_length
109
+ ):
110
+ jumps3[t] = jumps3[t] - 1
111
+ for _ in range(jump3_length):
112
+ t = t + 1
113
+ ts.append(t)
114
+
115
+ if (
116
+ jumps2.get(t, 0) > 0 and
117
+ t <= start_resampling - jump2_length
118
+ ):
119
+ jumps2[t] = jumps2[t] - 1
120
+ for _ in range(jump2_length):
121
+ t = t + 1
122
+ ts.append(t)
123
+ jumps3 = {}
124
+ for j in range(0, t_T - jump3_length, jump3_length):
125
+ jumps3[j] = jump3_n_sample - 1
126
+
127
+ if (
128
+ jumps.get(t, 0) > 0 and
129
+ t <= start_resampling - jump_length
130
+ ):
131
+ jumps[t] = jumps[t] - 1
132
+ for _ in range(jump_length):
133
+ t = t + 1
134
+ ts.append(t)
135
+ jumps2 = {}
136
+ for j in range(0, t_T - jump2_length, jump2_length):
137
+ jumps2[j] = jump2_n_sample - 1
138
+
139
+ jumps3 = {}
140
+ for j in range(0, t_T - jump3_length, jump3_length):
141
+ jumps3[j] = jump3_n_sample - 1
142
+
143
+ ts.append(-1)
144
+
145
+ _check_times(ts, -1, t_T)
146
+
147
+ return ts
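This is the RePaint-style resampling schedule that inpaint_p_sample_loop_progressive in diffusion/gaussian_diffusion.py consumes; a short sketch (the diffusion.scheduler import path is an assumption) shows how it becomes the (t_last, t_cur) pairs that the loop iterates over:

```python
# Assumes this file is importable as diffusion.scheduler.
from diffusion.scheduler import get_schedule_jump

# Same settings the inpainting sampler uses: 250 steps, jump back 10 steps,
# resampling each jumped region 10 times.
times = get_schedule_jump(t_T=250, n_sample=1, jump_length=10, jump_n_sample=10)
time_pairs = list(zip(times[:-1], times[1:]))

down = sum(1 for a, b in time_pairs if b < a)  # ordinary denoising transitions
up = sum(1 for a, b in time_pairs if b > a)    # re-noising ("undo") transitions
print(len(time_pairs), down, up)
assert times[0] == 249 and times[-1] == -1
```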
148
+
149
+
150
+ def get_schedule_jump_paper():
151
+ t_T = 250
152
+ jump_length = 10
153
+ jump_n_sample = 10
154
+
155
+ jumps = {}
156
+ for j in range(0, t_T - jump_length, jump_length):
157
+ jumps[j] = jump_n_sample - 1
158
+
159
+ t = t_T
160
+ ts = []
161
+
162
+ while t >= 1:
163
+ t = t-1
164
+ ts.append(t)
165
+
166
+ if jumps.get(t, 0) > 0:
167
+ jumps[t] = jumps[t] - 1
168
+ for _ in range(jump_length):
169
+ t = t + 1
170
+ ts.append(t)
171
+
172
+ ts.append(-1)
173
+
174
+ _check_times(ts, -1, t_T)
175
+
176
+ return ts
177
+
178
+
179
+ def get_schedule_jump_test(to_supplement=False):
180
+ ts = get_schedule_jump(t_T=250, n_sample=1,
181
+ jump_length=10, jump_n_sample=10,
182
+ jump2_length=1, jump2_n_sample=1,
183
+ jump3_length=1, jump3_n_sample=1,
184
+ start_resampling=250)
185
+
186
+ import matplotlib.pyplot as plt
187
+ SMALL_SIZE = 8*3
188
+ MEDIUM_SIZE = 10*3
189
+ BIGGER_SIZE = 12*3
190
+
191
+ plt.rc('font', size=SMALL_SIZE) # controls default text sizes
192
+ plt.rc('axes', titlesize=SMALL_SIZE) # fontsize of the axes title
193
+ plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
194
+ plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels
195
+ plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels
196
+ plt.rc('legend', fontsize=SMALL_SIZE) # legend fontsize
197
+ plt.rc('figure', titlesize=BIGGER_SIZE) # fontsize of the figure title
198
+
199
+ plt.plot(ts)
200
+
201
+ fig = plt.gcf()
202
+ fig.set_size_inches(20, 10)
203
+
204
+ ax = plt.gca()
205
+ ax.set_xlabel('Number of Transitions')
206
+ ax.set_ylabel('Diffusion time $t$')
207
+
208
+ fig.tight_layout()
209
+
210
+ if to_supplement:
211
+ out_path = "/cluster/home/alugmayr/gdiff/paper/supplement/figures/jump_sched.pdf"
212
+ plt.savefig(out_path)
213
+
214
+ out_path = "./schedule.png"
215
+ plt.savefig(out_path)
216
+ print(out_path)
217
+
218
+
219
+ def main():
220
+ get_schedule_jump_test()
221
+
222
+
223
+ if __name__ == "__main__":
224
+ main()
diffusion/timestep_sampler.py ADDED
@@ -0,0 +1,150 @@
1
+ # Modified from OpenAI's diffusion repos
2
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
3
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
4
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
5
+
6
+ from abc import ABC, abstractmethod
7
+
8
+ import numpy as np
9
+ import torch as th
10
+ import torch.distributed as dist
11
+
12
+
13
+ def create_named_schedule_sampler(name, diffusion):
14
+ """
15
+ Create a ScheduleSampler from a library of pre-defined samplers.
16
+ :param name: the name of the sampler.
17
+ :param diffusion: the diffusion object to sample for.
18
+ """
19
+ if name == "uniform":
20
+ return UniformSampler(diffusion)
21
+ elif name == "loss-second-moment":
22
+ return LossSecondMomentResampler(diffusion)
23
+ else:
24
+ raise NotImplementedError(f"unknown schedule sampler: {name}")
25
+
26
+
27
+ class ScheduleSampler(ABC):
28
+ """
29
+ A distribution over timesteps in the diffusion process, intended to reduce
30
+ variance of the objective.
31
+ By default, samplers perform unbiased importance sampling, in which the
32
+ objective's mean is unchanged.
33
+ However, subclasses may override sample() to change how the resampled
34
+ terms are reweighted, allowing for actual changes in the objective.
35
+ """
36
+
37
+ @abstractmethod
38
+ def weights(self):
39
+ """
40
+ Get a numpy array of weights, one per diffusion step.
41
+ The weights needn't be normalized, but must be positive.
42
+ """
43
+
44
+ def sample(self, batch_size, device):
45
+ """
46
+ Importance-sample timesteps for a batch.
47
+ :param batch_size: the number of timesteps.
48
+ :param device: the torch device to save to.
49
+ :return: a tuple (timesteps, weights):
50
+ - timesteps: a tensor of timestep indices.
51
+ - weights: a tensor of weights to scale the resulting losses.
52
+ """
53
+ w = self.weights()
54
+ p = w / np.sum(w)
55
+ indices_np = np.random.choice(len(p), size=(batch_size,), p=p)
56
+ indices = th.from_numpy(indices_np).long().to(device)
57
+ weights_np = 1 / (len(p) * p[indices_np])
58
+ weights = th.from_numpy(weights_np).float().to(device)
59
+ return indices, weights
60
+
61
+
62
+ class UniformSampler(ScheduleSampler):
63
+ def __init__(self, diffusion):
64
+ self.diffusion = diffusion
65
+ self._weights = np.ones([diffusion.num_timesteps])
66
+
67
+ def weights(self):
68
+ return self._weights
69
+
70
+
71
+ class LossAwareSampler(ScheduleSampler):
72
+ def update_with_local_losses(self, local_ts, local_losses):
73
+ """
74
+ Update the reweighting using losses from a model.
75
+ Call this method from each rank with a batch of timesteps and the
76
+ corresponding losses for each of those timesteps.
77
+ This method will perform synchronization to make sure all of the ranks
78
+ maintain the exact same reweighting.
79
+ :param local_ts: an integer Tensor of timesteps.
80
+ :param local_losses: a 1D Tensor of losses.
81
+ """
82
+ batch_sizes = [
83
+ th.tensor([0], dtype=th.int32, device=local_ts.device)
84
+ for _ in range(dist.get_world_size())
85
+ ]
86
+ dist.all_gather(
87
+ batch_sizes,
88
+ th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
89
+ )
90
+
91
+ # Pad all_gather batches to be the maximum batch size.
92
+ batch_sizes = [x.item() for x in batch_sizes]
93
+ max_bs = max(batch_sizes)
94
+
95
+ timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes]
96
+ loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes]
97
+ dist.all_gather(timestep_batches, local_ts)
98
+ dist.all_gather(loss_batches, local_losses)
99
+ timesteps = [
100
+ x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]
101
+ ]
102
+ losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
103
+ self.update_with_all_losses(timesteps, losses)
104
+
105
+ @abstractmethod
106
+ def update_with_all_losses(self, ts, losses):
107
+ """
108
+ Update the reweighting using losses from a model.
109
+ Sub-classes should override this method to update the reweighting
110
+ using losses from the model.
111
+ This method directly updates the reweighting without synchronizing
112
+ between workers. It is called by update_with_local_losses from all
113
+ ranks with identical arguments. Thus, it should have deterministic
114
+ behavior to maintain state across workers.
115
+ :param ts: a list of int timesteps.
116
+ :param losses: a list of float losses, one per timestep.
117
+ """
118
+
119
+
120
+ class LossSecondMomentResampler(LossAwareSampler):
121
+ def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
122
+ self.diffusion = diffusion
123
+ self.history_per_term = history_per_term
124
+ self.uniform_prob = uniform_prob
125
+ self._loss_history = np.zeros(
126
+ [diffusion.num_timesteps, history_per_term], dtype=np.float64
127
+ )
128
+ self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int64)
129
+
130
+ def weights(self):
131
+ if not self._warmed_up():
132
+ return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
133
+ weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
134
+ weights /= np.sum(weights)
135
+ weights *= 1 - self.uniform_prob
136
+ weights += self.uniform_prob / len(weights)
137
+ return weights
138
+
139
+ def update_with_all_losses(self, ts, losses):
140
+ for t, loss in zip(ts, losses):
141
+ if self._loss_counts[t] == self.history_per_term:
142
+ # Shift out the oldest loss term.
143
+ self._loss_history[t, :-1] = self._loss_history[t, 1:]
144
+ self._loss_history[t, -1] = loss
145
+ else:
146
+ self._loss_history[t, self._loss_counts[t]] = loss
147
+ self._loss_counts[t] += 1
148
+
149
+ def _warmed_up(self):
150
+ return (self._loss_counts == self.history_per_term).all()
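A quick usage sketch for the samplers defined in this file. The module path assumes the file lives at diffusion/timestep_sampler.py as added in this commit, and the stub diffusion object is an assumption: UniformSampler only reads num_timesteps from it.

# Hypothetical sketch: importance-sample training timesteps.
import types
import torch as th
from diffusion.timestep_sampler import create_named_schedule_sampler

diffusion = types.SimpleNamespace(num_timesteps=1000)   # stand-in; only num_timesteps is used here
sampler = create_named_schedule_sampler("uniform", diffusion)
t, weights = sampler.sample(batch_size=8, device=th.device("cpu"))
# t: (8,) long tensor of timestep indices; weights: (8,) floats, all 1.0 for the uniform sampler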
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ python3-opencv
requirements.txt ADDED
@@ -0,0 +1,223 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.34.2
3
+ aiofiles==23.2.1
4
+ aiohappyeyeballs==2.4.3
5
+ aiohttp==3.10.10
6
+ aiosignal==1.3.1
7
+ albumentations==0.5.2
8
+ annotated-types==0.7.0
9
+ antlr4-python3-runtime==4.9.3
10
+ anyio==4.4.0
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs==23.2.0
14
+ beautifulsoup4==4.12.3
15
+ bitsandbytes==0.44.1
16
+ boto==2.49.0
17
+ boto3==1.28.57
18
+ botocore==1.34.131
19
+ cachetools==5.5.0
20
+ certifi==2022.12.7
21
+ cffi==1.16.0
22
+ chardet==5.2.0
23
+ charset-normalizer==2.1.1
24
+ click==8.1.7
25
+ click-default-group==1.2.4
26
+ clip @ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
27
+ cmake==3.30.3
28
+ colorlog==6.8.2
29
+ commonmark==0.9.1
30
+ contourpy==1.1.1
31
+ cycler==0.12.1
32
+ decord==0.6.0
33
+ deepspeed==0.15.1
34
+ diffusers==0.25.0
35
+ docker-pycreds==0.4.0
36
+ ego4d==1.3.2
37
+ einops==0.8.0
38
+ embreex==2.17.7.post5
39
+ envlight @ git+https://github.com/ashawkey/envlight.git@05b5851e854429d72ecaf5b206ed64ce55fae677
40
+ exceptiongroup==1.2.2
41
+ fastapi==0.112.0
42
+ ffmpy==0.4.0
43
+ filelock==3.13.1
44
+ flatbuffers==24.3.25
45
+ fonttools==4.53.1
46
+ frozenlist==1.4.1
47
+ fsspec==2024.2.0
48
+ ftfy==6.2.3
49
+ gast==0.4.0
50
+ gdown==5.2.0
51
+ gevent==23.9.1
52
+ gevent-websocket==0.10.1
53
+ gitdb==4.0.11
54
+ GitPython==3.1.43
55
+ google-auth==2.35.0
56
+ google-auth-oauthlib==1.0.0
57
+ google-pasta==0.2.0
58
+ gradio==4.40.0
59
+ gradio_client==1.2.0
60
+ greenlet==2.0.2
61
+ grpcio==1.66.1
62
+ h11==0.14.0
63
+ h5py==3.11.0
64
+ hjson==3.1.0
65
+ httpcore==1.0.5
66
+ httpx==0.27.0
67
+ huggingface-hub==0.24.5
68
+ idna==3.4
69
+ imageio==2.34.2
70
+ imageio-ffmpeg==0.5.1
71
+ imgaug==0.4.0
72
+ importlib_metadata==8.2.0
73
+ importlib_resources==6.4.0
74
+ jax==0.4.13
75
+ jaxlib==0.4.13
76
+ jaxtyping==0.2.19
77
+ Jinja2==3.1.3
78
+ jmespath==1.0.1
79
+ jsonschema==4.23.0
80
+ jsonschema-specifications==2023.12.1
81
+ keras==2.13.1
82
+ kiwisolver==1.4.5
83
+ kornia==0.7.3
84
+ kornia_rs==0.1.5
85
+ lazy_loader==0.4
86
+ libclang==18.1.1
87
+ libigl==2.5.1
88
+ lightning-utilities==0.11.8
89
+ lit==18.1.8
90
+ lxml==5.3.0
91
+ manifold3d==2.5.1
92
+ Markdown==3.7
93
+ markdown-it-py==3.0.0
94
+ MarkupSafe==2.1.5
95
+ matplotlib==3.7.5
96
+ mdurl==0.1.2
97
+ mediapipe==0.10.11
98
+ ml-dtypes==0.2.0
99
+ mpmath==1.3.0
100
+ multidict==6.1.0
101
+ mypy-extensions==1.0.0
102
+ nerfacc @ git+https://github.com/KAIR-BAIR/nerfacc.git@d84cdf3afd7dcfc42150e0f0506db58a5ce62812
103
+ networkx==3.0
104
+ ninja==1.11.1.1
105
+ numpy==1.24.1
106
+ nvdiffrast @ git+https://github.com/NVlabs/nvdiffrast.git@729261dc64c4241ea36efda84fbf532cc8b425b8
107
+ nvidia-cublas-cu11==11.10.3.66
108
+ nvidia-cuda-cupti-cu11==11.7.101
109
+ nvidia-cuda-nvrtc-cu11==11.7.99
110
+ nvidia-cuda-runtime-cu11==11.7.99
111
+ nvidia-cudnn-cu11==8.5.0.96
112
+ nvidia-cufft-cu11==10.9.0.58
113
+ nvidia-curand-cu11==10.2.10.91
114
+ nvidia-cusolver-cu11==11.4.0.1
115
+ nvidia-cusparse-cu11==11.7.4.91
116
+ nvidia-nccl-cu11==2.14.3
117
+ nvidia-nvtx-cu11==11.7.91
118
+ oauthlib==3.2.2
119
+ omegaconf==2.3.0
120
+ open-clip-torch==2.7.0
121
+ opencv-contrib-python==4.10.0.84
122
+ opencv-python==4.10.0.84
123
+ opencv-python-headless==4.10.0.84
124
+ opt-einsum==3.3.0
125
+ orjson==3.10.6
126
+ packaging==24.1
127
+ pandas==2.0.3
128
+ pillow==10.2.0
129
+ pkgutil_resolve_name==1.3.10
130
+ platformdirs==4.3.6
131
+ prometheus-client==0.13.1
132
+ propcache==0.2.0
133
+ protobuf==3.20.3
134
+ psutil==6.0.0
135
+ py-cpuinfo==9.0.0
136
+ pyasn1==0.6.1
137
+ pyasn1_modules==0.4.1
138
+ pycollada==0.8
139
+ pycparser==2.22
140
+ pydantic==2.8.2
141
+ pydantic_core==2.20.1
142
+ pydub==0.25.1
143
+ Pygments==2.18.0
144
+ pyparsing==3.1.2
145
+ pyre-extensions==0.0.29
146
+ pysdf==0.1.9
147
+ PySocks==1.7.1
148
+ python-dateutil==2.9.0.post0
149
+ python-multipart==0.0.9
150
+ pytorch-lightning==2.1.0
151
+ pytz==2024.1
152
+ PyWavelets==1.4.1
153
+ PyYAML==6.0.1
154
+ referencing==0.35.1
155
+ regex==2024.7.24
156
+ requests==2.32.3
157
+ requests-oauthlib==2.0.0
158
+ rich==13.7.1
159
+ rich-click==1.6.1
160
+ rpds-py==0.20.0
161
+ rsa==4.9
162
+ Rtree==1.3.0
163
+ ruff==0.5.6
164
+ s3transfer==0.7.0
165
+ safetensors==0.4.3
166
+ scikit-image==0.21.0
167
+ scipy==1.10.1
168
+ segment-anything==1.0
169
+ semantic-version==2.10.0
170
+ sentencepiece==0.1.99
171
+ sentry-sdk==2.17.0
172
+ setproctitle==1.3.3
173
+ sh==1.14.3
174
+ shapely==2.0.6
175
+ shellingham==1.5.4
176
+ six==1.16.0
177
+ smmap==5.0.1
178
+ sniffio==1.3.1
179
+ sounddevice==0.4.7
180
+ soupsieve==2.6
181
+ starlette==0.37.2
182
+ svg.path==6.3
183
+ sympy==1.12
184
+ taming-transformers-rom1504==0.0.6
185
+ tensorboard==2.13.0
186
+ tensorboard-data-server==0.7.2
187
+ tensorflow==2.13.1
188
+ tensorflow-estimator==2.13.0
189
+ tensorflow-io-gcs-filesystem==0.34.0
190
+ termcolor==2.4.0
191
+ tifffile==2023.7.10
192
+ timm==0.9.12
193
+ tinycudann @ git+https://github.com/NVlabs/tiny-cuda-nn/@c91138bcd4c6877c8d5e60e483c0581aafc70cce#subdirectory=bindings/torch
194
+ tokenizers==0.20.0
195
+ tomlkit==0.12.0
196
+ torch==2.0.1+cu118
197
+ torchaudio==2.0.2+cu118
198
+ torchmetrics==1.5.0
199
+ torchvision==0.15.2+cu118
200
+ tqdm==4.66.4
201
+ transformers==4.45.1
202
+ trimesh==4.5.0
203
+ triton==2.0.0
204
+ typeguard==4.3.0
205
+ typer==0.12.3
206
+ typing-inspect==0.9.0
207
+ typing_extensions==4.12.2
208
+ tzdata==2024.1
209
+ urllib3==2.2.3
210
+ uvicorn==0.30.5
211
+ wandb==0.18.5
212
+ wcwidth==0.2.13
213
+ websockets==10.4
214
+ Werkzeug==3.0.4
215
+ wrapt==1.16.0
216
+ xatlas==0.0.9
217
+ xformers==0.0.20
218
+ xmltodict==0.12.0
219
+ xxhash==3.5.0
220
+ yarl==1.15.2
221
+ zipp==3.19.2
222
+ zope.event==5.0
223
+ zope.interface==6.0
segment_hoi.py ADDED
@@ -0,0 +1,111 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ from segment_anything import SamPredictor, SamAutomaticMaskGenerator, sam_model_registry
4
+
5
+
6
+ def show_mask(mask, ax, random_color=False):
7
+ if random_color:
8
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
9
+ else:
10
+ color = np.array([30/255, 144/255, 255/255, 0.6])
11
+ h, w = mask.shape[-2:]
12
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
13
+ ax.imshow(mask_image)
14
+
15
+
16
+ def show_points(coords, labels, ax, marker_size=375):
17
+ pos_points = coords[labels==1]
18
+ neg_points = coords[labels==0]
19
+ ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
20
+ ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
21
+
22
+
23
+ def show_box(box, ax):
24
+ x0, y0 = box[0], box[1]
25
+ w, h = box[2] - box[0], box[3] - box[1]
26
+ ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
27
+
28
+
29
+ def merge_bounding_boxes(bbox1, bbox2):
30
+ xmin1, ymin1, xmax1, ymax1 = bbox1
31
+ xmin2, ymin2, xmax2, ymax2 = bbox2
32
+
33
+ xmin_merged = min(xmin1, xmin2)
34
+ ymin_merged = min(ymin1, ymin2)
35
+ xmax_merged = max(xmax1, xmax2)
36
+ ymax_merged = max(ymax1, ymax2)
37
+
38
+ return np.array([xmin_merged, ymin_merged, xmax_merged, ymax_merged])
39
+
40
+
41
+ def init_sam(
42
+ device="cuda",
43
+ ckpt_path='/users/kchen157/scratch/weights/SAM/sam_vit_h_4b8939.pth'
44
+ ):
45
+ sam = sam_model_registry['vit_h'](checkpoint=ckpt_path)
46
+ sam.to(device=device)
47
+ predictor = SamPredictor(sam)
48
+ return predictor
49
+
50
+
51
+ def segment_hand_and_object(
52
+ predictor,
53
+ image,
54
+ hand_kpts,
55
+ hand_mask=None,
56
+ box_shift_ratio = 0.3,
57
+ box_size_factor = 2.,
58
+ area_threshold = 0.2,
59
+ overlap_threshold = 200):
60
+ # Find bounding box for HOI
61
+ input_box = {}
62
+ for hand_type in ['right', 'left']:
63
+ if hand_type not in hand_kpts:
64
+ continue
65
+ input_box[hand_type] = np.stack([hand_kpts[hand_type].min(axis=0), hand_kpts[hand_type].max(axis=0)])
66
+ box_trans = input_box[hand_type][0] * box_shift_ratio + input_box[hand_type][1] * (1 - box_shift_ratio)
67
+ input_box[hand_type] = ((input_box[hand_type] - box_trans) * box_size_factor + box_trans).reshape(-1)
68
+
69
+ if len(input_box) == 2:
70
+ input_box = merge_bounding_boxes(input_box['right'], input_box['left'])
71
+ input_point = np.array([hand_kpts['right'][0], hand_kpts['left'][0]])
72
+ input_label = np.array([1, 1])
73
+ elif 'right' in input_box:
74
+ input_box = input_box['right']
75
+ input_point = np.array([hand_kpts['right'][0]])
76
+ input_label = np.array([1])
77
+ elif 'left' in input_box:
78
+ input_box = input_box['left']
79
+ input_point = np.array([hand_kpts['left'][0]])
80
+ input_label = np.array([1])
81
+
82
+ box_area = (input_box[2] - input_box[0]) * (input_box[3] - input_box[1])
83
+
84
+ # segment hand using the wrist point
85
+ predictor.set_image(image)
86
+ if hand_mask is None:
87
+ masks, scores, logits = predictor.predict(
88
+ point_coords=input_point,
89
+ point_labels=input_label,
90
+ multimask_output=False,
91
+ )
92
+ hand_mask = masks[0]
93
+
94
+ # segment object in hand
95
+ input_label = np.zeros_like(input_label)
96
+ masks, scores, _ = predictor.predict(
97
+ point_coords=input_point,
98
+ point_labels=input_label,
99
+ box=input_box[None, :],
100
+ multimask_output=False,
101
+ )
102
+ object_mask = masks[0]
103
+
104
+ if (masks[0].astype(int) * hand_mask).sum() > overlap_threshold:
105
+ # print('False positive: The mask overlaps the hand.')
106
+ object_mask = np.zeros_like(object_mask)
107
+ elif object_mask.astype(int).sum() / box_area > area_threshold:
108
+ # print('False positive: The area is very big, probably the background')
109
+ object_mask = np.zeros_like(object_mask)
110
+
111
+ return object_mask, hand_mask
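A hedged usage sketch for the segmentation helpers above. The checkpoint path, image, and keypoints below are placeholders; only the call signature follows the code in this file.

# Hypothetical sketch: segment the hand and the in-hand object with SAM.
import numpy as np
from segment_hoi import init_sam, segment_hand_and_object

predictor = init_sam(device="cuda", ckpt_path="sam_vit_h_4b8939.pth")  # assumed local SAM checkpoint
image = np.zeros((512, 512, 3), dtype=np.uint8)                        # placeholder RGB frame
hand_kpts = {"right": np.random.rand(21, 2) * 512}                     # 21 (x, y) keypoints in pixels
obj_mask, hand_mask = segment_hand_and_object(predictor, image, hand_kpts)
# Both outputs are HxW masks; obj_mask is zeroed when the heuristics flag a likely false positive.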
utils.py ADDED
@@ -0,0 +1,289 @@
1
+ import io
2
+ import cv2
3
+ import numpy as np
4
+ from PIL import Image
5
+ from skimage.transform import resize
6
+ import matplotlib.pyplot as plt
7
+ from mpl_toolkits.mplot3d import Axes3D
8
+
9
+
10
+ def draw_hand3d(keypoints):
11
+ # Define the connections between keypoints as tuples (start, end)
12
+ bones = [
13
+ ((0, 1), 'red'), ((1, 2), 'green'), ((2, 3), 'blue'), ((3, 4), 'purple'),
14
+ ((0, 5), 'orange'), ((5, 6), 'pink'), ((6, 7), 'brown'), ((7, 8), 'cyan'),
15
+ ((0, 9), 'yellow'), ((9, 10), 'magenta'), ((10, 11), 'lime'), ((11, 12), 'blueviolet'),
16
+ ((0, 13), 'olive'), ((13, 14), 'teal'), ((14, 15), 'crimson'), ((15, 16), 'cornsilk'),
17
+ ((0, 17), 'aqua'), ((17, 18), 'silver'), ((18, 19), 'maroon'), ((19, 20), 'fuchsia')
18
+ ]
19
+
20
+ fig = plt.figure()
21
+ ax = fig.add_subplot(111, projection='3d')
22
+
23
+ # Plot the bones
24
+ for bone, color in bones:
25
+ start_point = keypoints[bone[0], :]
26
+ end_point = keypoints[bone[1], :]
27
+
28
+ ax.plot([start_point[0], end_point[0]],
29
+ [start_point[1], end_point[1]],
30
+ [start_point[2], end_point[2]], color=color)
31
+
32
+ ax.scatter(keypoints[:, 0], keypoints[:, 1], keypoints[:, 2], color='gray', s=15)
33
+
34
+ # Set the aspect ratio to be equal
35
+ max_range = np.array([keypoints[:,0].max()-keypoints[:,0].min(),
36
+ keypoints[:,1].max()-keypoints[:,1].min(),
37
+ keypoints[:,2].max()-keypoints[:,2].min()]).max() / 2.0
38
+
39
+ mid_x = (keypoints[:,0].max()+keypoints[:,0].min()) * 0.5
40
+ mid_y = (keypoints[:,1].max()+keypoints[:,1].min()) * 0.5
41
+ mid_z = (keypoints[:,2].max()+keypoints[:,2].min()) * 0.5
42
+
43
+ ax.set_xlim(mid_x - max_range, mid_x + max_range)
44
+ ax.set_ylim(mid_y - max_range, mid_y + max_range)
45
+ ax.set_zlim(mid_z - max_range, mid_z + max_range)
46
+
47
+ # Set labels for axes
48
+ ax.set_xlabel('X')
49
+ ax.set_ylabel('Y')
50
+ ax.set_zlabel('Z')
51
+
52
+ plt.show()
53
+
54
+
55
+ def visualize_hand(joints, img):
56
+ # Define the connections between joints for drawing lines and their corresponding colors
57
+ connections = [
58
+ ((0, 1), 'red'), ((1, 2), 'green'), ((2, 3), 'blue'), ((3, 4), 'purple'),
59
+ ((0, 5), 'orange'), ((5, 6), 'pink'), ((6, 7), 'brown'), ((7, 8), 'cyan'),
60
+ ((0, 9), 'yellow'), ((9, 10), 'magenta'), ((10, 11), 'lime'), ((11, 12), 'indigo'),
61
+ ((0, 13), 'olive'), ((13, 14), 'teal'), ((14, 15), 'navy'), ((15, 16), 'gray'),
62
+ ((0, 17), 'lavender'), ((17, 18), 'silver'), ((18, 19), 'maroon'), ((19, 20), 'fuchsia')
63
+ ]
64
+ H, W, C = img.shape
65
+
66
+ # Create a figure and axis
67
+ plt.figure()
68
+ ax = plt.gca()
69
+ # Plot joints as points
70
+ ax.imshow(img)
71
+ ax.scatter(joints[:, 0], joints[:, 1], color='white', s=15)
72
+ # Plot lines connecting joints with different colors for each bone
73
+ for connection, color in connections:
74
+ joint1 = joints[connection[0]]
75
+ joint2 = joints[connection[1]]
76
+ ax.plot([joint1[0], joint2[0]], [joint1[1], joint2[1]], color=color)
77
+
78
+ ax.set_xlim([0, W])
79
+ ax.set_ylim([0, H])
80
+ ax.grid(False)
81
+ ax.set_axis_off()
82
+ ax.invert_yaxis()
83
+ plt.subplots_adjust(wspace=0.01)
84
+ plt.show()
85
+
86
+
87
+ def draw_hand_skeleton(joints, image_size, thickness=5):
88
+ # Create a blank white image
89
+ image = np.zeros((image_size[0], image_size[1]), dtype=np.uint8)
90
+
91
+ # Define the connections between joints
92
+ connections = [
93
+ (0, 1),
94
+ (1, 2),
95
+ (2, 3),
96
+ (3, 4),
97
+ (0, 5),
98
+ (5, 6),
99
+ (6, 7),
100
+ (7, 8),
101
+ (0, 9),
102
+ (9, 10),
103
+ (10, 11),
104
+ (11, 12),
105
+ (0, 13),
106
+ (13, 14),
107
+ (14, 15),
108
+ (15, 16),
109
+ (0, 17),
110
+ (17, 18),
111
+ (18, 19),
112
+ (19, 20),
113
+ ]
114
+
115
+ # Draw lines connecting joints
116
+ for connection in connections:
117
+ joint1 = joints[connection[0]].astype("int")
118
+ joint2 = joints[connection[1]].astype("int")
119
+ cv2.line(image, tuple(joint1), tuple(joint2), color=1, thickness=thickness)
120
+
121
+ return image
122
+
123
+
124
+ def draw_hand(joints, img):
125
+ # Define the connections between joints for drawing lines and their corresponding colors
126
+ connections = [
127
+ ((0, 1), 'red'), ((1, 2), 'green'), ((2, 3), 'blue'), ((3, 4), 'purple'),
128
+ ((0, 5), 'orange'), ((5, 6), 'pink'), ((6, 7), 'brown'), ((7, 8), 'cyan'),
129
+ ((0, 9), 'yellow'), ((9, 10), 'magenta'), ((10, 11), 'lime'), ((11, 12), 'indigo'),
130
+ ((0, 13), 'olive'), ((13, 14), 'teal'), ((14, 15), 'navy'), ((15, 16), 'gray'),
131
+ ((0, 17), 'lavender'), ((17, 18), 'silver'), ((18, 19), 'maroon'), ((19, 20), 'fuchsia')
132
+ ]
133
+ H, W, C = img.shape
134
+
135
+ # Create a figure and axis with the same size as the input image
136
+ fig, ax = plt.subplots(figsize=(W / 100, H / 100), dpi=100)
137
+ # Plot joints as points
138
+ ax.imshow(img)
139
+ ax.scatter(joints[:, 0], joints[:, 1], color='white', s=15)
140
+ # Plot lines connecting joints with different colors for each bone
141
+ for connection, color in connections:
142
+ joint1 = joints[connection[0]]
143
+ joint2 = joints[connection[1]]
144
+ ax.plot([joint1[0], joint2[0]], [joint1[1], joint2[1]], color=color)
145
+
146
+ ax.set_xlim([0, W])
147
+ ax.set_ylim([0, H])
148
+ ax.grid(False)
149
+ ax.set_axis_off()
150
+ ax.invert_yaxis()
151
+ plt.subplots_adjust(left=0, right=1, top=1, bottom=0, wspace=0.01, hspace=0.01)
152
+
153
+ # Save the plot to a buffer
154
+ buf = io.BytesIO()
155
+ plt.savefig(buf, format='png', bbox_inches='tight', pad_inches=0)
156
+ plt.close(fig) # Close the figure to free memory
157
+
158
+ # Load the image from the buffer into a PIL image and then into a numpy array
159
+ buf.seek(0)
160
+ img_arr = np.array(Image.open(buf))
161
+
162
+ return img_arr[..., :3]
163
+
164
+
165
+ def keypoint_heatmap(pts, size, var=1.0):
166
+ H, W = size
167
+ x = np.linspace(0, W - 1, W)
168
+ y = np.linspace(0, H - 1, H)
169
+ xv, yv = np.meshgrid(x, y)
170
+ grid = np.stack((xv, yv), axis=-1)
171
+
172
+ # Expanding dims for broadcasting subtraction between pts and every grid position
173
+ modes_exp = np.expand_dims(np.expand_dims(pts, axis=1), axis=1)
174
+
175
+ # Calculating squared difference
176
+ diff = grid - modes_exp
177
+ normal = np.exp(-np.sum(diff**2, axis=-1) / (2 * var)) / (
178
+ 2.0 * np.pi * var
179
+ )
180
+ return normal
181
+
182
+
183
+ def check_keypoints_validity(keypoints, image_size):
184
+ H, W = image_size
185
+ # Check if x coordinates are valid: 0 < x < W
186
+ valid_x = (keypoints[:, 0] > 0) & (keypoints[:, 0] < W)
187
+
188
+ # Check if y coordinates are valid: 0 < y < H
189
+ valid_y = (keypoints[:, 1] > 0) & (keypoints[:, 1] < H)
190
+
191
+ # Combine the validity checks for both x and y
192
+ valid_keypoints = valid_x & valid_y
193
+
194
+ # Convert boolean array to integer (1 for True, 0 for False)
195
+ return valid_keypoints.astype(int)
196
+
197
+
198
+ def find_bounding_box(mask, margin=30):
199
+ """Find the bounding box of a binary mask. Return None if the mask is empty."""
200
+ rows = np.any(mask, axis=1)
201
+ cols = np.any(mask, axis=0)
202
+ if not rows.any() or not cols.any(): # Mask is empty
203
+ return None
204
+ ymin, ymax = np.where(rows)[0][[0, -1]]
205
+ xmin, xmax = np.where(cols)[0][[0, -1]]
206
+ xmin -= margin
207
+ xmax += margin
208
+ ymin -= margin
209
+ ymax += margin
210
+ return xmin, ymin, xmax, ymax
211
+
212
+
213
+ def adjust_box_to_image(xmin, ymin, xmax, ymax, image_width, image_height):
214
+ """Adjust the bounding box to fit within the image boundaries."""
215
+ box_width = xmax - xmin
216
+ box_height = ymax - ymin
217
+ # Determine the side length of the square (the larger of the two dimensions)
218
+ side_length = max(box_width, box_height)
219
+
220
+ # Adjust to maintain a square by expanding or contracting sides
221
+ xmin = max(0, xmin - (side_length - box_width) // 2)
222
+ xmax = xmin + side_length
223
+ ymin = max(0, ymin - (side_length - box_height) // 2)
224
+ ymax = ymin + side_length
225
+
226
+ # Ensure the box is still within the image boundaries after adjustments
227
+ if xmax > image_width:
228
+ shift = xmax - image_width
229
+ xmin -= shift
230
+ xmax -= shift
231
+ if ymax > image_height:
232
+ shift = ymax - image_height
233
+ ymin -= shift
234
+ ymax -= shift
235
+
236
+ # After shifting, double-check if any side is out-of-bounds and adjust if necessary
237
+ xmin = max(0, xmin)
238
+ ymin = max(0, ymin)
239
+ xmax = min(image_width, xmax)
240
+ ymax = min(image_height, ymax)
241
+
242
+ # It's possible the adjustments made the box not square (due to boundary constraints),
243
+ # so we might need to slightly adjust the size to keep it as square as possible
244
+ # This could involve a final adjustment based on the specific requirements,
245
+ # like reducing the side length to fit or deciding which dimension to prioritize.
246
+
247
+ return xmin, ymin, xmax, ymax
248
+
249
+
250
+ def scale_keypoint(keypoint, original_size, target_size):
251
+ """Scale a keypoint based on the resizing of the image."""
252
+ keypoint_copy = keypoint.copy()
253
+ keypoint_copy[:, 0] *= target_size[0] / original_size[0]
254
+ keypoint_copy[:, 1] *= target_size[1] / original_size[1]
255
+ return keypoint_copy
256
+
257
+
258
+ def crop_and_adjust_image_and_annotations(image, hand_mask, obj_mask, hand_pose, intrinsics, target_size=(512, 512)):
259
+ # Find bounding boxes for each mask, handling potentially empty masks
260
+ xmin, ymin, xmax, ymax = find_bounding_box(hand_mask) if np.any(hand_mask) else None
261
+
262
+ # Adjust bounding box to fit within the image and be square
263
+ xmin, ymin, xmax, ymax = adjust_box_to_image(xmin, ymin, xmax, ymax, image.shape[1], image.shape[0])
264
+
265
+ # Crop the image and mask
266
+ # masked_hand_image = (image * np.maximum(hand_mask, obj_mask)[..., None].astype(float)).astype(np.uint8)
267
+ cropped_hand_image = image[ymin:ymax, xmin:xmax]
268
+ cropped_hand_mask = hand_mask[ymin:ymax, xmin:xmax].astype(np.uint8)
269
+ cropped_obj_mask = obj_mask[ymin:ymax, xmin:xmax].astype(np.uint8)
270
+
271
+ # Resize the image
272
+ resized_image = resize(cropped_hand_image, target_size, anti_aliasing=True)
273
+ resized_hand_mask = cv2.resize(cropped_hand_mask, dsize=target_size, interpolation=cv2.INTER_NEAREST)
274
+ resized_obj_mask = cv2.resize(cropped_obj_mask, dsize=target_size, interpolation=cv2.INTER_NEAREST)
275
+
276
+ # adjust and scale 2d keypoints
277
+ for hand_type, kps2d in hand_pose.items():
278
+ kps2d[:, 0] -= xmin
279
+ kps2d[:, 1] -= ymin
280
+ hand_pose[hand_type] = scale_keypoint(kps2d, (xmax - xmin, ymax - ymin), target_size)
281
+
282
+ # adjust instrinsics
283
+ resized_intrinsics= np.array(intrinsics, copy=True)
284
+ resized_intrinsics[0, 2] -= xmin
285
+ resized_intrinsics[1, 2] -= ymin
286
+ resized_intrinsics[0, :] *= target_size[0] / (xmax - xmin)
287
+ resized_intrinsics[1, :] *= target_size[1] / (ymax - ymin)
288
+
289
+ return (resized_image, resized_hand_mask, resized_obj_mask, hand_pose, resized_intrinsics)
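A small sketch of the keypoint helpers above. The shapes follow the code in this file; the random keypoints and image size are placeholders.

# Hypothetical sketch: Gaussian heatmaps and a validity mask for 21 hand joints.
import numpy as np
from utils import keypoint_heatmap, check_keypoints_validity

kpts = np.random.rand(21, 2) * 256                              # (x, y) pixel coordinates
heatmaps = keypoint_heatmap(kpts, size=(256, 256), var=2.0)     # (21, 256, 256), one Gaussian per joint
valid = check_keypoints_validity(kpts, image_size=(256, 256))   # (21,) ints, 1 = inside the image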
vit.py ADDED
@@ -0,0 +1,323 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ import math
5
+ from timm.models.vision_transformer import PatchEmbed, Attention, Mlp
6
+
7
+
8
+ def modulate(x, shift, scale):
9
+ return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
10
+
11
+
12
+ #################################################################################
13
+ # Embedding Layers for Timesteps and Class Labels #
14
+ #################################################################################
15
+
16
+ class TimestepEmbedder(nn.Module):
17
+ """
18
+ Embeds scalar timesteps into vector representations.
19
+ """
20
+ def __init__(self, hidden_size, frequency_embedding_size=256):
21
+ super().__init__()
22
+ self.mlp = nn.Sequential(
23
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
24
+ nn.SiLU(),
25
+ nn.Linear(hidden_size, hidden_size, bias=True),
26
+ )
27
+ self.frequency_embedding_size = frequency_embedding_size
28
+
29
+ @staticmethod
30
+ def timestep_embedding(t, dim, max_period=10000):
31
+ """
32
+ Create sinusoidal timestep embeddings.
33
+ :param t: a 1-D Tensor of N indices, one per batch element.
34
+ These may be fractional.
35
+ :param dim: the dimension of the output.
36
+ :param max_period: controls the minimum frequency of the embeddings.
37
+ :return: an (N, D) Tensor of positional embeddings.
38
+ """
39
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
40
+ half = dim // 2
41
+ freqs = torch.exp(
42
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
43
+ ).to(device=t.device)
44
+ args = t[:, None].float() * freqs[None]
45
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
46
+ if dim % 2:
47
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
48
+ return embedding
49
+
50
+ def forward(self, t):
51
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
52
+ t_emb = self.mlp(t_freq)
53
+ return t_emb
54
+
55
+
56
+ class LabelEmbedder(nn.Module):
57
+ """
58
+ Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
59
+ """
60
+ def __init__(self, num_classes, hidden_size, dropout_prob):
61
+ super().__init__()
62
+ use_cfg_embedding = dropout_prob > 0
63
+ self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
64
+ self.num_classes = num_classes
65
+ self.dropout_prob = dropout_prob
66
+
67
+ def token_drop(self, labels, force_drop_ids=None):
68
+ """
69
+ Drops labels to enable classifier-free guidance.
70
+ """
71
+ if force_drop_ids is None:
72
+ drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
73
+ else:
74
+ drop_ids = force_drop_ids == 1
75
+ labels = torch.where(drop_ids, self.num_classes, labels)
76
+ return labels
77
+
78
+ def forward(self, labels, train, force_drop_ids=None):
79
+ use_dropout = self.dropout_prob > 0
80
+ if (train and use_dropout) or (force_drop_ids is not None):
81
+ labels = self.token_drop(labels, force_drop_ids)
82
+ embeddings = self.embedding_table(labels)
83
+ return embeddings
84
+
85
+
86
+ class DiTBlock(nn.Module):
87
+ """
88
+ A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.
89
+ """
90
+ def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
91
+ super().__init__()
92
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
93
+ self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
94
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
95
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
96
+ approx_gelu = lambda: nn.GELU(approximate="tanh")
97
+ self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
98
+ self.adaLN_modulation = nn.Sequential(
99
+ nn.SiLU(),
100
+ nn.Linear(hidden_size, 6 * hidden_size, bias=True)
101
+ )
102
+
103
+ def forward(self, x, c):
104
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
105
+ x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
106
+ x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
107
+ return x
108
+
109
+
110
+ class FinalLayer(nn.Module):
111
+ """
112
+ The final layer of DiT.
113
+ """
114
+ def __init__(self, hidden_size, patch_size, out_channels):
115
+ super().__init__()
116
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
117
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
118
+ self.adaLN_modulation = nn.Sequential(
119
+ nn.SiLU(),
120
+ nn.Linear(hidden_size, 2 * hidden_size, bias=True)
121
+ )
122
+
123
+ def forward(self, x, c):
124
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
125
+ x = modulate(self.norm_final(x), shift, scale)
126
+ x = self.linear(x)
127
+ return x
128
+
129
+
130
+ class DiT(nn.Module):
131
+ """
132
+ Diffusion model with a Transformer backbone.
133
+ """
134
+ def __init__(
135
+ self,
136
+ input_size=32,
137
+ patch_size=2,
138
+ latent_dim=4,
139
+ in_channels=47,
140
+ hidden_size=1152,
141
+ depth=28,
142
+ num_heads=16,
143
+ mlp_ratio=4.0,
144
+ learn_sigma=True,
145
+ ):
146
+ super().__init__()
147
+ self.learn_sigma = learn_sigma
148
+ self.in_channels = in_channels
149
+ self.out_channels = latent_dim * 2 if learn_sigma else latent_dim
150
+ self.patch_size = patch_size
151
+ self.num_heads = num_heads
152
+
153
+ #self.x_embedder = PatchEmbed(input_size, patch_size, latent_dim, hidden_size, bias=True)
154
+ self.feature_aligned_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
155
+
156
+ self.n_patches = self.feature_aligned_embedder.num_patches
157
+ self.patch_size = self.feature_aligned_embedder.patch_size[0]
158
+
159
+ self.t_embedder = TimestepEmbedder(hidden_size)
160
+ self.nvs_label_embedder = LabelEmbedder(3, hidden_size, 0.)
161
+ self.pos_embed = nn.Parameter(torch.zeros(1, 2 * self.n_patches, hidden_size), requires_grad=True)
162
+ self.y_embedder = LabelEmbedder(num_classes=1000, hidden_size=hidden_size, dropout_prob=0.1)
163
+
164
+ self.blocks = nn.ModuleList([
165
+ DiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)
166
+ ])
167
+ self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
168
+ self.initialize_weights()
169
+
170
+ def initialize_weights(self):
171
+ # Initialize transformer layers:
172
+ def _basic_init(module):
173
+ if isinstance(module, nn.Linear):
174
+ torch.nn.init.xavier_uniform_(module.weight)
175
+ if module.bias is not None:
176
+ nn.init.constant_(module.bias, 0)
177
+ self.apply(_basic_init)
178
+
179
+ # Initialize (and freeze) pos_embed by sin-cos embedding:
180
+ grid_size = int(self.n_patches ** 0.5)
181
+ pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], (2 * grid_size, grid_size))
182
+ self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
183
+
184
+ # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
185
+ #w = self.x_embedder.proj.weight.data
186
+ #nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
187
+ #nn.init.constant_(self.x_embedder.proj.bias, 0)
188
+
189
+ w = self.feature_aligned_embedder.proj.weight.data
190
+ nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
191
+ nn.init.constant_(self.feature_aligned_embedder.proj.bias, 0)
192
+
193
+ # Initialize label embedding table:
194
+ nn.init.normal_(self.nvs_label_embedder.embedding_table.weight, std=0.02)
195
+ nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)
196
+
197
+ # Initialize timestep embedding MLP:
198
+ nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
199
+ nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
200
+
201
+ # Zero-out adaLN modulation layers in DiT blocks:
202
+ for block in self.blocks:
203
+ nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
204
+ nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
205
+
206
+ # Zero-out output layers:
207
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
208
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
209
+ nn.init.constant_(self.final_layer.linear.weight, 0)
210
+ nn.init.constant_(self.final_layer.linear.bias, 0)
211
+
212
+ def unpatchify(self, x):
213
+ """
214
+ x: (N, T, patch_size**2 * C)
215
+ imgs: (N, C, H, W)
216
+ """
217
+ c = self.out_channels
218
+ p = self.patch_size
219
+ h = w = int(x.shape[1] ** 0.5)
220
+ assert h * w == x.shape[1]
221
+
222
+ x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
223
+ x = torch.einsum('nhwpqc->nchpwq', x)
224
+ imgs = x.reshape(shape=(x.shape[0], c, h * p, h * p))
225
+ return imgs
226
+
227
+ def forward(self, x_t, t, target_cond, ref_cond, nvs, y=None):
228
+ """
229
+ Forward pass of DiT.
230
+ x_t: (N, C1, H, W) denoising latent, concatenated channel-wise with target_cond (target pose control)
231
+ ref_cond: (N, C2, H, W) reference latent + reference pose control + mask
232
+ t: (N,) tensor of diffusion timesteps
233
+ y: (N,) tensor of class labels
234
+ """
235
+ x = self.feature_aligned_embedder(torch.concat([x_t, target_cond], 1)) + self.pos_embed[:, :self.n_patches]
236
+ cond = self.feature_aligned_embedder(ref_cond) + self.pos_embed[:, self.n_patches:]
237
+ x = torch.concatenate([x, cond], 1)
238
+
239
+ t = self.t_embedder(t) # (N, D)
240
+ nvs = self.nvs_label_embedder(nvs, False)
241
+ if y is None:
242
+ y = torch.tensor([1000] * x.shape[0], device=x.device)
243
+ y = self.y_embedder(y, False) # (N, D)
244
+ c = t + y + nvs # (N, D)
245
+ for block in self.blocks:
246
+ x = block(x, c) # (N, 2T, D)
247
+ x = x[:, :x.shape[1]//2]
248
+ x = self.final_layer(x, c) # (N, T, patch_size ** 2 * out_channels)
249
+ x = self.unpatchify(x) # (N, out_channels, H, W)
250
+ return x
251
+
252
+ def forward_with_cfg(self, x, t, target_cond, ref_cond, nvs, cfg_scale):
253
+ half = x[: len(x) // 2]
254
+ combined = torch.cat([half, half], dim=0)
255
+ y_null = torch.tensor([1000] * half.shape[0], device=x.device)
256
+ y = torch.cat([y_null, y_null], 0)
257
+ model_out = self.forward(combined, t, target_cond, ref_cond, nvs, y)
258
+ eps, rest = model_out[:, :3], model_out[:, 3:]
259
+ cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
260
+ half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
261
+ eps = torch.cat([half_eps, half_eps], dim=0)
262
+ return torch.cat([eps, rest], dim=1)
263
+
264
+ #################################################################################
265
+ # Sine/Cosine Positional Embedding Functions #
266
+ #################################################################################
267
+ # https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
268
+
269
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
270
+ """
271
+ grid_size: int of the grid height and width
272
+ return:
273
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
274
+ """
275
+ grid_h = np.arange(grid_size[0], dtype=np.float32)
276
+ grid_w = np.arange(grid_size[1], dtype=np.float32)
277
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
278
+ grid = np.stack(grid, axis=0)
279
+
280
+ grid = grid.reshape([2, 1, grid_size[0], grid_size[1]])
281
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
282
+ if cls_token and extra_tokens > 0:
283
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
284
+ return pos_embed
285
+
286
+
287
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
288
+ assert embed_dim % 2 == 0
289
+
290
+ # use half of dimensions to encode grid_h
291
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
292
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
293
+
294
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
295
+ return emb
296
+
297
+
298
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
299
+ """
300
+ embed_dim: output dimension for each position
301
+ pos: a list of positions to be encoded: size (M,)
302
+ out: (M, D)
303
+ """
304
+ assert embed_dim % 2 == 0
305
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
306
+ omega /= embed_dim / 2.
307
+ omega = 1. / 10000**omega # (D/2,)
308
+
309
+ pos = pos.reshape(-1) # (M,)
310
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
311
+
312
+ emb_sin = np.sin(out) # (M, D/2)
313
+ emb_cos = np.cos(out) # (M, D/2)
314
+
315
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
316
+ return emb
317
+
318
+
319
+ def DiT_XL_2(**kwargs):
320
+ return DiT(depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs)
321
+
322
+ def DiT_L_2(**kwargs):
323
+ return DiT(depth=24, hidden_size=1024, patch_size=2, num_heads=16, **kwargs)
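A shape-check sketch for the DiT above. The channel split between the noisy latent and the pose control is an assumption chosen to match the defaults (latent_dim=4, in_channels=47), and the tiny hidden size and depth are only for a quick smoke test, not the published configuration.

# Hypothetical sketch: one forward pass through a small DiT configuration.
import torch
from vit import DiT

model = DiT(input_size=32, patch_size=2, latent_dim=4, in_channels=47,
            hidden_size=64, depth=2, num_heads=4, learn_sigma=True)
x_t = torch.randn(2, 4, 32, 32)           # noisy latent (latent_dim channels)
target_cond = torch.randn(2, 43, 32, 32)  # target pose control; 4 + 43 = in_channels
ref_cond = torch.randn(2, 47, 32, 32)     # reference latent + pose control + mask
t = torch.randint(0, 1000, (2,))
nvs = torch.zeros(2, dtype=torch.long)    # view label in {0, 1, 2}
out = model(x_t, t, target_cond, ref_cond, nvs)
print(out.shape)                          # torch.Size([2, 8, 32, 32]); learn_sigma doubles the output channels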
vqvae.py ADDED
@@ -0,0 +1,507 @@
1
+ """
2
+ ---
3
+ title: Autoencoder for Stable Diffusion
4
+ summary: >
5
+ Annotated PyTorch implementation/tutorial of the autoencoder
6
+ for stable diffusion.
7
+ ---
8
+
9
+ # Autoencoder for [Stable Diffusion](../index.html)
10
+
11
+ This implements the auto-encoder model used to map between image space and latent space.
12
+
13
+ We have kept to the model definition and naming unchanged from
14
+ [CompVis/stable-diffusion](https://github.com/CompVis/stable-diffusion)
15
+ so that we can load the checkpoints directly.
16
+ """
17
+
18
+ from typing import List
19
+
20
+ import torch
21
+ import torch.nn.functional as F
22
+ from torch import nn
23
+
24
+
25
+ class Autoencoder(nn.Module):
26
+ """
27
+ ## Autoencoder
28
+
29
+ This consists of the encoder and decoder modules.
30
+ """
31
+
32
+ def __init__(
33
+ self, encoder: "Encoder", decoder: "Decoder", emb_channels: int, z_channels: int
34
+ ):
35
+ """
36
+ :param encoder: is the encoder
37
+ :param decoder: is the decoder
38
+ :param emb_channels: is the number of dimensions in the quantized embedding space
39
+ :param z_channels: is the number of channels in the embedding space
40
+ """
41
+ super().__init__()
42
+ self.encoder = encoder
43
+ self.decoder = decoder
44
+ # Convolution to map from embedding space to
45
+ # quantized embedding space moments (mean and log variance)
46
+ self.quant_conv = nn.Conv2d(2 * z_channels, 2 * emb_channels, 1)
47
+ # Convolution to map from quantized embedding space back to
48
+ # embedding space
49
+ self.post_quant_conv = nn.Conv2d(emb_channels, z_channels, 1)
50
+
51
+ def encode(self, img: torch.Tensor) -> "GaussianDistribution":
52
+ """
53
+ ### Encode images to latent representation
54
+
55
+ :param img: is the image tensor with shape `[batch_size, img_channels, img_height, img_width]`
56
+ """
57
+ # Get embeddings with shape `[batch_size, z_channels * 2, z_height, z_height]`
58
+ z = self.encoder(img)
59
+ # Get the moments in the quantized embedding space
60
+ moments = self.quant_conv(z)
61
+ # Return the distribution
62
+ return GaussianDistribution(moments)
63
+
64
+ def decode(self, z: torch.Tensor):
65
+ """
66
+ ### Decode images from latent representation
67
+
68
+ :param z: is the latent representation with shape `[batch_size, emb_channels, z_height, z_height]`
69
+ """
70
+ # Map to embedding space from the quantized representation
71
+ z = self.post_quant_conv(z)
72
+ # Decode the image of shape `[batch_size, channels, height, width]`
73
+ return self.decoder(z)
74
+
75
+ def forward(self, x):
76
+ posterior = self.encode(x)
77
+ z = posterior.sample()
78
+ dec = self.decode(z)
79
+ return dec, posterior
80
+
81
+
82
+ class Encoder(nn.Module):
83
+ """
84
+ ## Encoder module
85
+ """
86
+
87
+ def __init__(
88
+ self,
89
+ *,
90
+ channels: int,
91
+ channel_multipliers: List[int],
92
+ n_resnet_blocks: int,
93
+ in_channels: int,
94
+ z_channels: int
95
+ ):
96
+ """
97
+ :param channels: is the number of channels in the first convolution layer
98
+ :param channel_multipliers: are the multiplicative factors for the number of channels in the
99
+ subsequent blocks
100
+ :param n_resnet_blocks: is the number of resnet layers at each resolution
101
+ :param in_channels: is the number of channels in the image
102
+ :param z_channels: is the number of channels in the embedding space
103
+ """
104
+ super().__init__()
105
+
106
+ # Number of blocks of different resolutions.
107
+ # The resolution is halved at the end each top level block
108
+ n_resolutions = len(channel_multipliers)
109
+
110
+ # Initial $3 \times 3$ convolution layer that maps the image to `channels`
111
+ self.conv_in = nn.Conv2d(in_channels, channels, 3, stride=1, padding=1)
112
+
113
+ # Number of channels in each top level block
114
+ channels_list = [m * channels for m in [1] + channel_multipliers]
115
+
116
+ # List of top-level blocks
117
+ self.down = nn.ModuleList()
118
+ # Create top-level blocks
119
+ for i in range(n_resolutions):
120
+ # Each top level block consists of multiple ResNet Blocks and down-sampling
121
+ resnet_blocks = nn.ModuleList()
122
+ # Add ResNet Blocks
123
+ for _ in range(n_resnet_blocks):
124
+ resnet_blocks.append(ResnetBlock(channels, channels_list[i + 1]))
125
+ channels = channels_list[i + 1]
126
+ # Top-level block
127
+ down = nn.Module()
128
+ down.block = resnet_blocks
129
+ # Down-sampling at the end of each top level block except the last
130
+ if i != n_resolutions - 1:
131
+ down.downsample = DownSample(channels)
132
+ else:
133
+ down.downsample = nn.Identity()
134
+ #
135
+ self.down.append(down)
136
+
137
+ # Final ResNet blocks with attention
138
+ self.mid = nn.Module()
139
+ self.mid.block_1 = ResnetBlock(channels, channels)
140
+ self.mid.attn_1 = AttnBlock(channels)
141
+ self.mid.block_2 = ResnetBlock(channels, channels)
142
+
143
+ # Map to embedding space with a $3 \times 3$ convolution
144
+ self.norm_out = normalization(channels)
145
+ self.conv_out = nn.Conv2d(channels, 2 * z_channels, 3, stride=1, padding=1)
146
+
147
+ def forward(self, img: torch.Tensor):
148
+ """
149
+ :param img: is the image tensor with shape `[batch_size, img_channels, img_height, img_width]`
150
+ """
151
+
152
+ # Map to `channels` with the initial convolution
153
+ x = self.conv_in(img)
154
+
155
+ # Top-level blocks
156
+ for down in self.down:
157
+ # ResNet Blocks
158
+ for block in down.block:
159
+ x = block(x)
160
+ # Down-sampling
161
+ x = down.downsample(x)
162
+
163
+ # Final ResNet blocks with attention
164
+ x = self.mid.block_1(x)
165
+ x = self.mid.attn_1(x)
166
+ x = self.mid.block_2(x)
167
+
168
+ # Normalize and map to embedding space
169
+ x = self.norm_out(x)
170
+ x = swish(x)
171
+ x = self.conv_out(x)
172
+
173
+ #
174
+ return x
175
+
176
+
177
+ class Decoder(nn.Module):
178
+ """
179
+ ## Decoder module
180
+ """
181
+
182
+ def __init__(
183
+ self,
184
+ *,
185
+ channels: int,
186
+ channel_multipliers: List[int],
187
+ n_resnet_blocks: int,
188
+ out_channels: int,
189
+ z_channels: int
190
+ ):
191
+ """
192
+ :param channels: is the number of channels in the final convolution layer
193
+ :param channel_multipliers: are the multiplicative factors for the number of channels in the
194
+ previous blocks, in reverse order
195
+ :param n_resnet_blocks: is the number of resnet layers at each resolution
196
+ :param out_channels: is the number of channels in the image
197
+ :param z_channels: is the number of channels in the embedding space
198
+ """
199
+ super().__init__()
200
+
201
+ # Number of blocks of different resolutions.
202
+ # The resolution is halved at the end each top level block
203
+ num_resolutions = len(channel_multipliers)
204
+
205
+ # Number of channels in each top level block, in the reverse order
206
+ channels_list = [m * channels for m in channel_multipliers]
207
+
208
+ # Number of channels in the top-level block
209
+ channels = channels_list[-1]
210
+
211
+ # Initial $3 \times 3$ convolution layer that maps the embedding space to `channels`
212
+ self.conv_in = nn.Conv2d(z_channels, channels, 3, stride=1, padding=1)
213
+
214
+ # ResNet blocks with attention
215
+ self.mid = nn.Module()
216
+ self.mid.block_1 = ResnetBlock(channels, channels)
217
+ self.mid.attn_1 = AttnBlock(channels)
218
+ self.mid.block_2 = ResnetBlock(channels, channels)
219
+
220
+ # List of top-level blocks
221
+ self.up = nn.ModuleList()
222
+ # Create top-level blocks
223
+ for i in reversed(range(num_resolutions)):
224
+ # Each top level block consists of multiple ResNet Blocks and up-sampling
225
+ resnet_blocks = nn.ModuleList()
226
+ # Add ResNet Blocks
227
+ for _ in range(n_resnet_blocks + 1):
228
+ resnet_blocks.append(ResnetBlock(channels, channels_list[i]))
229
+ channels = channels_list[i]
230
+ # Top-level block
231
+ up = nn.Module()
232
+ up.block = resnet_blocks
233
+ # Up-sampling at the end of each top level block except the first
234
+ if i != 0:
235
+ up.upsample = UpSample(channels)
236
+ else:
237
+ up.upsample = nn.Identity()
238
+ # Prepend to be consistent with the checkpoint
239
+ self.up.insert(0, up)
240
+
241
+ # Map to image space with a $3 \times 3$ convolution
242
+ self.norm_out = normalization(channels)
243
+ self.conv_out = nn.Conv2d(channels, out_channels, 3, stride=1, padding=1)
244
+
245
+ def forward(self, z: torch.Tensor):
246
+ """
247
+ :param z: is the embedding tensor with shape `[batch_size, z_channels, z_height, z_height]`
248
+ """
249
+
250
+ # Map to `channels` with the initial convolution
251
+ h = self.conv_in(z)
252
+
253
+ # ResNet blocks with attention
254
+ h = self.mid.block_1(h)
255
+ h = self.mid.attn_1(h)
256
+ h = self.mid.block_2(h)
257
+
258
+ # Top-level blocks
259
+ for up in reversed(self.up):
260
+ # ResNet Blocks
261
+ for block in up.block:
262
+ h = block(h)
263
+ # Up-sampling
264
+ h = up.upsample(h)
265
+
266
+ # Normalize and map to image space
267
+ h = self.norm_out(h)
268
+ h = swish(h)
269
+ img = self.conv_out(h)
270
+
271
+ #
272
+ return img
273
+
274
+
275
+ class GaussianDistribution:
276
+ """
277
+ ## Gaussian Distribution
278
+ """
279
+
280
+ def __init__(self, parameters: torch.Tensor):
281
+ """
282
+ :param parameters: are the means and log of variances of the embedding of shape
283
+ `[batch_size, z_channels * 2, z_height, z_height]`
284
+ """
285
+ # Split mean and log of variance
286
+ self.mean, log_var = torch.chunk(parameters, 2, dim=1)
287
+ # Clamp the log of variances
288
+ self.log_var = torch.clamp(log_var, -30.0, 20.0)
289
+ # Calculate standard deviation
290
+ self.std = torch.exp(0.5 * self.log_var)
291
+ self.var = torch.exp(self.log_var)
292
+
293
+ def sample(self):
294
+ # Sample from the distribution
295
+ return self.mean + self.std * torch.randn_like(self.std)
296
+
297
+ def kl(self):
298
+ return 0.5 * torch.sum(
299
+ torch.pow(self.mean, 2) + self.var - 1.0 - self.log_var, dim=[1, 2, 3]
300
+ )
301
+
302
+
303
+ class AttnBlock(nn.Module):
304
+ """
305
+ ## Attention block
306
+ """
307
+
308
+ def __init__(self, channels: int):
309
+ """
310
+ :param channels: is the number of channels
311
+ """
312
+ super().__init__()
313
+ # Group normalization
314
+ self.norm = normalization(channels)
315
+ # Query, key and value mappings
316
+ self.q = nn.Conv2d(channels, channels, 1)
317
+ self.k = nn.Conv2d(channels, channels, 1)
318
+ self.v = nn.Conv2d(channels, channels, 1)
319
+ # Final $1 \times 1$ convolution layer
320
+ self.proj_out = nn.Conv2d(channels, channels, 1)
321
+ # Attention scaling factor
322
+ self.scale = channels**-0.5
323
+
324
+ def forward(self, x: torch.Tensor):
325
+ """
326
+ :param x: is the tensor of shape `[batch_size, channels, height, width]`
327
+ """
328
+ # Normalize `x`
329
+ x_norm = self.norm(x)
330
+ # Get query, key and value embeddings
331
+ q = self.q(x_norm)
332
+ k = self.k(x_norm)
333
+ v = self.v(x_norm)
334
+
335
+ # Reshape the query, key and value embeddings from
336
+ # `[batch_size, channels, height, width]` to
337
+ # `[batch_size, channels, height * width]`
338
+ b, c, h, w = q.shape
339
+ q = q.view(b, c, h * w)
340
+ k = k.view(b, c, h * w)
341
+ v = v.view(b, c, h * w)
342
+
343
+ # Compute $\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_{key}}}\Bigg)$
344
+ attn = torch.einsum("bci,bcj->bij", q, k) * self.scale
345
+ attn = F.softmax(attn, dim=2)
346
+
347
+ # Compute $\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_{key}}}\Bigg)V$
348
+ out = torch.einsum("bij,bcj->bci", attn, v)
349
+
350
+ # Reshape back to `[batch_size, channels, height, width]`
351
+ out = out.view(b, c, h, w)
352
+ # Final $1 \times 1$ convolution layer
353
+ out = self.proj_out(out)
354
+
355
+ # Add residual connection
356
+ return x + out
357
+
358
+
359
+ class UpSample(nn.Module):
360
+ """
361
+ ## Up-sampling layer
362
+ """
363
+
364
+ def __init__(self, channels: int):
365
+ """
366
+ :param channels: is the number of channels
367
+ """
368
+ super().__init__()
369
+ # $3 \times 3$ convolution mapping
370
+ self.conv = nn.Conv2d(channels, channels, 3, padding=1)
371
+
372
+ def forward(self, x: torch.Tensor):
373
+ """
374
+ :param x: is the input feature map with shape `[batch_size, channels, height, width]`
375
+ """
376
+ # Up-sample by a factor of $2$
377
+ x = F.interpolate(x, scale_factor=2.0, mode="nearest")
378
+ # Apply convolution
379
+ return self.conv(x)
380
+
381
+
382
+ class DownSample(nn.Module):
383
+ """
384
+ ## Down-sampling layer
385
+ """
386
+
387
+ def __init__(self, channels: int):
388
+ """
389
+ :param channels: is the number of channels
390
+ """
391
+ super().__init__()
392
+ # $3 \times 3$ convolution with stride length of $2$ to down-sample by a factor of $2$
393
+ self.conv = nn.Conv2d(channels, channels, 3, stride=2, padding=0)
394
+
395
+ def forward(self, x: torch.Tensor):
396
+ """
397
+ :param x: is the input feature map with shape `[batch_size, channels, height, width]`
398
+ """
399
+ # Add padding
400
+ x = F.pad(x, (0, 1, 0, 1), mode="constant", value=0)
401
+ # Apply convolution
402
+ return self.conv(x)
403
+
404
+
405
+ class ResnetBlock(nn.Module):
406
+ """
407
+ ## ResNet Block
408
+ """
409
+
410
+ def __init__(self, in_channels: int, out_channels: int):
411
+ """
412
+ :param in_channels: is the number of channels in the input
413
+ :param out_channels: is the number of channels in the output
414
+ """
415
+ super().__init__()
416
+ # First normalization and convolution layer
417
+ self.norm1 = normalization(in_channels)
418
+ self.conv1 = nn.Conv2d(in_channels, out_channels, 3, stride=1, padding=1)
419
+ # Second normalization and convolution layer
420
+ self.norm2 = normalization(out_channels)
421
+ self.conv2 = nn.Conv2d(out_channels, out_channels, 3, stride=1, padding=1)
422
+ # `in_channels` to `out_channels` mapping layer for residual connection
423
+ if in_channels != out_channels:
424
+ self.nin_shortcut = nn.Conv2d(
425
+ in_channels, out_channels, 1, stride=1, padding=0
426
+ )
427
+ else:
428
+ self.nin_shortcut = nn.Identity()
429
+
430
+ def forward(self, x: torch.Tensor):
431
+ """
432
+ :param x: is the input feature map with shape `[batch_size, channels, height, width]`
433
+ """
434
+
435
+ h = x
436
+
437
+ # First normalization and convolution layer
438
+ h = self.norm1(h)
439
+ h = swish(h)
440
+ h = self.conv1(h)
441
+
442
+ # Second normalization and convolution layer
443
+ h = self.norm2(h)
444
+ h = swish(h)
445
+ h = self.conv2(h)
446
+
447
+ # Map and add residual
448
+ return self.nin_shortcut(x) + h
449
+
450
+
451
+ def swish(x: torch.Tensor):
452
+ """
453
+ ### Swish activation
454
+
455
+ """
456
+ return x * torch.sigmoid(x)
457
+
458
+
459
+ def normalization(channels: int):
460
+ """
461
+ ### Group normalization
462
+
463
+ This is a helper function, with fixed number of groups and `eps`.
464
+ """
465
+ return nn.GroupNorm(num_groups=32, num_channels=channels, eps=1e-6)
466
+
467
+
468
+ def restore_ae_from_sd(model, path):
469
+
470
+ def remove_prefix(text, prefix):
471
+ if text.startswith(prefix):
472
+ return text[len(prefix) :]
473
+ return text
474
+
475
+ checkpoint = torch.load(path)
476
+ # checkpoint = torch.load(path, map_location="cpu")
477
+
478
+ ckpt_state_dict = checkpoint["state_dict"]
479
+ new_ckpt_state_dict = {}
480
+ for k, v in ckpt_state_dict.items():
481
+ new_k = remove_prefix(k, "first_stage_model.")
482
+ new_ckpt_state_dict[new_k] = v
483
+ missing_keys, extra_keys = model.load_state_dict(new_ckpt_state_dict, strict=False)
484
+ assert len(missing_keys) == 0
485
+
486
+
487
+ def create_model(in_channels, out_channels, latent_dim=4):
488
+ encoder = Encoder(
489
+ z_channels=latent_dim,
490
+ in_channels=in_channels,
491
+ channels=128,
492
+ channel_multipliers=[1, 2, 4, 4],
493
+ n_resnet_blocks=2,
494
+ )
495
+
496
+ decoder = Decoder(
497
+ out_channels=out_channels,
498
+ z_channels=latent_dim,
499
+ channels=128,
500
+ channel_multipliers=[1, 2, 4, 4],
501
+ n_resnet_blocks=2,
502
+ )
503
+
504
+ autoencoder = Autoencoder(
505
+ emb_channels=latent_dim, encoder=encoder, decoder=decoder, z_channels=latent_dim
506
+ )
507
+ return autoencoder
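A round-trip sketch for the autoencoder factory above. The weights are random here; in practice restore_ae_from_sd would load Stable Diffusion first-stage weights from a checkpoint. The input resolution is an assumption.

# Hypothetical sketch: encode an image to the 4-channel latent and decode it back.
import torch
from vqvae import create_model

ae = create_model(in_channels=3, out_channels=3, latent_dim=4)
img = torch.randn(1, 3, 256, 256)
posterior = ae.encode(img)     # GaussianDistribution over the latent
z = posterior.sample()         # (1, 4, 32, 32): 8x spatial downsampling
recon = ae.decode(z)           # (1, 3, 256, 256)
print(z.shape, recon.shape)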