|
import torch |
|
import nodes |
|
import comfy.utils |
|
|
|
def camera_embeddings(elevation, azimuth): |
|
elevation = torch.as_tensor([elevation]) |
|
azimuth = torch.as_tensor([azimuth]) |
|
embeddings = torch.stack( |
|
[ |
|
torch.deg2rad( |
|
(90 - elevation) - (90) |
|
), |
|
torch.sin(torch.deg2rad(azimuth)), |
|
torch.cos(torch.deg2rad(azimuth)), |
|
torch.deg2rad( |
|
90 - torch.full_like(elevation, 0) |
|
), |
|
], dim=-1).unsqueeze(1) |
|
|
|
return embeddings |
|
|
|
|
|
class StableZero123_Conditioning: |
|
@classmethod |
|
def INPUT_TYPES(s): |
|
return {"required": { "clip_vision": ("CLIP_VISION",), |
|
"init_image": ("IMAGE",), |
|
"vae": ("VAE",), |
|
"width": ("INT", {"default": 256, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), |
|
"height": ("INT", {"default": 256, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), |
|
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), |
|
"elevation": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), |
|
"azimuth": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), |
|
}} |
|
RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") |
|
RETURN_NAMES = ("positive", "negative", "latent") |
|
|
|
FUNCTION = "encode" |
|
|
|
CATEGORY = "conditioning/3d_models" |
|
|
|
def encode(self, clip_vision, init_image, vae, width, height, batch_size, elevation, azimuth): |
|
output = clip_vision.encode_image(init_image) |
|
pooled = output.image_embeds.unsqueeze(0) |
|
pixels = comfy.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1) |
|
encode_pixels = pixels[:,:,:,:3] |
|
t = vae.encode(encode_pixels) |
|
cam_embeds = camera_embeddings(elevation, azimuth) |
|
cond = torch.cat([pooled, cam_embeds.to(pooled.device).repeat((pooled.shape[0], 1, 1))], dim=-1) |
|
|
|
positive = [[cond, {"concat_latent_image": t}]] |
|
negative = [[torch.zeros_like(pooled), {"concat_latent_image": torch.zeros_like(t)}]] |
|
latent = torch.zeros([batch_size, 4, height // 8, width // 8]) |
|
return (positive, negative, {"samples":latent}) |
|
|
|
class StableZero123_Conditioning_Batched: |
|
@classmethod |
|
def INPUT_TYPES(s): |
|
return {"required": { "clip_vision": ("CLIP_VISION",), |
|
"init_image": ("IMAGE",), |
|
"vae": ("VAE",), |
|
"width": ("INT", {"default": 256, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), |
|
"height": ("INT", {"default": 256, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), |
|
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), |
|
"elevation": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), |
|
"azimuth": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), |
|
"elevation_batch_increment": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), |
|
"azimuth_batch_increment": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), |
|
}} |
|
RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") |
|
RETURN_NAMES = ("positive", "negative", "latent") |
|
|
|
FUNCTION = "encode" |
|
|
|
CATEGORY = "conditioning/3d_models" |
|
|
|
def encode(self, clip_vision, init_image, vae, width, height, batch_size, elevation, azimuth, elevation_batch_increment, azimuth_batch_increment): |
|
output = clip_vision.encode_image(init_image) |
|
pooled = output.image_embeds.unsqueeze(0) |
|
pixels = comfy.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1) |
|
encode_pixels = pixels[:,:,:,:3] |
|
t = vae.encode(encode_pixels) |
|
|
|
cam_embeds = [] |
|
for i in range(batch_size): |
|
cam_embeds.append(camera_embeddings(elevation, azimuth)) |
|
elevation += elevation_batch_increment |
|
azimuth += azimuth_batch_increment |
|
|
|
cam_embeds = torch.cat(cam_embeds, dim=0) |
|
cond = torch.cat([comfy.utils.repeat_to_batch_size(pooled, batch_size), cam_embeds], dim=-1) |
|
|
|
positive = [[cond, {"concat_latent_image": t}]] |
|
negative = [[torch.zeros_like(pooled), {"concat_latent_image": torch.zeros_like(t)}]] |
|
latent = torch.zeros([batch_size, 4, height // 8, width // 8]) |
|
return (positive, negative, {"samples":latent, "batch_index": [0] * batch_size}) |
|
|
|
class SV3D_Conditioning: |
|
@classmethod |
|
def INPUT_TYPES(s): |
|
return {"required": { "clip_vision": ("CLIP_VISION",), |
|
"init_image": ("IMAGE",), |
|
"vae": ("VAE",), |
|
"width": ("INT", {"default": 576, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), |
|
"height": ("INT", {"default": 576, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), |
|
"video_frames": ("INT", {"default": 21, "min": 1, "max": 4096}), |
|
"elevation": ("FLOAT", {"default": 0.0, "min": -90.0, "max": 90.0, "step": 0.1, "round": False}), |
|
}} |
|
RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") |
|
RETURN_NAMES = ("positive", "negative", "latent") |
|
|
|
FUNCTION = "encode" |
|
|
|
CATEGORY = "conditioning/3d_models" |
|
|
|
def encode(self, clip_vision, init_image, vae, width, height, video_frames, elevation): |
|
output = clip_vision.encode_image(init_image) |
|
pooled = output.image_embeds.unsqueeze(0) |
|
pixels = comfy.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1) |
|
encode_pixels = pixels[:,:,:,:3] |
|
t = vae.encode(encode_pixels) |
|
|
|
azimuth = 0 |
|
azimuth_increment = 360 / (max(video_frames, 2) - 1) |
|
|
|
elevations = [] |
|
azimuths = [] |
|
for i in range(video_frames): |
|
elevations.append(elevation) |
|
azimuths.append(azimuth) |
|
azimuth += azimuth_increment |
|
|
|
positive = [[pooled, {"concat_latent_image": t, "elevation": elevations, "azimuth": azimuths}]] |
|
negative = [[torch.zeros_like(pooled), {"concat_latent_image": torch.zeros_like(t), "elevation": elevations, "azimuth": azimuths}]] |
|
latent = torch.zeros([video_frames, 4, height // 8, width // 8]) |
|
return (positive, negative, {"samples":latent}) |
|
|
|
|
|
NODE_CLASS_MAPPINGS = { |
|
"StableZero123_Conditioning": StableZero123_Conditioning, |
|
"StableZero123_Conditioning_Batched": StableZero123_Conditioning_Batched, |
|
"SV3D_Conditioning": SV3D_Conditioning, |
|
} |
|
|