k4d3 committed
Commit 1a98ccf · 1 Parent(s): fbeebb5
caption/joy_single.py ADDED
@@ -0,0 +1,236 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+
+ """
+ Simplified JoyCaption - Generates captions for a single image input
+ """
+
+ import os
+ import argparse
+ from pathlib import Path
+ from PIL import Image
+ import torch
+ import torchvision.transforms.functional as TVF
+ from transformers import (
+     AutoModel,
+     AutoTokenizer,
+     AutoModelForCausalLM,
+ )
+ from torch import nn
+ import logging
+
+ CLIP_PATH = "google/siglip-so400m-patch14-384"
+ MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
+ CHECKPOINT_PATH = Path(__file__).resolve().parent / "cgrkzexw-599808"
+
+ class ImageAdapter(nn.Module):
+     def __init__(
+         self,
+         input_features: int,
+         output_features: int,
+         ln1: bool,
+         pos_emb: bool,
+         num_image_tokens: int,
+         deep_extract: bool,
+     ):
+         super().__init__()
+         self.deep_extract = deep_extract
+         if self.deep_extract:
+             input_features = input_features * 5
+
+         self.linear1 = nn.Linear(input_features, output_features)
+         self.activation = nn.GELU()
+         self.linear2 = nn.Linear(output_features, output_features)
+         self.ln1 = nn.Identity() if not ln1 else nn.LayerNorm(input_features)
+         self.pos_emb = (
+             None
+             if not pos_emb
+             else nn.Parameter(torch.zeros(num_image_tokens, input_features))
+         )
+         self.other_tokens = nn.Embedding(3, output_features)
+         self.other_tokens.weight.data.normal_(mean=0.0, std=0.02)
+
+     def forward(self, vision_outputs):
+         if self.deep_extract:
+             x = torch.concat(
+                 (
+                     vision_outputs[-2],
+                     vision_outputs[3],
+                     vision_outputs[7],
+                     vision_outputs[13],
+                     vision_outputs[20],
+                 ),
+                 dim=-1,
+             )
+         else:
+             x = vision_outputs[-2]
+
+         x = self.ln1(x)
+         if self.pos_emb is not None:
+             x = x + self.pos_emb
+
+         x = self.linear1(x)
+         x = self.activation(x)
+         x = self.linear2(x)
+
+         other_tokens = self.other_tokens(
+             torch.tensor([0, 1], device=self.other_tokens.weight.device).expand(
+                 x.shape[0], -1
+             )
+         )
+         x = torch.cat((other_tokens[:, 0:1], x, other_tokens[:, 1:2]), dim=1)
+         return x
+
+ class SimpleCaptioner:
+     def __init__(self):
+         self.clip_model = None
+         self.text_model = None
+         self.image_adapter = None
+         self.tokenizer = None
+
+     def load_models(self):
+         logging.info("Loading CLIP")
+         self.clip_model = AutoModel.from_pretrained(CLIP_PATH)
+         self.clip_model = self.clip_model.vision_model
+
+         if (CHECKPOINT_PATH / "clip_model.pt").exists():
+             checkpoint = torch.load(
+                 CHECKPOINT_PATH / "clip_model.pt", map_location="cpu"
+             )
+             checkpoint = {
+                 k.replace("_orig_mod.module.", ""): v for k, v in checkpoint.items()
+             }
+             self.clip_model.load_state_dict(checkpoint)
+
+         self.clip_model.eval()
+         self.clip_model.requires_grad_(False)
+         self.clip_model.to("cuda")
+
+         logging.info("Loading tokenizer and LLM")
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             CHECKPOINT_PATH / "text_model", use_fast=True
+         )
+
+         if (CHECKPOINT_PATH / "text_model").exists():
+             self.text_model = AutoModelForCausalLM.from_pretrained(
+                 CHECKPOINT_PATH / "text_model", device_map=0, torch_dtype=torch.bfloat16
+             )
+         else:
+             self.text_model = AutoModelForCausalLM.from_pretrained(
+                 MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16
+             )
+         self.text_model.eval()
+
+         logging.info("Loading image adapter")
+         self.image_adapter = ImageAdapter(
+             self.clip_model.config.hidden_size,
+             self.text_model.config.hidden_size,
+             False,
+             False,
+             38,
+             False,
+         )
+         self.image_adapter.load_state_dict(
+             torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu")
+         )
+         self.image_adapter.eval()
+         self.image_adapter.to("cuda")
+
+     @torch.no_grad()
+     def generate_caption(self, image_path: str) -> str:
+         # Load and preprocess image
+         input_image = Image.open(image_path).convert("RGB")
+         image = input_image.resize((384, 384), Image.LANCZOS)
+         pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
+         pixel_values = TVF.normalize(pixel_values, [0.5], [0.5]).to("cuda")
+
+         # Generate image embeddings
+         vision_outputs = self.clip_model(pixel_values=pixel_values, output_hidden_states=True)
+         embedded_images = self.image_adapter(vision_outputs.hidden_states)
+
+         # Prepare prompt
+         prompt = "Write a descriptive caption for this image in a formal tone."
+         convo = [
+             {"role": "system", "content": "You are a helpful image captioner."},
+             {"role": "user", "content": prompt},
+         ]
+         convo_string = self.tokenizer.apply_chat_template(
+             convo, tokenize=False, add_generation_prompt=True
+         )
+
+         # Tokenize and prepare inputs
+         convo_tokens = self.tokenizer.encode(
+             convo_string, return_tensors="pt", add_special_tokens=False
+         )
+         prompt_tokens = self.tokenizer.encode(
+             prompt, return_tensors="pt", add_special_tokens=False
+         )
+
+         eot_id_indices = (
+             (convo_tokens == self.tokenizer.convert_tokens_to_ids("<|eot_id|>"))
+             .nonzero(as_tuple=True)[1]  # convo_tokens has shape (1, seq_len); take column indices
+             .tolist()
+         )
+         preamble_len = eot_id_indices[1] - prompt_tokens.shape[1]
+
+         convo_embeds = self.text_model.model.embed_tokens(convo_tokens.to("cuda"))
+
+         input_embeds = torch.cat(
+             [
+                 convo_embeds[:, :preamble_len],
+                 embedded_images.to(dtype=convo_embeds.dtype),
+                 convo_embeds[:, preamble_len:],
+             ],
+             dim=1,
+         )
+
+         input_ids = torch.cat(
+             [
+                 convo_tokens[:, :preamble_len],
+                 # Zero placeholder ids for the inserted image tokens; assembled on CPU
+                 # alongside convo_tokens, then the whole sequence is moved to the GPU
+                 torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
+                 convo_tokens[:, preamble_len:],
+             ],
+             dim=1,
+         ).to("cuda")
+
+         attention_mask = torch.ones_like(input_ids)
+
+         # Generate caption
+         generate_ids = self.text_model.generate(
+             input_ids,
+             inputs_embeds=input_embeds,
+             attention_mask=attention_mask,
+             max_new_tokens=300,
+             do_sample=True,
+             repetition_penalty=1.2,
+         )
+
+         # Decode caption
+         generate_ids = generate_ids[:, input_ids.shape[1]:]
+         if generate_ids[0][-1] == self.tokenizer.eos_token_id or generate_ids[0][-1] == self.tokenizer.convert_tokens_to_ids("<|eot_id|>"):
+             generate_ids = generate_ids[:, :-1]
+
+         caption = self.tokenizer.batch_decode(
+             generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
+         )[0]
+
+         return caption.strip()
+
+ def main():
+     parser = argparse.ArgumentParser(description="Generate a caption for a single image")
+     parser.add_argument("image_path", type=str, help="Path to the input image")
+     args = parser.parse_args()
+
+     # Setup logging
+     logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s')
+
+     # Initialize and load the captioner
+     captioner = SimpleCaptioner()
+     captioner.load_models()
+
+     # Generate and print caption
+     caption = captioner.generate_caption(args.image_path)
+     print(f"\nGenerated caption:\n{caption}")
+
+ if __name__ == "__main__":
+     main()
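
A minimal usage sketch for the script above, assuming the cgrkzexw-599808 checkpoint directory sits next to joy_single.py, a CUDA GPU is available, and the module is importable from the working directory; "example.jpg" is a placeholder path. The command-line equivalent is: python caption/joy_single.py example.jpg

# Usage sketch; the paths and environment here are assumptions, not part of the commit.
from joy_single import SimpleCaptioner

captioner = SimpleCaptioner()
captioner.load_models()                           # loads SigLIP, the LLM, and the image adapter onto the GPU
print(captioner.generate_caption("example.jpg"))  # placeholder image path
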
comfy_nodes/deep_shrink_mk2.py ADDED
@@ -0,0 +1,129 @@
+ import torch
+ import comfy.utils
+
+ class PatchModelAddDownscale_v2:
+     """A UNet model patch that implements dynamic latent downscaling with gradual transition.
+
+     This node is an enhanced version of the original PatchModelAddDownscale that adds smooth
+     transition capabilities. It operates in three phases:
+
+     1. Full Downscale (start_percent → end_percent):
+        Latents are downscaled by the specified downscale_factor
+
+     2. Gradual Transition (end_percent → gradual_percent):
+        Latents smoothly transition from downscaled size back to original size
+
+     3. Original Size (after gradual_percent):
+        Latents remain at their original size
+
+     The gradual transition helps prevent abrupt changes in the generation process,
+     potentially leading to more consistent results.
+
+     Parameters:
+         model: The model to patch
+         block_number: Which UNet block to apply the patch to
+         downscale_factor: How much to shrink the latents by
+         start_percent: When to start downscaling (in terms of sampling progress)
+         end_percent: When to begin transitioning back to original size
+         gradual_percent: When to complete the transition to original size
+         downscale_after_skip: Whether to apply downscaling after skip connections
+         downscale_method: Algorithm to use for downscaling
+         upscale_method: Algorithm to use for upscaling
+
+     Code by:
+     - https://github.com/Jordach
+     """
+
+     upscale_methods = ["bicubic", "nearest-exact", "bilinear", "area", "bislerp"]
+
+     @classmethod
+     def INPUT_TYPES(s):
+         return {"required": {
+             "model": ("MODEL",),
+             "block_number": ("INT", {"default": 3, "min": 1, "max": 32, "step": 1}),
+             "downscale_factor": ("FLOAT", {"default": 2.0, "min": 0.1, "max": 9.0, "step": 0.001}),
+             "start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}),
+             "end_percent": ("FLOAT", {"default": 0.35, "min": 0.0, "max": 1.0, "step": 0.001}),
+             "gradual_percent": ("FLOAT", {"default": 0.6, "min": 0.0, "max": 1.0, "step": 0.001}),
+             "downscale_after_skip": ("BOOLEAN", {"default": True}),
+             "downscale_method": (s.upscale_methods,),
+             "upscale_method": (s.upscale_methods,),
+         }}
+
+     RETURN_TYPES = ("MODEL",)
+     FUNCTION = "patch"
+     CATEGORY = "model_patches/unet"
+
+     def calculate_upscale_factor(self, sigma, sigma_end, sigma_rescale, downscale_factor):
+         """Calculate the upscale factor during the gradual resize phase.
+
+         Sigmas shrink as sampling progresses, so the transition runs from sigma_end
+         (still fully downscaled) down to sigma_rescale (back to original size).
+         """
+         if sigma >= sigma_end:
+             return 1.0 / downscale_factor  # Still fully downscaled
+         elif sigma <= sigma_rescale:
+             return 1.0  # Fully back to original size
+         else:
+             # Linear interpolation between downscaled and original size
+             progress = (sigma_end - sigma) / (sigma_end - sigma_rescale)
+             scale_diff = 1.0 - (1.0 / downscale_factor)
+             return (1.0 / downscale_factor) + (scale_diff * progress)
+
+     def patch(self, model, block_number, downscale_factor, start_percent, end_percent,
+               gradual_percent, downscale_after_skip, downscale_method, upscale_method):
+         model_sampling = model.get_model_object("model_sampling")
+         sigma_start = model_sampling.percent_to_sigma(start_percent)
+         sigma_end = model_sampling.percent_to_sigma(end_percent)
+         sigma_rescale = model_sampling.percent_to_sigma(gradual_percent)
+
+         def input_block_patch(h, transformer_options):
+             if downscale_factor == 1:
+                 return h
+
+             if transformer_options["block"][1] == block_number:
+                 sigma = transformer_options["sigmas"][0].item()
+
+                 # Normal downscale behavior between start_percent and end_percent
+                 if sigma <= sigma_start and sigma >= sigma_end:
+                     h = comfy.utils.common_upscale(
+                         h,
+                         round(h.shape[-1] * (1.0 / downscale_factor)),
+                         round(h.shape[-2] * (1.0 / downscale_factor)),
+                         downscale_method,
+                         "disabled"
+                     )
+                 # Gradually upscale latent after end_percent until gradual_percent
+                 elif sigma < sigma_end and sigma >= sigma_rescale:
+                     scale_factor = self.calculate_upscale_factor(
+                         sigma, sigma_end, sigma_rescale, downscale_factor
+                     )
+                     h = comfy.utils.common_upscale(
+                         h,
+                         round(h.shape[-1] * scale_factor),
+                         round(h.shape[-2] * scale_factor),
+                         upscale_method,
+                         "disabled"
+                     )
+             return h
+
+         def output_block_patch(h, hsp, transformer_options):
+             if h.shape[2] != hsp.shape[2]:
+                 h = comfy.utils.common_upscale(
+                     h, hsp.shape[-1], hsp.shape[-2],
+                     upscale_method, "disabled"
+                 )
+             return h, hsp
+
+         m = model.clone()
+         if downscale_after_skip:
+             m.set_model_input_block_patch_after_skip(input_block_patch)
+         else:
+             m.set_model_input_block_patch(input_block_patch)
+         m.set_model_output_block_patch(output_block_patch)
+         return (m, )
+
+ NODE_CLASS_MAPPINGS = {
+     "PatchModelAddDownscale_v2": PatchModelAddDownscale_v2,
+ }
+
+ NODE_DISPLAY_NAME_MAPPINGS = {
+     # Sampling
+     "PatchModelAddDownscale_v2": "PatchModelAddDownscale v2",
+ }
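
The gradual-transition arithmetic can be checked in isolation. The sketch below repeats the same linear interpolation outside ComfyUI; the sigma values are made up for the example, and sigmas shrink as sampling progresses.

# Standalone sketch of the gradual-transition interpolation; sigma values are hypothetical.
def upscale_factor(sigma, sigma_end, sigma_rescale, downscale_factor):
    if sigma >= sigma_end:
        return 1.0 / downscale_factor  # still fully downscaled
    if sigma <= sigma_rescale:
        return 1.0  # back to original size
    progress = (sigma_end - sigma) / (sigma_end - sigma_rescale)
    return (1.0 / downscale_factor) + (1.0 - 1.0 / downscale_factor) * progress

# With downscale_factor=2 and a transition window running from sigma 5.0 down to 2.0:
for s in (5.0, 3.5, 2.0):
    print(s, upscale_factor(s, 5.0, 2.0, 2.0))  # 0.5, 0.75, 1.0
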
comfy_nodes/easy_aspects.py ADDED
@@ -0,0 +1,103 @@
+ import math
+
+ class AutoImageSize:
+     """A utility node that automatically calculates optimal image dimensions and parameters.
+
+     This node helps create properly scaled images while maintaining desired aspect ratios
+     and managing performance through compression factors. It also automatically adjusts
+     denoise strength based on the output resolution.
+
+     Features:
+     - Maintains exact aspect ratios while ensuring dimensions are divisible by the compression factor
+     - Automatically calculates appropriate denoise strength based on resolution scaling
+     - Supports both portrait and landscape orientations
+     - Prevents downscaling below base resolution
+
+     Parameters:
+         aspect_ratio: The desired width/height ratio (1.0 = square, >1 = wider/taller)
+         orientation: Whether the image should be portrait or landscape
+         target_resolution: The desired maximum dimension in pixels
+         base_resolution: The model's native resolution (usually 1024)
+         compression_factor: Ensures dimensions are divisible by this value (usually 8 for VAEs)
+
+     Returns:
+         WIDTH: The calculated image width
+         HEIGHT: The calculated image height
+         DOWNSCALE_FACTOR: The scaling factor relative to base_resolution
+         DENOISE_STRENGTH: Automatically adjusted denoise strength (0.1-0.65)
+
+     The denoise strength calculation uses an exponential decay curve fitted to known good values:
+     - 1.0x (1024px) → 0.75
+     - 1.5x (1536px) → 0.45
+     - 2.0x (2048px) → 0.2
+
+     Code by:
+     - https://github.com/Jordach
+     """
+
+     @classmethod
+     def INPUT_TYPES(s):
+         return {
+             "required": {
+                 "aspect_ratio": ("FLOAT", {"default": 1, "min": 1, "max": 8, "step": 0.01}),
+                 "orientation": (["portrait", "landscape"],),
+                 "target_resolution": ("INT", {"default": 1024, "min": 256, "max": 1024*8, "step": 1}),
+                 "base_resolution": ("INT", {"default": 1024, "min": 256, "max": 1024*8, "step": 1}),
+                 "compression_factor": ("INT", {"default": 8, "min": 1, "max": 64, "step": 1}),
+             }
+         }
+
+     RETURN_TYPES = ("INT", "INT", "FLOAT", "FLOAT")
+     RETURN_NAMES = ("WIDTH", "HEIGHT", "DOWNSCALE_FACTOR", "DENOISE_STRENGTH")
+     FUNCTION = "create_res"
+
+     CATEGORY = "utils"
+
+     def calculate_denoise_strength(self, scale_factor):
+         """
+         Calculate appropriate denoise strength based on resolution scale factor.
+         Uses exponential decay curve fitted to known good values:
+         - 1.0x (1024px) → 0.75
+         - 1.5x (1536px) → 0.45
+         - 2.0x (2048px) → 0.2
+         """
+         # Base denoise value for 1024px (scale_factor = 1.0)
+         base_denoise = 0.95
+
+         # Calculate denoise strength using exponential decay
+         # Formula: denoise = base_denoise * e^(-k * (scale_factor - 1))
+         # where k is calculated to fit our known points
+         # Decay constant fitted to match reference points
+         k = 1.55
+
+         denoise = base_denoise * math.exp(-k * (scale_factor - 1))
+         d_min = 0.1
+         d_max = 0.65
+         # Clamp the result between 0.1 and 0.65
+         return max(d_min, min(d_max, denoise))
+
+     def create_res(self, aspect_ratio, orientation, target_resolution, base_resolution, compression_factor):
+         # Prevent cases where DOWNSCALE_FACTOR can be < 1
+         if target_resolution < base_resolution:
+             target_resolution = base_resolution
+
+         w, h = target_resolution, target_resolution
+         if orientation == "portrait":
+             w = int((((target_resolution**2)/aspect_ratio)**0.5)//compression_factor)*compression_factor
+             h = int((((target_resolution**2)*aspect_ratio)**0.5)//compression_factor)*compression_factor
+         elif orientation == "landscape":
+             w = int((((target_resolution**2)*aspect_ratio)**0.5)//compression_factor)*compression_factor
+             h = int((((target_resolution**2)/aspect_ratio)**0.5)//compression_factor)*compression_factor
+
+         scale_factor = target_resolution/base_resolution
+         denoise_strength = self.calculate_denoise_strength(scale_factor)
+
+         return (w, h, scale_factor, denoise_strength)
+
+ NODE_CLASS_MAPPINGS = {
+     "JDC_AutoImageSize": AutoImageSize
+ }
+
+ NODE_DISPLAY_NAME_MAPPINGS = {
+     "JDC_AutoImageSize": "Easy Aspect Ratios"
+ }
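
A quick worked example of the node's arithmetic, as a standalone sketch with arbitrary inputs: at target_resolution=2048, base_resolution=1024, compression_factor=8 and aspect_ratio=1.5 in landscape, the node returns 2504x1672, a scale factor of 2.0, and a denoise strength of roughly 0.20.

import math

# Standalone sketch mirroring AutoImageSize.create_res and calculate_denoise_strength;
# the input values are arbitrary examples.
target, base, comp, ar = 2048, 1024, 8, 1.5           # landscape, aspect ratio 1.5
w = int((((target**2) * ar) ** 0.5) // comp) * comp   # -> 2504
h = int((((target**2) / ar) ** 0.5) // comp) * comp   # -> 1672
scale = target / base                                 # -> 2.0
denoise = max(0.1, min(0.65, 0.95 * math.exp(-1.55 * (scale - 1))))  # -> ~0.20
print(w, h, scale, round(denoise, 2))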