update joy and crawl UwU
Changed files: crawl/crawl (+13 -6), joy (+205 -132)
crawl/crawl
CHANGED

@@ -16,7 +16,7 @@ import platform
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import time
 import argparse
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse
 import requests
 try:
     from crawl4ai import WebCrawler  # type: ignore

@@ -75,11 +75,18 @@ def download_image(session, image_url, save_dir, base_url):
         The base URL of the page being crawled.
     """
     try:
-        #
+        # Parse the base URL to get the scheme and netloc
+        parsed_base_url = urlparse(base_url)
+        base_image_url = (
+            f"{parsed_base_url.scheme}://{parsed_base_url.netloc}/"
+        )
+
+        # Ensure the URL has a scheme and is properly joined with
+        # the base image URL
+        if not re.match(r"^https?://", image_url):
+            image_url = urljoin(
+                base_image_url, image_url.lstrip("/")
+            )

         image_filename = os.path.basename(image_url).split("?")[0]
         sanitized_image_filename = sanitize_filename(image_filename)
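
The new block in download_image rebuilds the page's origin with urlparse and joins scheme-less image paths onto it before the filename is extracted. A rough standalone sketch of that behaviour (the helper name and the example URLs below are illustrative, not part of the commit):

import re
from urllib.parse import urljoin, urlparse

def resolve_image_url(image_url: str, base_url: str) -> str:
    # Rebuild the site origin (scheme://netloc/) from the page URL, then
    # anchor relative or scheme-less image paths to it.
    parsed = urlparse(base_url)
    root = f"{parsed.scheme}://{parsed.netloc}/"
    if not re.match(r"^https?://", image_url):
        image_url = urljoin(root, image_url.lstrip("/"))
    return image_url

# resolve_image_url("/img/cat.jpg?size=large", "https://example.com/gallery/page2")
# -> "https://example.com/img/cat.jpg?size=large"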

joy
CHANGED

@@ -18,7 +18,6 @@ import os
 import argparse
 import re
 import random
-from collections import Counter
 from pathlib import Path
 from PIL import Image
 import pillow_jxl

@@ -26,14 +25,14 @@ import torch
 import torchvision.transforms.functional as TVF
 from transformers import (
     AutoModel,
-    AutoProcessor,
     AutoTokenizer,
     AutoModelForCausalLM,
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
 )
 from torch import nn
-from e6db_reader import
+from e6db_reader import TagSetNormalizer, tag_category2id, tag_rank_to_freq
+from typing import List, Tuple, Dict

 CLIP_PATH = "google/siglip-so400m-patch14-384"
 MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"

@@ -63,8 +62,7 @@ CAPTION_TYPE_MAP = {
         "Write a stable diffusion prompt for this image."
     ],
     ("training_prompt", "formal", False, True): [
-        "Write a stable diffusion prompt for this image within {word_count} "
-        "words."
+        "Write a stable diffusion prompt for this image within {word_count} " "words."
     ],
     ("training_prompt", "formal", True, False): [
         "Write a {length} stable diffusion prompt for this image."

@@ -82,6 +80,7 @@ CAPTION_TYPE_MAP = {

 HF_TOKEN = os.environ.get("HF_TOKEN", None)

+
 class ImageAdapter(nn.Module):
     """
     Custom image adapter module for processing CLIP vision outputs.

@@ -118,8 +117,10 @@ class ImageAdapter(nn.Module):
         self.activation = nn.GELU()
         self.linear2 = nn.Linear(output_features, output_features)
         self.ln1 = nn.Identity() if not ln1 else nn.LayerNorm(input_features)
-        self.pos_emb =
+        self.pos_emb = (
+            None
+            if not pos_emb
+            else nn.Parameter(torch.zeros(num_image_tokens, input_features))
         )

         self.other_tokens = nn.Embedding(3, output_features)

@@ -136,26 +137,29 @@ class ImageAdapter(nn.Module):
             torch.Tensor: Adapted image features.
         """
         if self.deep_extract:
-            x = torch.concat(
-                f"Expected {vision_outputs[-2].shape[-1] * 5}, got {x.shape[-1]}"
+            x = torch.concat(
+                (
+                    vision_outputs[-2],
+                    vision_outputs[3],
+                    vision_outputs[7],
+                    vision_outputs[13],
+                    vision_outputs[20],
+                ),
+                dim=-1,
             )
+            assert len(x.shape) == 3, f"Expected 3, got {len(x.shape)}"
+            assert (
+                x.shape[-1] == vision_outputs[-2].shape[-1] * 5
+            ), f"Expected {vision_outputs[-2].shape[-1] * 5}, got {x.shape[-1]}"
         else:
             x = vision_outputs[-2]

         x = self.ln1(x)

         if self.pos_emb is not None:
-            assert
-            )
+            assert (
+                x.shape[-2:] == self.pos_emb.shape
+            ), f"Expected {self.pos_emb.shape}, got {x.shape[-2:]}"
             x = x + self.pos_emb

         x = self.linear1(x)

@@ -167,9 +171,11 @@ class ImageAdapter(nn.Module):
                 x.shape[0], -1
             )
         )
-        assert other_tokens.shape == (
+        assert other_tokens.shape == (
+            x.shape[0],
+            2,
+            x.shape[2],
+        ), f"Expected {(x.shape[0], 2, x.shape[2])}, got {other_tokens.shape}"
         x = torch.cat((other_tokens[:, 0:1], x, other_tokens[:, 1:2]), dim=1)

         return x
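
With deep_extract enabled, the forward pass above now concatenates five hidden states from the vision tower along the feature dimension, so the adapter sees a feature width five times that of a single layer; the new asserts just spell out that contract. A minimal sketch of the same shape arithmetic (the tensor sizes here are made up for illustration):

import torch

# Pretend hidden states from a vision tower: batch 1, 729 patch tokens, width 1152.
hidden = [torch.randn(1, 729, 1152) for _ in range(25)]

x = torch.concat(
    (hidden[-2], hidden[3], hidden[7], hidden[13], hidden[20]), dim=-1
)
assert x.shape == (1, 729, 1152 * 5)  # token count unchanged, feature width times five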

@@ -185,6 +191,7 @@ class ImageAdapter(nn.Module):
             torch.tensor([2], device=self.other_tokens.weight.device)
         ).squeeze(0)

+
 class JoyCaptionModel:
     """
     A class for generating captions for images using CLIP, LLM, and custom image adapters.

@@ -221,8 +228,12 @@ class JoyCaptionModel:

         if (CHECKPOINT_PATH / "clip_model.pt").exists():
             print("Loading VLM's custom vision model")
-            checkpoint = torch.load(
+            checkpoint = torch.load(
+                CHECKPOINT_PATH / "clip_model.pt", map_location="cpu"
+            )
+            checkpoint = {
+                k.replace("_orig_mod.module.", ""): v for k, v in checkpoint.items()
+            }
             self.clip_model.load_state_dict(checkpoint)
             del checkpoint

@@ -240,15 +251,11 @@ class JoyCaptionModel:
         if (CHECKPOINT_PATH / "text_model").exists():
             print("Loading VLM's custom text model")
             self.text_model = AutoModelForCausalLM.from_pretrained(
-                CHECKPOINT_PATH / "text_model",
-                device_map=0,
-                torch_dtype=torch.bfloat16
+                CHECKPOINT_PATH / "text_model", device_map=0, torch_dtype=torch.bfloat16
             )
         else:
             self.text_model = AutoModelForCausalLM.from_pretrained(
-                MODEL_PATH,
-                device_map="auto",
-                torch_dtype=torch.bfloat16
+                MODEL_PATH, device_map="auto", torch_dtype=torch.bfloat16
             )

         self.text_model.eval()

@@ -260,7 +267,7 @@ class JoyCaptionModel:
             False,
             False,
             38,
-            False
+            False,
         )
         self.image_adapter.load_state_dict(
             torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location="cpu")

@@ -269,12 +276,14 @@ class JoyCaptionModel:
         self.image_adapter.to("cuda")

     @torch.no_grad()
-    def process_image(
+    def process_image(
+        self,
+        input_image: Image.Image,
+        caption_type: str,
+        caption_tone: str,
+        caption_length: str | int,
+        custom_prompt: str | None = None,
+    ) -> str:
         """
         Process an input image and generate a caption based on specified parameters.
         """
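
The widened process_image signature above matches how main() invokes it later in this diff; a call would look roughly like the following (the model object and image variable are placeholders, and the surrounding model-loading code is in parts of joy not shown here):

caption = joy_caption_model.process_image(
    input_image,          # a PIL.Image.Image
    "descriptive",        # caption_type
    "formal",             # caption_tone
    "any",                # caption_length (str or int)
    custom_prompt=None,   # or a prompt built by prompt_from_tags()
)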

@@ -283,14 +292,18 @@ class JoyCaptionModel:
         if custom_prompt is not None:
             prompt_str = custom_prompt
         else:
-            prompt_str = self._get_prompt_string(
+            prompt_str = self._get_prompt_string(
+                caption_type, caption_tone, caption_length
+            )
         print(f"Prompt: {prompt_str}")

         pixel_values = self._preprocess_image(input_image)
         prompt = self._tokenize_prompt(prompt_str)

         embedded_images = self._embed_image(pixel_values)
-        inputs_embeds, input_ids, attention_mask = self._construct_inputs(
+        inputs_embeds, input_ids, attention_mask = self._construct_inputs(
+            embedded_images, prompt
+        )

         generate_ids = self._generate_caption(inputs_embeds, input_ids, attention_mask)
         caption = self._decode_caption(generate_ids, input_ids)

@@ -313,7 +326,7 @@ class JoyCaptionModel:
             caption_type,
             caption_tone,
             isinstance(length, str),
-            isinstance(length, int)
+            isinstance(length, int),
         )
         if prompt_key not in CAPTION_TYPE_MAP:
             raise ValueError(f"Invalid caption type: {prompt_key}")

@@ -327,57 +340,73 @@ class JoyCaptionModel:
         image = input_image.resize((384, 384), Image.LANCZOS)
         pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
         pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
-        pixel_values = pixel_values.to(
+        pixel_values = pixel_values.to("cuda")
         return pixel_values

     def _tokenize_prompt(self, prompt_str):
         prompt = self.tokenizer.encode(
             prompt_str,
-            return_tensors=
+            return_tensors="pt",
             padding=False,
             truncation=False,
-            add_special_tokens=False
+            add_special_tokens=False,
         )
         return prompt

     def _embed_image(self, pixel_values):
-        with torch.amp.autocast_mode.autocast(
-            vision_outputs = self.clip_model(
+        with torch.amp.autocast_mode.autocast("cuda", enabled=True):
+            vision_outputs = self.clip_model(
+                pixel_values=pixel_values, output_hidden_states=True
+            )
             image_features = vision_outputs.hidden_states
             embedded_images = self.image_adapter(image_features)
-            embedded_images = embedded_images.to(
+            embedded_images = embedded_images.to("cuda")
         return embedded_images

     def _construct_inputs(self, embedded_images, prompt):
-        prompt_embeds = self.text_model.model.embed_tokens(prompt.to(
-        assert prompt_embeds.shape == (
+        prompt_embeds = self.text_model.model.embed_tokens(prompt.to("cuda"))
+        assert prompt_embeds.shape == (
+            1,
+            prompt.shape[1],
+            self.text_model.config.hidden_size,
+        ), (
             f"Prompt shape is {prompt_embeds.shape}, expected "
             f"{(1, prompt.shape[1], self.text_model.config.hidden_size)}"
         )

         embedded_bos = self.text_model.model.embed_tokens(
-            torch.tensor(
+            torch.tensor(
+                [[self.tokenizer.bos_token_id]],
+                device=self.text_model.device,
+                dtype=torch.int64,
+            )
+        )
+
+        eot_embed = (
+            self.image_adapter.get_eot_embedding()
+            .unsqueeze(0)
+            .to(dtype=self.text_model.dtype)
         )

+        inputs_embeds = torch.cat(
+            [
+                embedded_bos.expand(embedded_images.shape[0], -1, -1),
+                embedded_images.to(dtype=embedded_bos.dtype),
+                prompt_embeds.expand(embedded_images.shape[0], -1, -1),
+                eot_embed.expand(embedded_images.shape[0], -1, -1),
+            ],
+            dim=1,
         )

-            torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
-            prompt,
-            torch.tensor([[self.tokenizer.eos_token_id]], dtype=torch.long),
-        ], dim=1).to('cuda')
+        input_ids = torch.cat(
+            [
+                torch.tensor([[self.tokenizer.bos_token_id]], dtype=torch.long),
+                torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
+                prompt,
+                torch.tensor([[self.tokenizer.eos_token_id]], dtype=torch.long),
+            ],
+            dim=1,
+        ).to("cuda")
         attention_mask = torch.ones_like(input_ids)

         return inputs_embeds, input_ids, attention_mask
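
The rewritten _construct_inputs above lines the generation input up as [BOS] + image tokens + prompt + [EOS]: inputs_embeds carries the real embeddings (with the adapter's get_eot_embedding() in the last slot), while input_ids holds zeros under the image span and the tokenizer's EOS id at the end, so both tensors stay the same length. A shape-only sketch, with sizes that are illustrative rather than taken from the model:

import torch

n_image, n_prompt, hidden = 300, 12, 4096      # illustrative sizes only
seq_len = 1 + n_image + n_prompt + 1           # [BOS] + image + prompt + [EOS]

inputs_embeds = torch.zeros(1, seq_len, hidden)
input_ids = torch.cat(
    [
        torch.tensor([[1]], dtype=torch.long),         # stand-in BOS id
        torch.zeros((1, n_image), dtype=torch.long),   # zeros under the image span
        torch.zeros((1, n_prompt), dtype=torch.long),  # stand-in prompt ids
        torch.tensor([[2]], dtype=torch.long),         # stand-in EOS id
    ],
    dim=1,
)
attention_mask = torch.ones_like(input_ids)
assert input_ids.shape == (1, seq_len) and attention_mask.shape == (1, seq_len)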

@@ -389,21 +418,20 @@ class JoyCaptionModel:
             attention_mask=attention_mask,
             max_new_tokens=300,
             do_sample=True,
-            suppress_tokens=None
+            suppress_tokens=None,
         )
         return generate_ids

     def _decode_caption(self, generate_ids, input_ids):
-        generate_ids = generate_ids[:, input_ids.shape[1]:]
+        generate_ids = generate_ids[:, input_ids.shape[1] :]

-        if
+        if generate_ids[0][-1] == self.tokenizer.eos_token_id or generate_ids[0][
+            -1
+        ] == self.tokenizer.convert_tokens_to_ids("<|eot_id|>"):
             generate_ids = generate_ids[:, :-1]

         caption = self.tokenizer.batch_decode(
-            generate_ids,
-            skip_special_tokens=False,
-            clean_up_tokenization_spaces=False
+            generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
         )[0]
         return caption

@@ -413,53 +441,52 @@ def main():
     parser = argparse.ArgumentParser(
         description="Generate captions for images in a directory and save them as .caption files."
     )
-    parser.add_argument(
+    parser.add_argument(
+        "directory", type=str, help="Target directory containing images."
+    )
     parser.add_argument(
         "--caption_type",
         type=str,
         default="descriptive",
         choices=["descriptive", "training_prompt", "rng-tags", "custom"],
-        help="Type of caption to generate."
+        help="Type of caption to generate.",
     )
     parser.add_argument(
         "--caption_tone",
         type=str,
         default="formal",
         choices=["formal", "informal"],
-        help="Tone of the caption."
+        help="Tone of the caption.",
     )
     parser.add_argument(
-        "--caption_length",
-        type=str,
-        default="any",
-        help="Length of the caption."
+        "--caption_length", type=str, default="any", help="Length of the caption."
     )
     parser.add_argument(
         "--dont-strip-commas",
         action="store_true",
-        help="If set, commas will not be stripped from the generated captions."
+        help="If set, commas will not be stripped from the generated captions.",
     )
     parser.add_argument(
         "--custom_prompt",
         type=str,
-        help="Custom prompt for the captioner. Use with --caption_type custom."
+        help="Custom prompt for the captioner. Use with --caption_type custom.",
     )
     parser.add_argument(
-        action=
-        help=
+        "--add-commas-to-sentence-ends",
+        action="store_true",
+        help="Add commas after periods in sentences",
    )
     parser.add_argument(
+        "--feed-from-tags",
         type=int,
-        nargs=
+        nargs="?",
         const=-1,
-        help=
+        help="Use .txt files with the same base filename as the images as input to the captioner. Optionally specify the number of tags to use.",
     )
     parser.add_argument(
+        "--random-tags",
         type=int,
-        help=
+        help="Randomly select n number of tags. Only works if --feed-from-tags is enabled.",
     )

     args = parser.parse_args()

@@ -468,7 +495,7 @@ def main():
     if args.random_tags is not None and args.feed_from_tags is None:
         parser.error("--random-tags can only be used when --feed-from-tags is enabled")

-    print(
+    print("Loading e621 tag data")
     tagset_normalizer = make_tagset_normalizer()

     # Initialize and load models

@@ -484,7 +511,7 @@ def main():
     image_extensions = {".webp", ".png", ".jpeg", ".jpg", ".jxl"}
     for image_path in Path(args.directory).rglob("*"):
         if image_path.suffix.lower() in image_extensions:
-            caption_file = image_path.with_suffix(
+            caption_file = image_path.with_suffix(".caption")

             # Skip if the caption file already exists
             if caption_file.exists():

@@ -501,29 +528,28 @@ def main():
                 custom_prompt = prompt_from_tags(args, image_path, tagset_normalizer)

                 print(f"Custom prompt: {custom_prompt}")
-                continue

             caption = joy_caption_model.process_image(
                 input_image,
                 args.caption_type,
                 args.caption_tone,
                 args.caption_length,
-                custom_prompt=custom_prompt
+                custom_prompt=custom_prompt,
             )

             # Strip commas if the --dont-strip-commas flag is not set
             if not args.dont_strip_commas:
                 # Existing comma stripping logic
-                caption = re.sub(r
+                caption = re.sub(r",\s*([^\d])", r" \1", caption)

             # New feature: Add commas after periods if specified
             if args.add_commas_to_sentence_ends:
-                caption = re.sub(r
+                caption = re.sub(r"(\.)(\s+)([A-Z])", r"\1,\2\3", caption)

             print(f"Caption for {image_path}:\n\n{caption}\n\n")

             # Save the caption to a .caption file
-            with open(caption_file,
+            with open(caption_file, "w", encoding="utf-8") as f:
                 f.write(caption)
             print(f"Caption saved to {caption_file}")
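
Taken together, the argparse changes above make the image directory a positional argument and add the tag-feeding switches. A typical invocation after this commit might look like the following (paths and counts are placeholders):

python joy /path/to/images --caption_type descriptive --feed-from-tags 20
python joy /path/to/images --feed-from-tags --random-tags 10

The second form works because --feed-from-tags given without a value falls back to const=-1, which satisfies the later check that --random-tags is only allowed when --feed-from-tags is enabled.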

@@ -531,6 +557,7 @@ def main():
 RE_PARENS_SUFFIX = re.compile(r"_\([^)]+\)$")
 E6DB_DATA = Path(__file__).resolve().parent / "data"

+
 def make_tagset_normalizer():
     """
     Create a TagSetNormalizer for encoding/decoding tags to and from integers.

@@ -581,7 +608,6 @@ def make_tagset_normalizer():
     return tagset_normalizer.map_inputs(input_map, on_conflict="ignore")


-
 def format_nl_list(l):
     n = len(l)
     assert n > 0

@@ -589,36 +615,51 @@ def format_nl_list(l):
         return l[0]
     elif n == 2:
         return f"{l[0]} and {l[1]}"
-    else:
+    else:  # n > 2
         *head, last = l
-        return
+        return ", ".join(head) + ", and " + last

+
+TAG_SPECIES = tag_category2id["species"]
+TAG_CHARACTER = tag_category2id["character"]
+TAG_ARTIST = tag_category2id["artist"]
+TAG_COPYRIGHT = tag_category2id["copyright"]
+TAG_META = tag_category2id["meta"]
 TAG_FREQ_THRESH = 0

+
 def prompt_from_tags(args, image_path: Path, tagset_normalizer: TagSetNormalizer):
+    """
+    Generates a prompt from tags associated with the given image.
+
+    Args:
+        args: Additional arguments for the function.
+        image_path (Path): The path to the image file.
+        tagset_normalizer (TagSetNormalizer): An instance to normalize the tag set.
+
+    Returns:
+        None
+    """
     tag_file = find_tag_file(image_path)
     if tag_file is None:
         return None

-    with open(tag_file,
-        tags = f.read().lower().split(
+    with open(tag_file, "r", encoding="utf-8") as f:
+        tags = f.read().lower().split(",")

     tag_id_to_cat_id = tagset_normalizer.tag_normalizer.tag_categories
     encode = tagset_normalizer.tag_normalizer.encode

     # These lists contain tuples (freq, tag, tag_id)
-    tag_by_category
+    tag_by_category: Dict[int, List[Tuple[int, str, int]]] = {
+        cat: [] for cat in [TAG_ARTIST, TAG_CHARACTER, TAG_COPYRIGHT, TAG_SPECIES]
+    }
+    other_tags: List[Tuple[int, str, int]] = []
+    implied: set = set()
     for tag in tags:
         tag = tag.strip()
         # Encode the tag into a numerical id
-        tag_id = encode(tag.replace(
+        tag_id = encode(tag.replace(" ", "_"))
         if tag_id is None:
             other_tags.append((0, tag, None))
             implied.update(tagset_normalizer.implications_rej.get(tag_id, ()))

@@ -633,69 +674,101 @@ def prompt_from_tags(args, image_path: Path, tagset_normalizer: TagSetNormalizer):
         freq = tag_rank_to_freq(tag_id)
         if freq < TAG_FREQ_THRESH:
             continue
-        tag_by_category.get(cat_id, other_tags).append((freq, tag, tag_id))
+        tag_by_category.get(cat_id, other_tags).append((int(freq), tag, tag_id))

-    other_tags = sorted(
+    other_tags = sorted(
+        (int(freq), tag, tag_id)
+        for freq, tag, tag_id in other_tags
+        if tag_id not in implied
+    )
     for cat_id, cat_list in tag_by_category.items():
-        tag_by_category[cat_id] = sorted(
+        tag_by_category[cat_id] = sorted(
+            (int(freq), tag, tag_id)
+            for freq, tag, tag_id in cat_list
+            if tag_id not in implied
+        )

     if args.random_tags is not None:
         # Randomly select tags if --random-tags is specified
         num_tags = min(args.random_tags, len(other_tags))
-        other_tags = random.sample(
+        other_tags = random.sample(
+            [
+                (i, tag, tag_id)
+                for i, tag, tag_id in enumerate(tags[: round(args.random_tags * 1.5)])
+            ],
+            num_tags,
+        )
     elif args.feed_from_tags > 0:
         # Use specified number of tags if --feed-from-tags has a positive value
-        other_tags = other_tags[:args.feed_from_tags]
+        other_tags = other_tags[: args.feed_from_tags]

     # Prepare sentence pieces
     artist_tag = tag_by_category[TAG_ARTIST]
     if artist_tag:
+        artist_list = [str(tag).removeprefix('by ')
+                       for *_, tag in artist_tag[:4]]
+        artist_txt = f"by {format_nl_list(artist_list)}"
     else:
-        artist_txt =
+        artist_txt = ""
     character_tag = tag_by_category[TAG_CHARACTER]
     if character_tag:
+        tags = [tag for _, tag, *_ in character_tag[:4]]
+        character_txt = f" named {format_nl_list(tags)}"
     else:
-        character_txt =
+        character_txt = ""
     species_tag = tag_by_category[TAG_SPECIES]
     if species_tag:
-        species_txt =
+        species_txt = "of a" if len(character_tag) <= 1 and len(species_tag) <= 1 else "of"
+        species_txt += format_nl_list([tag for *_, tag in species_tag[:4]])
     else:
         if character_tag:
-            species_txt =
+            species_txt = (
+                " a character"
+                if len(character_tag) <= 1
+                else " characters"
+            )
         else:
-            species_txt =
+            species_txt = ""
     copyright_tag = tag_by_category[TAG_COPYRIGHT]
     if copyright_tag:
+        tags = [tag for _, tag, *_ in copyright_tag[:4]]
+        copyright_txt = f" from {format_nl_list(tags)}"
     else:
-        copyright_txt =
-    tag_string =
-    custom_prompt =
+        copyright_txt = ""
+
+    tag_string = ", ".join(tag for *_, tag in other_tags)
+    custom_prompt = (
+        f"Write a descriptive caption for this image {artist_txt}"
+        f"of {species_txt}"
+        f"{character_txt}"
+        f"{copyright_txt}"
+        f" in a formal tone. Use these tags to construct your caption: "
+        f"{tag_string}"
+    )
     return custom_prompt

+
 def find_tag_file(image_path):
     """
     Find the corresponding .txt file for the given image path.
     Handles cases where the image has a -(number) suffix.
     """
     base_name = image_path.stem
-    tag_file = image_path.with_suffix(
+    tag_file = image_path.with_suffix(".txt")

     if tag_file.exists():
         return tag_file

     # Handle -(number) suffix
-    match = re.match(r
+    match = re.match(r"(.+)-\d+$", base_name)
     if match:
         base_name = match.group(1)
-        tag_file = image_path.with_name(base_name).with_suffix(
+        tag_file = image_path.with_name(base_name).with_suffix(".txt")
         if tag_file.exists():
             return tag_file

     return None

+
 if __name__ == "__main__":
     main()
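
The reworked format_nl_list above joins longer lists with an Oxford comma, which is what the artist, character, species, and copyright phrases in prompt_from_tags rely on; for example:

format_nl_list(["fox"])                    # -> "fox"
format_nl_list(["fox", "wolf"])            # -> "fox and wolf"
format_nl_list(["fox", "wolf", "dragon"])  # -> "fox, wolf, and dragon"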
|