k4d3 committed on
Commit
913d039
·
1 Parent(s): 3461363

joy: some fixes and niceties

Browse files
Files changed (1)
  1. joy +153 -299
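For context, a possible invocation exercising the options this commit adds (--dry-run and -v); the directory path and flag values below are illustrative only, and the script is assumed to be run directly with Python:

python joy /path/to/images --caption_type descriptive --feed-from-tags 15 --dry-run -v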
joy CHANGED
@@ -2,7 +2,7 @@
2
  # -*- coding: utf-8 -*-
3
 
4
  """
5
- JoyCaption Alpha One
6
 
7
  This module provides functionality for generating captions for images using a
8
  combination of CLIP, LLM, and custom image adapters. It supports various
@@ -34,52 +34,53 @@ from transformers import (
34
  )
35
  from torch import nn
36
  from e6db_reader import TagSetNormalizer, tag_category2id, tag_rank_to_freq
 
37
 
38
  CLIP_PATH = "google/siglip-so400m-patch14-384"
39
  MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
40
  CHECKPOINT_PATH = Path(__file__).resolve().parent / "cgrkzexw-599808"
41
  CAPTION_TYPE_MAP = {
42
- "Descriptive": [
43
  "Write a descriptive caption for this image in a formal tone.",
44
  "Write a descriptive caption for this image in a formal tone within {word_count} words.",
45
  "Write a {length} descriptive caption for this image in a formal tone.",
46
  ],
47
- "Descriptive (Informal)": [
48
  "Write a descriptive caption for this image in a casual tone.",
49
  "Write a descriptive caption for this image in a casual tone within {word_count} words.",
50
  "Write a {length} descriptive caption for this image in a casual tone.",
51
  ],
52
- "Training Prompt": [
53
  "Write a stable diffusion prompt for this image.",
54
  "Write a stable diffusion prompt for this image within {word_count} words.",
55
  "Write a {length} stable diffusion prompt for this image.",
56
  ],
57
- "MidJourney": [
58
  "Write a MidJourney prompt for this image.",
59
  "Write a MidJourney prompt for this image within {word_count} words.",
60
  "Write a {length} MidJourney prompt for this image.",
61
  ],
62
- "Booru tag list": [
63
  "Write a list of Booru tags for this image.",
64
  "Write a list of Booru tags for this image within {word_count} words.",
65
  "Write a {length} list of Booru tags for this image.",
66
  ],
67
- "Booru-like tag list": [
68
  "Write a list of Booru-like tags for this image.",
69
  "Write a list of Booru-like tags for this image within {word_count} words.",
70
  "Write a {length} list of Booru-like tags for this image.",
71
  ],
72
- "Art Critic": [
73
  "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc.",
74
  "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it within {word_count} words.",
75
  "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it {length}.",
76
  ],
77
- "Product Listing": [
78
  "Write a caption for this image as though it were a product listing.",
79
  "Write a caption for this image as though it were a product listing. Keep it under {word_count} words.",
80
  "Write a {length} caption for this image as though it were a product listing.",
81
  ],
82
- "Social Media Post": [
83
  "Write a caption for this image as if it were being used for a social media post.",
84
  "Write a caption for this image as if it were being used for a social media post. Limit the caption to {word_count} words.",
85
  "Write a {length} caption for this image as if it were being used for a social media post.",
@@ -208,202 +209,11 @@ class ImageAdapter(nn.Module):
208
  ).squeeze(0)
209
 
210
 
211
- STOP_WORDS: set[str] = {
212
- "the",
213
- "a",
214
- "an",
215
- "and",
216
- "or",
217
- "but",
218
- "in",
219
- "on",
220
- "at",
221
- "to",
222
- "for",
223
- "of",
224
- "with",
225
- "by",
226
- "from",
227
- "up",
228
- "down",
229
- "is",
230
- "are",
231
- "was",
232
- "were",
233
- "be",
234
- "been",
235
- "being",
236
- "have",
237
- "has",
238
- "had",
239
- "do",
240
- "does",
241
- "did",
242
- "will",
243
- "would",
244
- "shall",
245
- "should",
246
- "can",
247
- "could",
248
- "may",
249
- "might",
250
- "must",
251
- "ought",
252
- "i",
253
- "you",
254
- "he",
255
- "she",
256
- "it",
257
- "we",
258
- "they",
259
- "them",
260
- "their",
261
- "this",
262
- "that",
263
- "these",
264
- "those",
265
- "am",
266
- "is",
267
- "are",
268
- "was",
269
- "were",
270
- "be",
271
- "been",
272
- "being",
273
- "have",
274
- "has",
275
- "had",
276
- "do",
277
- "does",
278
- "did",
279
- "will",
280
- "would",
281
- "shall",
282
- "should",
283
- "can",
284
- "could",
285
- "may",
286
- "might",
287
- "must",
288
- "ought",
289
- "i'm",
290
- "you're",
291
- "he's",
292
- "she's",
293
- "it's",
294
- "we're",
295
- "they're",
296
- "i've",
297
- "you've",
298
- "we've",
299
- "they've",
300
- "i'd",
301
- "you'd",
302
- "he'd",
303
- "she'd",
304
- "we'd",
305
- "they'd",
306
- "i'll",
307
- "you'll",
308
- "he'll",
309
- "she'll",
310
- "we'll",
311
- "they'll",
312
- "isn't",
313
- "aren't",
314
- "wasn't",
315
- "weren't",
316
- "hasn't",
317
- "haven't",
318
- "hadn't",
319
- "doesn't",
320
- "don't",
321
- "didn't",
322
- "won't",
323
- "wouldn't",
324
- "shan't",
325
- "shouldn't",
326
- "can't",
327
- "cannot",
328
- "couldn't",
329
- "mustn't",
330
- "let's",
331
- "that's",
332
- "who's",
333
- "what's",
334
- "here's",
335
- "there's",
336
- "when's",
337
- "where's",
338
- "why's",
339
- "how's",
340
- "a",
341
- "an",
342
- "the",
343
- "and",
344
- "but",
345
- "if",
346
- "or",
347
- "because",
348
- "as",
349
- "until",
350
- "while",
351
- "of",
352
- "at",
353
- "by",
354
- "for",
355
- "with",
356
- "about",
357
- "against",
358
- "between",
359
- "into",
360
- "through",
361
- "during",
362
- "before",
363
- "after",
364
- "above",
365
- "below",
366
- "to",
367
- "from",
368
- "up",
369
- "down",
370
- "in",
371
- "out",
372
- "on",
373
- "off",
374
- "over",
375
- "under",
376
- "again",
377
- "further",
378
- "then",
379
- "once",
380
- "here",
381
- "there",
382
- "when",
383
- "where",
384
- "why",
385
- "how",
386
- "all",
387
- "any",
388
- "both",
389
- "each",
390
- "few",
391
- "more",
392
- "most",
393
- "other",
394
- "some",
395
- "such",
396
- "no",
397
- "nor",
398
- "not",
399
- "only",
400
- "own",
401
- "same",
402
- "so",
403
- "than",
404
- "too",
405
- "very",
406
- }
407
 
408
 
409
  class JoyCaptionModel:
@@ -440,12 +250,12 @@ class JoyCaptionModel:
440
  """
441
  Load and initialize all required models (CLIP, LLM, image adapter).
442
  """
443
- print("Loading CLIP")
444
  self.clip_model = AutoModel.from_pretrained(CLIP_PATH)
445
  self.clip_model = self.clip_model.vision_model
446
 
447
  if (CHECKPOINT_PATH / "clip_model.pt").exists():
448
- print("Loading VLM's custom vision model")
449
  checkpoint = torch.load(
450
  CHECKPOINT_PATH / "clip_model.pt", map_location="cpu"
451
  )
@@ -459,15 +269,15 @@ class JoyCaptionModel:
459
  self.clip_model.requires_grad_(False)
460
  self.clip_model.to("cuda")
461
 
462
- print("Loading tokenizer")
463
  self.tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
464
  assert isinstance(
465
  self.tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
466
  )
467
 
468
- print("Loading LLM")
469
  if (CHECKPOINT_PATH / "text_model").exists():
470
- print("Loading VLM's custom text model")
471
  self.text_model = AutoModelForCausalLM.from_pretrained(
472
  CHECKPOINT_PATH / "text_model", device_map=0, torch_dtype=torch.bfloat16
473
  )
@@ -478,7 +288,7 @@ class JoyCaptionModel:
478
 
479
  self.text_model.eval()
480
 
481
- print("Loading image adapter")
482
  self.image_adapter = ImageAdapter(
483
  self.clip_model.config.hidden_size,
484
  self.text_model.config.hidden_size,
@@ -497,9 +307,7 @@ class JoyCaptionModel:
497
  def process_image(
498
  self,
499
  input_image: Image.Image,
500
- caption_type: str,
501
- caption_length: str | int,
502
- custom_prompt: str | None = None,
503
  ) -> Tuple[str, float]:
504
  """
505
  Process an input image and generate a caption based on specified parameters.
@@ -509,12 +317,7 @@ class JoyCaptionModel:
509
  Tuple[str, float]: The generated caption and its entropy.
510
  """
511
  torch.cuda.empty_cache()
512
-
513
- if custom_prompt is not None:
514
- prompt_str = custom_prompt
515
- else:
516
- prompt_str = self._get_prompt_string(caption_type, caption_length)
517
- print(f"Prompt: {prompt_str}")
518
 
519
  pixel_values = self._preprocess_image(input_image)
520
 
@@ -535,9 +338,7 @@ class JoyCaptionModel:
535
  def generate_valid_caption(
536
  self,
537
  input_image: Image.Image,
538
- caption_type: str,
539
- caption_length: str | int,
540
- custom_prompt: str | None = None,
541
  *,
542
  limited_words: Dict[str, int] = {"fluffy": 2},
543
  min_sentence_count: int = 3,
@@ -550,9 +351,7 @@ class JoyCaptionModel:
550
 
551
  Args:
552
  input_image (Image.Image): The input image to caption.
553
- caption_type (str): The type of caption to generate.
554
- caption_length (str | int): The desired length of the caption.
555
- custom_prompt (str | None): A custom prompt for caption generation.
556
  limited_words (Dict[str, int]): Dictionary of words with their maximum allowed occurrences. Default is {"fluffy": 2}.
557
  min_sentence_count (int): Minimum required number of sentences. Default is 3.
558
  max_word_repetitions (int): Maximum allowed repetitions for words longer than 4 characters. Default is 15.
@@ -570,9 +369,7 @@ class JoyCaptionModel:
570
  - The entropy of the caption is below min_entropy
571
  """
572
  while True:
573
- caption, entropy = self.process_image(
574
- input_image, caption_type, caption_length, custom_prompt
575
- )
576
  words = re.findall(r"\b\w+\b", caption.lower())
577
  word_counts = {
578
  word: words.count(word) for word in set(words) if word not in stop_words
@@ -580,11 +377,11 @@ class JoyCaptionModel:
580
  sentence_count = len(re.findall(r"[.!?]", caption))
581
 
582
  if not re.search(r"\w", caption):
583
- print(
584
  f"Retrying: Caption contains only special characters.\nCaption: {caption!r}"
585
  )
586
  elif caption[-1] not in {".", "!", "?"}:
587
- print(
588
  f"Retrying: Caption does not end with proper punctuation.\nCaption: {caption!r}"
589
  )
590
  elif any(
@@ -596,7 +393,7 @@ class JoyCaptionModel:
596
  for word, max_count in limited_words.items()
597
  if caption.lower().count(word) > max_count
598
  ]
599
- print(
600
  f"Retrying: Limited words exceeded: {', '.join(exceeded_words)}.\nCaption: {caption!r}"
601
  )
602
  elif any(
@@ -609,21 +406,22 @@ class JoyCaptionModel:
609
  for word, count in word_counts.items()
610
  if count > max_word_repetitions and len(word) > 4
611
  ]
612
- print(
613
  f"Retrying: Words repeated more than {max_word_repetitions} times: {', '.join(repeated_words)}.\nCaption: {caption!r}"
614
  )
615
  elif sentence_count < min_sentence_count:
616
- print(
617
  f"Retrying: Only {sentence_count} sentences (min: {min_sentence_count}).\nCaption: {caption!r}"
618
  )
619
  elif entropy < min_entropy:
620
- print(
621
  f"Retrying: Low entropy ({entropy:.2f} < {min_entropy}).\nCaption: {caption!r}"
622
  )
623
  else:
624
  return caption
625
 
626
- def _get_prompt_string(self, caption_type, caption_length):
 
627
  length = None if caption_length == "any" else caption_length
628
 
629
  if isinstance(length, str):
@@ -642,13 +440,16 @@ class JoyCaptionModel:
642
  else:
643
  raise ValueError(f"Invalid caption length: {length}")
644
 
 
645
  if caption_type not in CAPTION_TYPE_MAP:
646
  raise ValueError(f"Invalid caption type: {caption_type}")
647
 
648
  prompt_str = CAPTION_TYPE_MAP[caption_type][map_idx]
 
649
  return prompt_str
650
 
651
- def _preprocess_image(self, input_image: Image.Image) -> torch.Tensor:
 
652
  """
653
  Preprocess the input image for the CLIP model.
654
 
@@ -703,6 +504,7 @@ class JoyCaptionModel:
703
  convo_string = self.tokenizer.apply_chat_template(
704
  convo, tokenize=False, add_generation_prompt=True
705
  )
 
706
  convo_tokens = self.tokenizer.encode(
707
  convo_string,
708
  return_tensors="pt",
@@ -756,7 +558,7 @@ class JoyCaptionModel:
756
  input_ids,
757
  inputs_embeds=inputs_embeds,
758
  attention_mask=attention_mask,
759
- max_new_tokens=300,
760
  do_sample=True,
761
  suppress_tokens=None,
762
  repetition_penalty=1.2,
@@ -800,6 +602,38 @@ class JoyCaptionModel:
800
  return entropy
801
 
802
 
803
  def main():
804
  """
805
  Generate captions for images in a directory
@@ -818,7 +652,7 @@ def main():
818
  "--caption_type",
819
  type=str,
820
  default="descriptive",
821
- choices=["descriptive", "training_prompt", "rng-tags", "custom"],
822
  help="Type of caption to generate.",
823
  )
824
  parser.add_argument(
@@ -858,25 +692,37 @@ def main():
858
  "Only works if --feed-from-tags is enabled."
859
  ),
860
  )
861
 
862
  args = parser.parse_args()
863
 
864
  # Validate random-tags usage
865
  if args.random_tags is not None and args.feed_from_tags is None:
866
  parser.error("--random-tags can only be used when --feed-from-tags is enabled")
867
 
868
- print("Loading e621 tag data")
869
- tagset_normalizer = make_tagset_normalizer()
 
870
 
871
- # Initialize and load models
872
- joy_caption_model = JoyCaptionModel()
873
- joy_caption_model.load_models()
874
-
875
- # Validate custom prompt usage
876
- if args.caption_type == "custom" and not args.custom_prompt:
877
- parser.error("--custom_prompt is required when using --caption_type custom")
878
- elif args.caption_type != "custom" and args.custom_prompt:
879
- parser.error("--custom_prompt can only be used with --caption_type custom")
880
 
881
  image_extensions = {".webp", ".png", ".jpeg", ".jpg", ".jxl"}
882
  for image_path in Path(args.directory).rglob("*"):
@@ -885,31 +731,27 @@ def main():
885
 
886
  # Skip if the caption file already exists
887
  if caption_file.exists():
888
- print(f"Skipping {image_path}: Caption file already exists.")
889
  continue
890
 
891
- input_image = Image.open(image_path).convert("RGB")
 
892
 
893
  # Use custom prompt if specified
894
- custom_prompt = None
895
- if args.caption_type == "custom":
896
- custom_prompt = args.custom_prompt
897
- elif args.feed_from_tags is not None:
898
- base_prompt = joy_caption_model._get_prompt_string(
899
- args.caption_type, args.caption_length
900
- )
901
- custom_prompt = prompt_from_tags(
902
- args, image_path, tagset_normalizer, base_prompt
903
- )
904
 
905
- print(f"\nCustom prompt: {custom_prompt}")
 
906
 
907
- caption = joy_caption_model.generate_valid_caption(
908
- input_image,
909
- args.caption_type,
910
- args.caption_length,
911
- custom_prompt=custom_prompt,
912
- )
 
913
 
914
  # Strip commas if the --dont-strip-commas flag is not set
915
  if not args.dont_strip_commas:
@@ -923,12 +765,12 @@ def main():
923
  # Remove all newline characters
924
  caption = caption.replace("\n", " ")
925
 
926
- print(f"Caption for {image_path}:\n\n{caption}\n\n")
927
 
928
  # Save the caption to a .caption file
929
  with open(caption_file, "w", encoding="utf-8") as f:
930
  f.write(caption)
931
- print(f"Caption saved to {caption_file}")
932
 
933
 
934
  RE_PARENS_SUFFIX = re.compile(r"_\([^)]+\)$")
@@ -1005,7 +847,6 @@ TAG_CHARACTER = tag_category2id["character"]
1005
  TAG_ARTIST = tag_category2id["artist"]
1006
  TAG_COPYRIGHT = tag_category2id["copyright"]
1007
  TAG_META = tag_category2id["meta"]
1008
- TAG_FREQ_THRESH = 0
1009
 
1010
 
1011
  def prompt_from_tags(
@@ -1013,6 +854,8 @@ def prompt_from_tags(
1013
  image_path: Path,
1014
  tagset_normalizer: TagSetNormalizer,
1015
  base_prompt: str = "Write a descriptive caption for this image in a formal tone.",
 
 
1016
  ):
1017
  """
1018
  Generates a prompt from tags associated with the given image.
@@ -1023,31 +866,35 @@ def prompt_from_tags(
1023
  The path to the image file.
1024
  tagset_normalizer (TagSetNormalizer):
1025
  An instance to normalize the tag set.
1026
-
1027
- Returns:
1028
- None
1029
  """
 
1030
  tag_file = find_tag_file(image_path)
1031
  if tag_file is None:
1032
- return None
 
1033
 
1034
  with open(tag_file, "r", encoding="utf-8") as f:
1035
  tags = f.read().lower().split(",")
1036
 
 
1037
  tag_id_to_cat_id = tagset_normalizer.tag_normalizer.tag_categories
1038
  encode = tagset_normalizer.tag_normalizer.encode
1039
 
1040
- # These lists contain tuples (freq, tag, tag_id)
 
1041
  tag_by_category: Dict[int, List[Tuple[int, str, int]]] = {
1042
  cat: [] for cat in [TAG_ARTIST, TAG_CHARACTER, TAG_COPYRIGHT, TAG_SPECIES]
1043
  }
1044
  other_tags: List[Tuple[int, str, int]] = []
1045
  implied: set = set()
 
 
1046
  for tag in tags:
1047
  tag = tag.strip()
1048
  # Encode the tag into a numerical id
1049
  tag_id = encode(tag.replace(" ", "_"))
1050
  if tag_id is None:
 
1051
  other_tags.append((0, tag, 0))
1052
  implied.update(tagset_normalizer.implications_rej.get(0, ()))
1053
  continue
@@ -1056,26 +903,29 @@ def prompt_from_tags(
1056
  # Skip meta tags
1057
  if cat_id == TAG_META:
1058
  continue
 
1059
  implied.update(tagset_normalizer.implications.get(tag_id, ()))
1060
  # Get the frequency of the tag
1061
  freq = tag_rank_to_freq(tag_id)
1062
- if freq < TAG_FREQ_THRESH:
1063
  continue
 
1064
  tag_by_category.get(cat_id, other_tags).append((int(freq), tag, tag_id))
1065
 
 
1066
  other_tags = sorted(
1067
- (int(freq), tag, tag_id)
1068
  for freq, tag, tag_id in other_tags
1069
  if tag_id not in implied
1070
  )
1071
 
 
1072
  for cat_id, cat_list in tag_by_category.items():
1073
  tag_by_category[cat_id] = sorted(
1074
- (int(freq), tag, tag_id)
1075
- for freq, tag, tag_id in cat_list
1076
- if tag_id not in implied
1077
  )
1078
 
 
1079
  if args.random_tags is not None:
1080
  # Randomly select tags if --random-tags is specified
1081
  num_tags = min(args.random_tags, len(other_tags))
@@ -1090,7 +940,7 @@ def prompt_from_tags(
1090
  # Use specified number of tags if --feed-from-tags has a positive value
1091
  other_tags = other_tags[: args.feed_from_tags]
1092
 
1093
- # Prepare sentence pieces
1094
  artist_tag = tag_by_category[TAG_ARTIST]
1095
  if artist_tag:
1096
  artist_list = [str(tp[1]).removeprefix("by ") for tp in artist_tag[:4]]
@@ -1113,7 +963,9 @@ def prompt_from_tags(
1113
  species_txt += format_nl_list([tp[1] for tp in species_tag[:4]])
1114
  else:
1115
  if character_tag:
1116
- species_txt = " a character" if len(character_tag) <= 1 else " characters"
 
 
1117
  else:
1118
  species_txt = ""
1119
 
@@ -1123,8 +975,11 @@ def prompt_from_tags(
1123
  copyright_txt = f"from {format_nl_list(tags)}"
1124
  else:
1125
  copyright_txt = ""
 
 
1126
  tag_string = ", ".join(tp[1] for tp in other_tags)
1127
 
 
1128
  image_pos = base_prompt.find("image")
1129
  if image_pos < 0:
1130
  raise ValueError("Base prompt must contain the word 'image'")
@@ -1132,20 +987,19 @@ def prompt_from_tags(
1132
  base_prompt_prefix = base_prompt[:image_pos].rstrip()
1133
  base_prompt_suffix = base_prompt[image_pos:].lstrip()
1134
 
1135
- custom_prompt = " ".join(
1136
- s
1137
- for s in [
1138
- base_prompt_prefix,
1139
- artist_txt,
1140
- species_txt,
1141
- character_txt,
1142
- copyright_txt,
1143
- base_prompt_suffix,
1144
- "Use these tags to construct your caption:",
1145
- tag_string,
1146
- ]
1147
- if s
1148
- )
1149
  return custom_prompt
1150
 
1151
 
 
2
  # -*- coding: utf-8 -*-
3
 
4
  """
5
+ JoyCaption Alpha Two
6
 
7
  This module provides functionality for generating captions for images using a
8
  combination of CLIP, LLM, and custom image adapters. It supports various
 
34
  )
35
  from torch import nn
36
  from e6db_reader import TagSetNormalizer, tag_category2id, tag_rank_to_freq
37
+ import logging
38
 
39
  CLIP_PATH = "google/siglip-so400m-patch14-384"
40
  MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
41
  CHECKPOINT_PATH = Path(__file__).resolve().parent / "cgrkzexw-599808"
42
  CAPTION_TYPE_MAP = {
43
+ "descriptive": [
44
  "Write a descriptive caption for this image in a formal tone.",
45
  "Write a descriptive caption for this image in a formal tone within {word_count} words.",
46
  "Write a {length} descriptive caption for this image in a formal tone.",
47
  ],
48
+ "descriptive (informal)": [
49
  "Write a descriptive caption for this image in a casual tone.",
50
  "Write a descriptive caption for this image in a casual tone within {word_count} words.",
51
  "Write a {length} descriptive caption for this image in a casual tone.",
52
  ],
53
+ "training prompt": [
54
  "Write a stable diffusion prompt for this image.",
55
  "Write a stable diffusion prompt for this image within {word_count} words.",
56
  "Write a {length} stable diffusion prompt for this image.",
57
  ],
58
+ "midjourney": [
59
  "Write a MidJourney prompt for this image.",
60
  "Write a MidJourney prompt for this image within {word_count} words.",
61
  "Write a {length} MidJourney prompt for this image.",
62
  ],
63
+ "booru tag list": [
64
  "Write a list of Booru tags for this image.",
65
  "Write a list of Booru tags for this image within {word_count} words.",
66
  "Write a {length} list of Booru tags for this image.",
67
  ],
68
+ "booru-like tag list": [
69
  "Write a list of Booru-like tags for this image.",
70
  "Write a list of Booru-like tags for this image within {word_count} words.",
71
  "Write a {length} list of Booru-like tags for this image.",
72
  ],
73
+ "art critic": [
74
  "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc.",
75
  "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it within {word_count} words.",
76
  "Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it {length}.",
77
  ],
78
+ "product listing": [
79
  "Write a caption for this image as though it were a product listing.",
80
  "Write a caption for this image as though it were a product listing. Keep it under {word_count} words.",
81
  "Write a {length} caption for this image as though it were a product listing.",
82
  ],
83
+ "social media post": [
84
  "Write a caption for this image as if it were being used for a social media post.",
85
  "Write a caption for this image as if it were being used for a social media post. Limit the caption to {word_count} words.",
86
  "Write a {length} caption for this image as if it were being used for a social media post.",
 
209
  ).squeeze(0)
210
 
211
 
212
+ STOP_WORDS: set[str] = set(
213
+ "i'll if we'd can't you'd shouldn't i'd only doesn't further isn't didn't has more aren't during do than were he's too here you against could few for ought won't we until weren't i've they're same up she but are how here's their over can under mustn't while on by had and an each he'd he about she'd am was she'll where's did out or that's it they'd a let's shall what's the to don't when below no any some from is hadn't all they i'm must in before who's own where you've that very them this not because it's shan't wasn't you'll when's most off i at other hasn't nor been such again we'll down above will so should into she's once have these why's be we've as being why those then with after may you're would haven't both wouldn't there cannot they've couldn't how's between does we're through he'll of there's they'll might".split(
214
+ " "
215
+ )
216
+ )
217
 
218
 
219
  class JoyCaptionModel:
 
250
  """
251
  Load and initialize all required models (CLIP, LLM, image adapter).
252
  """
253
+ logging.info("Loading CLIP")
254
  self.clip_model = AutoModel.from_pretrained(CLIP_PATH)
255
  self.clip_model = self.clip_model.vision_model
256
 
257
  if (CHECKPOINT_PATH / "clip_model.pt").exists():
258
+ logging.info("Loading VLM's custom vision model")
259
  checkpoint = torch.load(
260
  CHECKPOINT_PATH / "clip_model.pt", map_location="cpu"
261
  )
 
269
  self.clip_model.requires_grad_(False)
270
  self.clip_model.to("cuda")
271
 
272
+ logging.info("Loading tokenizer")
273
  self.tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
274
  assert isinstance(
275
  self.tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
276
  )
277
 
278
+ logging.info("Loading LLM")
279
  if (CHECKPOINT_PATH / "text_model").exists():
280
+ logging.info("Loading VLM's custom text model")
281
  self.text_model = AutoModelForCausalLM.from_pretrained(
282
  CHECKPOINT_PATH / "text_model", device_map=0, torch_dtype=torch.bfloat16
283
  )
 
288
 
289
  self.text_model.eval()
290
 
291
+ logging.info("Loading image adapter")
292
  self.image_adapter = ImageAdapter(
293
  self.clip_model.config.hidden_size,
294
  self.text_model.config.hidden_size,
 
307
  def process_image(
308
  self,
309
  input_image: Image.Image,
310
+ prompt_str: str,
 
 
311
  ) -> Tuple[str, float]:
312
  """
313
  Process an input image and generate a caption based on specified parameters.
 
317
  Tuple[str, float]: The generated caption and its entropy.
318
  """
319
  torch.cuda.empty_cache()
320
+ logging.info(f"Prompt: {prompt_str}")
321
 
322
  pixel_values = self._preprocess_image(input_image)
323
 
 
338
  def generate_valid_caption(
339
  self,
340
  input_image: Image.Image,
341
+ prompt: str,
 
 
342
  *,
343
  limited_words: Dict[str, int] = {"fluffy": 2},
344
  min_sentence_count: int = 3,
 
351
 
352
  Args:
353
  input_image (Image.Image): The input image to caption.
354
+ prompt (str): The prompt to use for caption generation.
 
 
355
  limited_words (Dict[str, int]): Dictionary of words with their maximum allowed occurrences. Default is {"fluffy": 2}.
356
  min_sentence_count (int): Minimum required number of sentences. Default is 3.
357
  max_word_repetitions (int): Maximum allowed repetitions for words longer than 4 characters. Default is 15.
 
369
  - The entropy of the caption is below min_entropy
370
  """
371
  while True:
372
+ caption, entropy = self.process_image(input_image, prompt)
 
 
373
  words = re.findall(r"\b\w+\b", caption.lower())
374
  word_counts = {
375
  word: words.count(word) for word in set(words) if word not in stop_words
 
377
  sentence_count = len(re.findall(r"[.!?]", caption))
378
 
379
  if not re.search(r"\w", caption):
380
+ logging.info(
381
  f"Retrying: Caption contains only special characters.\nCaption: {caption!r}"
382
  )
383
  elif caption[-1] not in {".", "!", "?"}:
384
+ logging.info(
385
  f"Retrying: Caption does not end with proper punctuation.\nCaption: {caption!r}"
386
  )
387
  elif any(
 
393
  for word, max_count in limited_words.items()
394
  if caption.lower().count(word) > max_count
395
  ]
396
+ logging.info(
397
  f"Retrying: Limited words exceeded: {', '.join(exceeded_words)}.\nCaption: {caption!r}"
398
  )
399
  elif any(
 
406
  for word, count in word_counts.items()
407
  if count > max_word_repetitions and len(word) > 4
408
  ]
409
+ logging.info(
410
  f"Retrying: Words repeated more than {max_word_repetitions} times: {', '.join(repeated_words)}.\nCaption: {caption!r}"
411
  )
412
  elif sentence_count < min_sentence_count:
413
+ logging.info(
414
  f"Retrying: Only {sentence_count} sentences (min: {min_sentence_count}).\nCaption: {caption!r}"
415
  )
416
  elif entropy < min_entropy:
417
+ logging.info(
418
  f"Retrying: Low entropy ({entropy:.2f} < {min_entropy}).\nCaption: {caption!r}"
419
  )
420
  else:
421
  return caption
422
 
423
+ @staticmethod
424
+ def get_prompt_string(caption_type, caption_length):
425
  length = None if caption_length == "any" else caption_length
426
 
427
  if isinstance(length, str):
 
440
  else:
441
  raise ValueError(f"Invalid caption length: {length}")
442
 
443
+ caption_type = caption_type.lower()
444
  if caption_type not in CAPTION_TYPE_MAP:
445
  raise ValueError(f"Invalid caption type: {caption_type}")
446
 
447
  prompt_str = CAPTION_TYPE_MAP[caption_type][map_idx]
448
+ prompt_str = prompt_str.format(length=caption_length, word_count=caption_length)
449
  return prompt_str
450
 
451
+ @staticmethod
452
+ def _preprocess_image(input_image: Image.Image) -> torch.Tensor:
453
  """
454
  Preprocess the input image for the CLIP model.
455
 
 
504
  convo_string = self.tokenizer.apply_chat_template(
505
  convo, tokenize=False, add_generation_prompt=True
506
  )
507
+ logging.debug(f"Convo:\n{convo_string}")
508
  convo_tokens = self.tokenizer.encode(
509
  convo_string,
510
  return_tensors="pt",
 
558
  input_ids,
559
  inputs_embeds=inputs_embeds,
560
  attention_mask=attention_mask,
561
+ max_new_tokens=512,
562
  do_sample=True,
563
  suppress_tokens=None,
564
  repetition_penalty=1.2,
 
602
  return entropy
603
 
604
 
605
+ class ColoredFormatter(logging.Formatter):
606
+ COLORS = {
607
+ "DEBUG": "\033[36m", # Cyan
608
+ "INFO": "\033[32m", # Green
609
+ "WARNING": "\033[33m", # Yellow
610
+ "ERROR": "\033[31m", # Red
611
+ "CRITICAL": "\033[31;1m", # Bright Red
612
+ }
613
+ RESET = "\033[0m"
614
+
615
+ def format(self, record):
616
+ log_message = super().format(record)
617
+ return f"{self.COLORS.get(record.levelname, '')}{log_message}{self.RESET}"
618
+
619
+
620
+ def setup_logging(verbosity):
621
+ if verbosity == 0:
622
+ log_level = logging.INFO
623
+ else:
624
+ log_level = logging.DEBUG
625
+
626
+ handler = logging.StreamHandler()
627
+ formatter = ColoredFormatter(
628
+ fmt="%(asctime)s | %(levelname)-8s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
629
+ )
630
+ handler.setFormatter(formatter)
631
+
632
+ logger = logging.getLogger()
633
+ logger.setLevel(log_level)
634
+ logger.addHandler(handler)
635
+
636
+
637
  def main():
638
  """
639
  Generate captions for images in a directory
 
652
  "--caption_type",
653
  type=str,
654
  default="descriptive",
655
+ choices=CAPTION_TYPE_MAP.keys(),
656
  help="Type of caption to generate.",
657
  )
658
  parser.add_argument(
 
692
  "Only works if --feed-from-tags is enabled."
693
  ),
694
  )
695
+ parser.add_argument(
696
+ "--dry-run",
697
+ action="store_true",
698
+ help="Run in dry-run mode without loading models or generating captions.",
699
+ )
700
+ parser.add_argument(
701
+ "-v",
702
+ "--verbose",
703
+ action="count",
704
+ default=0,
705
+ help="Increase output verbosity (can be repeated)",
706
+ )
707
 
708
  args = parser.parse_args()
709
 
710
+ setup_logging(args.verbose)
711
+
712
  # Validate random-tags usage
713
  if args.random_tags is not None and args.feed_from_tags is None:
714
  parser.error("--random-tags can only be used when --feed-from-tags is enabled")
715
 
716
+ if args.feed_from_tags is not None:
717
+ logging.info("Loading e621 tag data")
718
+ tagset_normalizer = make_tagset_normalizer()
719
 
720
+ # Initialize and load models only if not in dry-run mode
721
+ if not args.dry_run:
722
+ joy_caption_model = JoyCaptionModel()
723
+ joy_caption_model.load_models()
724
+ else:
725
+ logging.info("Running in dry-run mode. Models will not be loaded.")
726
 
727
  image_extensions = {".webp", ".png", ".jpeg", ".jpg", ".jxl"}
728
  for image_path in Path(args.directory).rglob("*"):
 
731
 
732
  # Skip if the caption file already exists
733
  if caption_file.exists():
734
+ logging.info(f"Skipping {image_path}: Caption file already exists.")
735
  continue
736
 
737
+ if not args.dry_run:
738
+ input_image = Image.open(image_path).convert("RGB")
739
 
740
  # Use custom prompt if specified
741
+ prompt = args.custom_prompt or JoyCaptionModel.get_prompt_string(
742
+ args.caption_type, args.caption_length
743
+ )
744
 
745
+ if args.feed_from_tags is not None:
746
+ prompt = prompt_from_tags(args, image_path, tagset_normalizer, prompt)
747
 
748
+ if args.dry_run:
749
+ logging.info(
750
+ f"Dry run: Skipping caption generation for {image_path} with prompt:\n\t{prompt}"
751
+ )
752
+ continue
753
+
754
+ caption = joy_caption_model.generate_valid_caption(input_image, prompt)
755
 
756
  # Strip commas if the --dont-strip-commas flag is not set
757
  if not args.dont_strip_commas:
 
765
  # Remove all newline characters
766
  caption = caption.replace("\n", " ")
767
 
768
+ logging.info(f"Caption for {image_path}:\n\t{caption}\n\n")
769
 
770
  # Save the caption to a .caption file
771
  with open(caption_file, "w", encoding="utf-8") as f:
772
  f.write(caption)
773
+ logging.info(f"Caption saved to {caption_file}")
774
 
775
 
776
  RE_PARENS_SUFFIX = re.compile(r"_\([^)]+\)$")
 
847
  TAG_ARTIST = tag_category2id["artist"]
848
  TAG_COPYRIGHT = tag_category2id["copyright"]
849
  TAG_META = tag_category2id["meta"]
 
850
 
851
 
852
  def prompt_from_tags(
 
854
  image_path: Path,
855
  tagset_normalizer: TagSetNormalizer,
856
  base_prompt: str = "Write a descriptive caption for this image in a formal tone.",
857
+ tag_freq_threshold: int = 0,
858
+ tag_string_prefix: str = "Use these tags to construct your caption:",
859
  ):
860
  """
861
  Generates a prompt from tags associated with the given image.
 
866
  The path to the image file.
867
  tagset_normalizer (TagSetNormalizer):
868
  An instance to normalize the tag set.
869
  """
870
+ # Find and read the corresponding tag file
871
  tag_file = find_tag_file(image_path)
872
  if tag_file is None:
873
+ logging.warning(f"No tag file found for {image_path}")
874
+ return base_prompt
875
 
876
  with open(tag_file, "r", encoding="utf-8") as f:
877
  tags = f.read().lower().split(",")
878
 
879
+ # Get helper functions from the tagset_normalizer
880
  tag_id_to_cat_id = tagset_normalizer.tag_normalizer.tag_categories
881
  encode = tagset_normalizer.tag_normalizer.encode
882
 
883
+ # Initialize dictionaries and lists to store categorized tags
884
+ # These lists will contain tuples (freq, tag, tag_id)
885
  tag_by_category: Dict[int, List[Tuple[int, str, int]]] = {
886
  cat: [] for cat in [TAG_ARTIST, TAG_CHARACTER, TAG_COPYRIGHT, TAG_SPECIES]
887
  }
888
  other_tags: List[Tuple[int, str, int]] = []
889
  implied: set = set()
890
+
891
+ # Process each tag
892
  for tag in tags:
893
  tag = tag.strip()
894
  # Encode the tag into a numerical id
895
  tag_id = encode(tag.replace(" ", "_"))
896
  if tag_id is None:
897
+ # If tag is not recognized, add it to other_tags
898
  other_tags.append((0, tag, 0))
899
  implied.update(tagset_normalizer.implications_rej.get(0, ()))
900
  continue
 
903
  # Skip meta tags
904
  if cat_id == TAG_META:
905
  continue
906
+ # Update implied tags
907
  implied.update(tagset_normalizer.implications.get(tag_id, ()))
908
  # Get the frequency of the tag
909
  freq = tag_rank_to_freq(tag_id)
910
+ if freq < tag_freq_threshold:
911
  continue
912
+ # Add the tag to its category, or other_tags
913
  tag_by_category.get(cat_id, other_tags).append((int(freq), tag, tag_id))
914
 
915
+ # Sort other_tags by frequency (descending) and filter out implied tags
916
  other_tags = sorted(
917
+ (-freq, tag, tag_id)
918
  for freq, tag, tag_id in other_tags
919
  if tag_id not in implied
920
  )
921
 
922
+ # Sort tags within each category, preferring non-implied tags
923
  for cat_id, cat_list in tag_by_category.items():
924
  tag_by_category[cat_id] = sorted(
925
+ ((tag_id in implied, -freq), tag, tag_id) for freq, tag, tag_id in cat_list
 
 
926
  )
927
 
928
+ # Handle random tag selection or tag limit if specified
929
  if args.random_tags is not None:
930
  # Randomly select tags if --random-tags is specified
931
  num_tags = min(args.random_tags, len(other_tags))
 
940
  # Use specified number of tags if --feed-from-tags has a positive value
941
  other_tags = other_tags[: args.feed_from_tags]
942
 
943
+ # Prepare sentence pieces for each category
944
  artist_tag = tag_by_category[TAG_ARTIST]
945
  if artist_tag:
946
  artist_list = [str(tp[1]).removeprefix("by ") for tp in artist_tag[:4]]
 
963
  species_txt += format_nl_list([tp[1] for tp in species_tag[:4]])
964
  else:
965
  if character_tag:
966
+ species_txt = (
967
+ "of a character" if len(character_tag) <= 1 else "of characters"
968
+ )
969
  else:
970
  species_txt = ""
971
 
 
975
  copyright_txt = f"from {format_nl_list(tags)}"
976
  else:
977
  copyright_txt = ""
978
+
979
+ # Prepare the remaining tags as a string
980
  tag_string = ", ".join(tp[1] for tp in other_tags)
981
 
982
+ # Extract the prefix and suffix around the word "image" from the base prompt
983
  image_pos = base_prompt.find("image")
984
  if image_pos < 0:
985
  raise ValueError("Base prompt must contain the word 'image'")
 
987
  base_prompt_prefix = base_prompt[:image_pos].rstrip()
988
  base_prompt_suffix = base_prompt[image_pos:].lstrip()
989
 
990
+ pieces = [
991
+ base_prompt_prefix,
992
+ artist_txt,
993
+ species_txt,
994
+ character_txt,
995
+ copyright_txt,
996
+ base_prompt_suffix,
997
+ tag_string_prefix,
998
+ tag_string,
999
+ ]
1000
+ logging.debug("Prompt pieces: %r", pieces)
1001
+ custom_prompt = " ".join(p for p in pieces if p)
1002
+ custom_prompt = custom_prompt.replace(" .", ".").replace(" ,", ",")
 
1003
  return custom_prompt
1004
 
1005