joy: use tag categories for building prompts

Browse files

Files changed (7) hide show

.gitignore +2 -0
__pycache__/e6db_reader.cpython-312.pyc +0 -0
data/implications.json.gz +3 -0
data/implications_rej.json.gz +3 -0
demo.py +0 -7
e6db_reader.py +1 -1
joy +169 -53

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ __pycache__
2	+

__pycache__/e6db_reader.cpython-312.pyc DELETED Viewed

Binary file (16.5 kB)

data/implications.json.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6240e9f23bacc42bcfccdfa3e86a439a4b0a489ee142c9ac855f12027c0657e7
+size 228337

data/implications_rej.json.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:33c9a2d1ed8f60d6f6122a6b2a816043de2ad029ca0fc2ad39f1fbd705e6beaa
+size 94416

demo.py DELETED Viewed

@@ -1,7 +0,0 @@
-from e6db_reader import TagNormalizer, tag_categories, tag_category2id
-tn = TagNormalizer('data')
-tn.map_inputs(lambda tag, tid: tag.replace('_', ' '))
-for tag in ['pokemon', 'pikachu', 'charizard', 'loona']:
-    print(tag, tn.get_category(tag))

e6db_reader.py CHANGED Viewed

@@ -93,7 +93,7 @@ def load_implications(data_dir):
 def tag_rank_to_freq(rank: int) -> float:
     """Approximate the frequency of a tag given its rank"""
-    return math.exp(26.4284 * math.tanh(2.93505 * rank ** (-0.136501)) - 11.492)
 def tag_freq_to_rank(freq: int) -> float:

 def tag_rank_to_freq(rank: int) -> float:
     """Approximate the frequency of a tag given its rank"""
+    return math.exp(26.4284 * math.tanh(2.93505 * max(1, rank) ** (-0.136501)) - 11.492)
 def tag_freq_to_rank(freq: int) -> float:

joy CHANGED Viewed

@@ -18,6 +18,7 @@ import os
 import argparse
 import re
 import random
 from pathlib import Path
 from PIL import Image
 import pillow_jxl
@@ -32,7 +33,7 @@ from transformers import (
     PreTrainedTokenizerFast,
 )
 from torch import nn
-from e6db_reader import TagNormalizer
 CLIP_PATH = "google/siglip-so400m-patch14-384"
 MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
@@ -81,8 +82,6 @@ CAPTION_TYPE_MAP = {
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
-E6DB_DATA = Path(__file__).resolve().parent / "data"
 class ImageAdapter(nn.Module):
     """
     Custom image adapter module for processing CLIP vision outputs.
@@ -275,13 +274,13 @@ class JoyCaptionModel:
                      caption_type: str,
                      caption_tone: str,
                      caption_length: str | int,
-                     custom_prompt: str = None) -> str:
         """
         Process an input image and generate a caption based on specified parameters.
         """
         torch.cuda.empty_cache()
-        if caption_type == "custom" and custom_prompt:
             prompt_str = custom_prompt
         else:
             prompt_str = self._get_prompt_string(caption_type, caption_tone, caption_length)
@@ -470,7 +469,7 @@ def main():
         parser.error("--random-tags can only be used when --feed-from-tags is enabled")
     print('Loading e621 tag data')
-    tag_normalizer = TagNormalizer(E6DB_DATA)
     # Initialize and load models
     joy_caption_model = JoyCaptionModel()
@@ -495,54 +494,22 @@ def main():
             input_image = Image.open(image_path).convert("RGB")
             # Use custom prompt if specified
             if args.caption_type == "custom":
-                caption = joy_caption_model.process_image(
-                    input_image,
-                    "custom",
-                    args.caption_tone,
-                    args.caption_length,
-                    custom_prompt=args.custom_prompt
-                )
-            else:
-                # Check for --feed-from-tags
-                if args.feed_from_tags is not None:
-                    tag_file = find_tag_file(image_path)
-                    if tag_file:
-                        with open(tag_file, 'r', encoding='utf-8') as f:
-                            tags = f.read().strip().split(',')
-                        if args.random_tags is not None:
-                            # Randomly select tags if --random-tags is specified
-                            num_tags = min(args.random_tags, len(tags))
-                            tags = random.sample(tags, num_tags)
-                        elif args.feed_from_tags > 0:
-                            # Use specified number of tags if --feed-from-tags has a positive value
-                            tags = tags[:args.feed_from_tags]
-                        tag_string = ', '.join(tags)
-                        custom_prompt = f"Write a descriptive caption for this image in a formal tone. Use these tags as context clues to construct your caption: {tag_string}"
-                        caption = joy_caption_model.process_image(
-                            input_image,
-                            "custom",
-                            args.caption_tone,
-                            args.caption_length,
-                            custom_prompt=custom_prompt
-                        )
-                    else:
-                        caption = joy_caption_model.process_image(
-                            input_image,
-                            args.caption_type,
-                            args.caption_tone,
-                            args.caption_length
-                        )
-                else:
-                    caption = joy_caption_model.process_image(
-                        input_image,
-                        args.caption_type,
-                        args.caption_tone,
-                        args.caption_length
-                    )
             # Strip commas if the --dont-strip-commas flag is not set
             if not args.dont_strip_commas:
@@ -560,6 +527,155 @@ def main():
                 f.write(caption)
             print(f"Caption saved to {caption_file}")
 def find_tag_file(image_path):
     """
     Find the corresponding .txt file for the given image path.

 import argparse
 import re
 import random
+from collections import Counter
 from pathlib import Path
 from PIL import Image
 import pillow_jxl
     PreTrainedTokenizerFast,
 )
 from torch import nn
+from e6db_reader import TagNormalizer, TagSetNormalizer, tag_category2id, tag_rank_to_freq
 CLIP_PATH = "google/siglip-so400m-patch14-384"
 MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 class ImageAdapter(nn.Module):
     """
     Custom image adapter module for processing CLIP vision outputs.
                      caption_type: str,
                      caption_tone: str,
                      caption_length: str | int,
+                     custom_prompt: str | None = None) -> str:
         """
         Process an input image and generate a caption based on specified parameters.
         """
         torch.cuda.empty_cache()
+        if custom_prompt is not None:
             prompt_str = custom_prompt
         else:
             prompt_str = self._get_prompt_string(caption_type, caption_tone, caption_length)
         parser.error("--random-tags can only be used when --feed-from-tags is enabled")
     print('Loading e621 tag data')
+    tagset_normalizer = make_tagset_normalizer()
     # Initialize and load models
     joy_caption_model = JoyCaptionModel()
             input_image = Image.open(image_path).convert("RGB")
             # Use custom prompt if specified
+            custom_prompt = None
             if args.caption_type == "custom":
+                custom_prompt = args.custom_prompt
+            elif args.feed_from_tags is not None:
+                custom_prompt = prompt_from_tags(args, image_path, tagset_normalizer)
+            print(f"Custom prompt: {custom_prompt}")
+            continue
+            caption = joy_caption_model.process_image(
+                input_image,
+                args.caption_type,
+                args.caption_tone,
+                args.caption_length,
+                custom_prompt=custom_prompt
+            )
             # Strip commas if the --dont-strip-commas flag is not set
             if not args.dont_strip_commas:
                 f.write(caption)
             print(f"Caption saved to {caption_file}")
RE_PARENS_SUFFIX = re.compile(r"_\([^)]+\)$")
E6DB_DATA = Path(__file__).resolve().parent / "data"


def make_tagset_normalizer():
    """
    Build the TagSetNormalizer used to turn tag strings into integer ids.

    The normalizer is constructed from the `data` directory next to this
    script (E6DB_DATA), then extended with extra spellings for each tag
    through `map_inputs`. The value returned by `map_inputs` is returned
    to the caller.
    """
    # Presumably this loads the alias and implication tables found in
    # E6DB_DATA -- confirm against e6db_reader.
    normalizer = TagSetNormalizer(E6DB_DATA)
    category_of = normalizer.tag_normalizer.tag_categories
    artist_cat = tag_category2id["artist"]
    # Category id -> parenthesised suffix that may be appended to a tag name.
    suffix_for_cat = {
        tag_category2id[name]: f"_({name})"
        for name in ("character", "lore", "species", "copyright")
    }

    def extra_aliases(tag, tid):
        """Yield alternative spellings for `tag`, whose id is `tid` (or None)."""
        # Spelling with a trailing "_(...)" group removed. It may collide with
        # another tag; collisions are left to the `on_conflict` setting passed
        # to `map_inputs` below.
        bare = RE_PARENS_SUFFIX.sub("", tag)
        suffixed = bare != tag
        if suffixed:
            yield bare

        # -1 matches no known category, so id-less tags get no category aliases.
        category = -1 if tid is None else category_of[tid]
        if category == artist_cat:
            # Artists are offered both with and without the "by_" prefix.
            name = bare.removeprefix("by_")
            if name == bare:
                yield f"by_{name}"
                if not suffixed:
                    yield f"by_{name}_(artist)"
            else:
                yield name
                if not suffixed:
                    yield f"{name}_(artist)"
        elif not suffixed:
            suffix = suffix_for_cat.get(category)
            if suffix is not None:
                yield f"{bare}{suffix}"

        # Also accept the tag with every ':' written as '_' (e.g. aspect ratios).
        if ":" in tag:
            yield tag.replace(":", "_")

    return normalizer.map_inputs(extra_aliases, on_conflict="ignore")
def format_nl_list(l):
    """
    Join items into a natural-language enumeration.

    A single item is returned unchanged, two items are joined with " and ",
    and three or more are comma-separated with ", and " before the last one
    (e.g. "a, b, and c").

    Args:
        l: a non-empty iterable of strings.

    Returns:
        The formatted enumeration.

    Raises:
        ValueError: if `l` is empty.
    """
    # Materialise once so generators and tuples work as well as lists.
    items = list(l)
    if not items:
        # An explicit exception is used because `assert` is stripped under -O.
        raise ValueError("format_nl_list() requires at least one item")
    if len(items) == 1:
        return items[0]
    if len(items) == 2:
        return f"{items[0]} and {items[1]}"
    *head, last = items
    return ", ".join(head) + ", and " + last
TAG_SPECIES = tag_category2id['species']
TAG_CHARACTER = tag_category2id['character']
TAG_ARTIST = tag_category2id['artist']
TAG_COPYRIGHT = tag_category2id['copyright']
TAG_META = tag_category2id['meta']
# Tags whose estimated frequency falls below this value are dropped.
TAG_FREQ_THRESH = 0


def prompt_from_tags(args, image_path: Path, tagset_normalizer: TagSetNormalizer) -> str | None:
    """
    Build a captioning prompt from the tag file that accompanies an image.

    Tags are read from the comma-separated file located by `find_tag_file`,
    encoded with the normalizer and grouped by category. Artist, character,
    species and copyright tags become sentence fragments of the prompt (at
    most four names each); every other tag is listed as a context clue.
    Meta tags and tags implied by another tag of the image are left out.

    Args:
        args: parsed command-line arguments; `random_tags` and
            `feed_from_tags` are read to limit the number of context tags.
        image_path: path of the image being captioned.
        tagset_normalizer: normalizer providing tag encoding, categories and
            implications.

    Returns:
        The prompt string, or None when the image has no tag file.
    """
    tag_file = find_tag_file(image_path)
    if tag_file is None:
        return None
    with open(tag_file, 'r', encoding='utf-8') as f:
        tags = f.read().lower().split(',')

    tag_id_to_cat_id = tagset_normalizer.tag_normalizer.tag_categories
    encode = tagset_normalizer.tag_normalizer.encode

    # These lists contain tuples (freq, tag, tag_id)
    tag_by_category = {cat: [] for cat in [TAG_ARTIST, TAG_CHARACTER, TAG_COPYRIGHT, TAG_SPECIES]}
    other_tags = []
    implied = set()
    for tag in tags:
        tag = tag.strip()
        # A trailing comma or an empty file yields empty fragments; skip them
        # so they do not show up as blank entries in the prompt.
        if not tag:
            continue
        # Encode the tag into a numerical id
        tag_id = encode(tag.replace(' ', '_'))
        if tag_id is None:
            # Unknown tags are kept as plain context clues.
            other_tags.append((0, tag, None))
            # NOTE(review): tag_id is None here, so this lookup uses None as
            # the key. Confirm whether implications_rej is meant to be keyed
            # by the tag string instead.
            implied.update(tagset_normalizer.implications_rej.get(tag_id, ()))
            continue
        # Get the category of the tag
        cat_id = tag_id_to_cat_id[tag_id]
        # Skip meta tags
        if cat_id == TAG_META:
            continue
        implied.update(tagset_normalizer.implications.get(tag_id, ()))
        # Get the frequency of the tag (the tag id is passed as the rank)
        freq = tag_rank_to_freq(tag_id)
        if freq < TAG_FREQ_THRESH:
            continue
        tag_by_category.get(cat_id, other_tags).append((freq, tag, tag_id))

    # Drop implied tags and reduce the entries to sorted (freq, tag) pairs.
    other_tags = sorted((freq, tag) for freq, tag, tag_id in other_tags if tag_id not in implied)
    for cat_id, cat_list in tag_by_category.items():
        tag_by_category[cat_id] = sorted((freq, tag) for freq, tag, tag_id in cat_list if tag_id not in implied)

    if args.random_tags is not None:
        # Randomly select tags if --random-tags is specified. The sample is
        # drawn from the filtered (freq, tag) pairs, restricted to the first
        # 1.5x requested entries, and never asks for more than are available.
        pool = other_tags[:round(args.random_tags * 1.5)]
        num_tags = max(0, min(args.random_tags, len(pool)))
        other_tags = random.sample(pool, num_tags)
    elif args.feed_from_tags > 0:
        # Use specified number of tags if --feed-from-tags has a positive value
        other_tags = other_tags[:args.feed_from_tags]

    # Prepare sentence pieces
    artist_tag = tag_by_category[TAG_ARTIST]
    if artist_tag:
        artist_names = [tag.removeprefix('by ') for _, tag in artist_tag[:4]]
        artist_txt = f' by {format_nl_list(artist_names)}'
    else:
        artist_txt = ''

    character_tag = tag_by_category[TAG_CHARACTER]
    if character_tag:
        character_names = [tag for _, tag in character_tag[:4]]
        character_txt = f' named {format_nl_list(character_names)}'
    else:
        character_txt = ''

    species_tag = tag_by_category[TAG_SPECIES]
    if species_tag:
        # Use the article only when a single individual is being described.
        article = ' a' if len(character_tag) <= 1 and len(species_tag) <= 1 else ''
        species_names = [tag for _, tag in species_tag[:4]]
        species_txt = f' of{article} {format_nl_list(species_names)}'
    elif character_tag:
        noun = ' a character' if len(character_tag) <= 1 else ' characters'
        species_txt = f' of{noun}'
    else:
        species_txt = ''

    copyright_tag = tag_by_category[TAG_COPYRIGHT]
    if copyright_tag:
        copyright_names = [tag for _, tag in copyright_tag[:4]]
        copyright_txt = f' from {format_nl_list(copyright_names)}'
    else:
        copyright_txt = ''

    tag_string = ', '.join(tag for _, tag in other_tags)
    custom_prompt = f"Write a descriptive caption for this image{artist_txt}{species_txt}{character_txt}{copyright_txt} in a formal tone. Use these tags as context clues to construct your caption: {tag_string}"
    return custom_prompt
 def find_tag_file(image_path):
     """
     Find the corresponding .txt file for the given image path.