p1atdev committed
Commit a1db0e9
1 Parent(s): 7081246

feat: dartrs backend

Files changed (6)
  1. app.py +64 -75
  2. diffusion.py +4 -1
  3. lpw_pipeline_xl.py +0 -0
  4. requirements.txt +2 -1
  5. utils.py +15 -0
  6. v2.py +57 -76
app.py CHANGED
@@ -6,17 +6,28 @@ import gradio as gr
 from v2 import V2UI
 from diffusion import ImageGenerator
 from output import UpsamplingOutput
-from utils import QUALITY_TAGS, NEGATIVE_PROMPT, IMAGE_SIZE_OPTIONS, IMAGE_SIZES
+from utils import QUALITY_TAGS, NEGATIVE_PROMPT, IMAGE_SIZE_OPTIONS, PEOPLE_TAGS
 
 
 def animagine_xl_v3_1(output: UpsamplingOutput):
+    # separate people tags (e.g. 1girl)
+    people_tags = []
+    other_general_tags = []
+    for tag in output.general_tags.split(","):
+        tag = tag.strip()
+        if tag in PEOPLE_TAGS:
+            people_tags.append(tag)
+        else:
+            other_general_tags.append(tag)
+
     return ", ".join(
         [
             part.strip()
             for part in [
+                *people_tags,
                 output.character_tags,
                 output.copyright_tags,
+                *other_general_tags,
                 output.upsampled_tags,
                 (
                     output.rating_tag
@@ -35,59 +46,29 @@ def elapsed_time_format(elapsed_time: float) -> str:
 
 def parse_upsampling_output(
     upsampler: Callable[..., UpsamplingOutput],
-    image_generator: Callable[..., Image.Image],
 ):
-    def _parse_upsampling_output(
-        generate_image: bool, *args
-    ) -> tuple[str, str, Image.Image | None]:
+    def _parse_upsampling_output(*args) -> tuple[
+        str,
+        str,
+        dict,
+    ]:
         output = upsampler(*args)
 
         print(output)
 
-        if not generate_image:
-            return (
-                animagine_xl_v3_1(output),
-                elapsed_time_format(output.elapsed_time),
-                None,
-            )
-
-        # generate image
-        [
-            image_size_option,
-            quality_tags,
-            negative_prompt,
-            num_inference_steps,
-            guidance_scale,
-        ] = args[
-            7:
-        ]  # remove the first 7 arguments for upsampler
-        width, height = IMAGE_SIZES[image_size_option]
-        image = image_generator(
-            ", ".join([animagine_xl_v3_1(output), quality_tags]),
-            negative_prompt,
-            height,
-            width,
-            num_inference_steps,
-            guidance_scale,
-        )
-
         return (
             animagine_xl_v3_1(output),
             elapsed_time_format(output.elapsed_time),
-            image,
+            gr.update(
+                interactive=True,
+            ),
         )
 
     return _parse_upsampling_output
 
 
-def toggle_visible_output_image(generate_image: bool):
-    return gr.update(
-        visible=generate_image,
-    )
-
-
 def image_generation_config_ui():
-    with gr.Accordion(label="Image generation config", open=True) as accordion:
+    with gr.Accordion(label="Image generation config", open=False) as accordion:
         image_size = gr.Radio(
             label="Image size",
             choices=list(IMAGE_SIZE_OPTIONS.keys()),
@@ -142,7 +123,7 @@ def main():
     v2 = V2UI()
 
     print("Loading diffusion model...")
-    image_generator = ImageGenerator()
+    # image_generator = ImageGenerator()
     print("Loaded.")
 
     with gr.Blocks() as ui:
@@ -152,25 +133,25 @@ def main():
         with gr.Column():
             v2.ui()
 
-            generate_image_check = gr.Checkbox(
-                label="Also generate image", value=True
+        with gr.Column():
+            output_text = gr.TextArea(label="Output tags", interactive=False)
+
+            elapsed_time_md = gr.Markdown(label="Elapsed time", value="")
+
+            generate_image_btn = gr.Button(
+                value="Generate image with this prompt!",
             )
 
             accordion, image_generation_config_components = (
                 image_generation_config_ui()
             )
 
-        with gr.Column():
-            output_text = gr.TextArea(label="Output tags", interactive=False)
-
-            elapsed_time_md = gr.Markdown(label="Elapsed time", value="")
-
             output_image = gr.Gallery(
                 label="Output image",
                 columns=1,
                 preview=True,
                 show_label=False,
-                visible=True,
+                visible=False,
             )
 
             gr.Examples(
@@ -179,78 +160,86 @@ def main():
                     "original",
                     "",
                     "1girl, solo, blue theme, limited palette",
-                    "lax",
+                    "sfw",
+                    "ultra_wide",
                     "long",
-                    "1536x640",
+                    "lax",
                 ],
                 [
                     "",
                     "",
                     "4girls",
-                    "none",
+                    "sfw",
+                    "tall",
                     "very_long",
-                    "768x1344",
+                    "lax",
+                ],
+                [
+                    "original",
+                    "",
+                    "1girl, solo, upper body, looking at viewer, profile picture",
+                    "sfw",
+                    "square",
+                    "medium",
+                    "none",
                 ],
                 [
                     "",
                     "",
                     "no humans, scenery, spring (season)",
-                    "none",
+                    "general",
+                    "ultra_wide",
                     "medium",
-                    "1536x640",
+                    "lax",
                 ],
                 [
                     "sousou no frieren",
                     "frieren",
                     "1girl, solo",
-                    "none",
+                    "general",
+                    "tall",
                     "long",
-                    "768x1344",
+                    "lax",
                 ],
                 [
                     "honkai: star rail",
                     "silver wolf (honkai: star rail)",
                     "1girl, solo, annoyed",
-                    "none",
+                    "sfw",
+                    "tall",
                     "long",
-                    "768x1344",
+                    "lax",
                 ],
                 [
                     "bocchi the rock!",
                     "gotoh hitori, kita ikuyo, ijichi nijika, yamada ryo",
                     "4girls, multiple girls",
-                    "none",
+                    "sfw",
+                    "ultra_wide",
                     "very_long",
-                    "1344x768",
+                    "lax",
                 ],
                 [
                     "chuunibyou demo koi ga shitai!",
                     "takanashi rikka",
                     "1girl, solo",
-                    "none",
+                    "sfw",
+                    "ultra_tall",
                     "long",
-                    "640x1536",
+                    "lax",
                 ],
             ],
             inputs=[
-                *v2.get_inputs()[1:6],
-                image_generation_config_components[0],  # image size
+                *v2.get_inputs()[1:8],
             ],
         )
 
         v2.get_generate_btn().click(
-            parse_upsampling_output(v2.on_generate, image_generator.generate),
+            parse_upsampling_output(v2.on_generate),
             inputs=[
-                generate_image_check,
                 *v2.get_inputs(),
-                *image_generation_config_components,
             ],
-            outputs=[output_text, elapsed_time_md, output_image],
-        )
-        generate_image_check.change(
-            toggle_visible_output_image,
-            inputs=[generate_image_check],
-            outputs=[output_image],
+            outputs=[output_text, elapsed_time_md, generate_image_btn],
         )
 
     ui.launch()
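
The reworked animagine_xl_v3_1 above pulls people-count tags (e.g. "1girl") out of general_tags and places them at the front of the final prompt, ahead of the character and copyright tags, with the remaining general tags following. A minimal runnable sketch of that ordering, using made-up sample values and a hypothetical FakeOutput stand-in for output.UpsamplingOutput:

from dataclasses import dataclass

PEOPLE_TAGS = ["1girl"]  # one-element stand-in for utils.PEOPLE_TAGS

@dataclass
class FakeOutput:  # hypothetical stand-in for output.UpsamplingOutput
    general_tags: str = "1girl, solo"
    character_tags: str = "frieren"
    copyright_tags: str = "sousou no frieren"
    upsampled_tags: str = "elf, pointy ears"

output = FakeOutput()
people_tags, other_general_tags = [], []
for tag in output.general_tags.split(","):
    tag = tag.strip()
    (people_tags if tag in PEOPLE_TAGS else other_general_tags).append(tag)

# Same join order as the new animagine_xl_v3_1 (rating tag omitted here):
print(", ".join([*people_tags, output.character_tags, output.copyright_tags,
                 *other_general_tags, output.upsampled_tags]))
# -> 1girl, frieren, sousou no frieren, solo, elf, pointy ears
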
diffusion.py CHANGED
@@ -22,6 +22,9 @@ except ImportError:
 from utils import NEGATIVE_PROMPT
 
 
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+
 class ImageGenerator:
     pipe: StableDiffusionXLPipeline
 
@@ -41,7 +44,7 @@
         # sdpa
         self.pipe.unet.set_attn_processor(AttnProcessor2_0())
 
-        self.pipe.to("cuda")
+        self.pipe.to(device)
 
         try:
             self.pipe = torch.compile(self.pipe)
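
The only behavioral change in diffusion.py is device selection: instead of unconditionally calling self.pipe.to("cuda"), the pipeline moves to a module-level device that falls back to CPU when CUDA is unavailable. A minimal sketch of the same pattern, assuming diffusers is installed (the checkpoint name below is illustrative, not taken from this repo):

import torch
from diffusers import StableDiffusionXLPipeline

# Prefer the first CUDA device, fall back to CPU (same expression as diffusion.py).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
pipe.to(device)  # runs on CPU-only machines instead of raising on .to("cuda")
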
lpw_pipeline_xl.py ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -3,4 +3,5 @@ accelerate==0.29.2
 transformers==4.38.2
 optimum[onnxruntime]==1.19.1
 diffusers==0.27.2
-spaces==0.26.2
+spaces==0.26.2
+git+https://github.com/p1atdev/dartrs.git@33cdcfe77f236ba286ad60e10db8a5650e150fd2
utils.py CHANGED
@@ -22,6 +22,13 @@ IMAGE_SIZES = {
     "640x1536": (640, 1536),
 }
 
+ASPECT_RATIO_OPTIONS = {
+    "ultra_wide": "<|aspect_ratio:ultra_wide|>",
+    "wide": "<|aspect_ratio:wide|>",
+    "square": "<|aspect_ratio:square|>",
+    "tall": "<|aspect_ratio:tall|>",
+    "ultra_tall": "<|aspect_ratio:ultra_tall|>",
+}
 RATING_OPTIONS = {
     "sfw": "<|rating:sfw|>",
     "general": "<|rating:general|>",
@@ -42,3 +49,11 @@ IDENTITY_OPTIONS = {
     "lax": "<|identity:lax|>",
     "strict": "<|identity:strict|>",
 }
+
+
+PEOPLE_TAGS = [
+    *[f"1{x}" for x in ["girl", "boy", "other"]],
+    *[f"{i}girls" for i in range(2, 6)],
+    *[f"6+{x}s" for x in ["girl", "boy", "other"]],
+    "no humans",
+]
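
For reference, the three comprehensions in PEOPLE_TAGS evaluate to the flat list below; note that the 2-5 counts are generated only in their "girls" form:

>>> PEOPLE_TAGS
['1girl', '1boy', '1other',
 '2girls', '3girls', '4girls', '5girls',
 '6+girls', '6+boys', '6+others',
 'no humans']
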
v2.py CHANGED
@@ -1,7 +1,11 @@
 import time
 
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
+
+from dartrs.v2 import V2Model, MixtralModel
+from dartrs.dartrs import DartTokenizer
+from dartrs.utils import get_generation_config
+
 
 import gradio as gr
 from gradio.components import Component
@@ -16,31 +20,26 @@ except ImportError:
 
 
 from output import UpsamplingOutput
-from utils import IMAGE_SIZE_OPTIONS, RATING_OPTIONS, LENGTH_OPTIONS, IDENTITY_OPTIONS
+from utils import ASPECT_RATIO_OPTIONS, RATING_OPTIONS, LENGTH_OPTIONS, IDENTITY_OPTIONS
 
 ALL_MODELS = {
-    "dart-v2-llama-100m-sft": {
-        "repo": "p1atdev/dart-v2-llama-100m-sft",
-        "type": "sft",
-    },
-    "dart-v2-mistral-100m-sft": {
-        "repo": "p1atdev/dart-v2-mistral-100m-sft",
+    "dart-v2-mixtral-160m-sft-6": {
+        "repo": "p1atdev/dart-v2-mixtral-160m-sft-6",
         "type": "sft",
+        "class": MixtralModel,
     },
-    "dart-v2-mixtral-160m-sft": {
-        "repo": "p1atdev/dart-v2-mixtral-160m-sft",
+    "dart-v2-mixtral-160m-sft-8": {
+        "repo": "p1atdev/dart-v2-mixtral-160m-sft-8",
         "type": "sft",
+        "class": MixtralModel,
     },
 }
 
 
-def prepare_models(model_name: str):
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        torch_dtype=torch.bfloat16,
-        device_map="auto",
-    )
+def prepare_models(model_config: dict):
+    model_name = model_config["repo"]
+    tokenizer = DartTokenizer.from_pretrained(model_name)
+    model = model_config["class"].from_pretrained(model_name)
 
     return {
         "tokenizer": tokenizer,
@@ -48,21 +47,21 @@ def prepare_models(model_name: str):
 }
 
 
-def normalize_tags(tokenizer: PreTrainedTokenizerBase, tags: str):
-    """Just remove unk tokens."""
-    return ", ".join(
-        tokenizer.batch_decode(
-            [
-                token
-                for token in tokenizer.encode_plus(
-                    tags.strip(),
-                    return_tensors="pt",
-                ).input_ids[0]
-                if int(token) != tokenizer.unk_token_id
-            ],
-            skip_special_tokens=True,
-        )
-    )
+# def normalize_tags(tokenizer: PreTrainedTokenizerBase, tags: str):
+#     """Just remove unk tokens."""
+#     return ", ".join(
+#         tokenizer.batch_decode(
+#             [
+#                 token
+#                 for token in tokenizer.encode_plus(
+#                     tags.strip(),
+#                     return_tensors="pt",
+#                 ).input_ids[0]
+#                 if int(token) != tokenizer.unk_token_id
+#             ],
+#             skip_special_tokens=True,
+#         )
+#     )
 
 
 def compose_prompt(
@@ -88,46 +87,28 @@ def compose_prompt(
 @torch.no_grad()
 @spaces.GPU(duration=5)
 def generate_tags(
-    model,
-    tokenizer: PreTrainedTokenizerBase,
+    model: V2Model,
+    tokenizer: DartTokenizer,
     prompt: str,
 ):
-    print(  # debug
-        tokenizer.tokenize(
-            prompt,
-            add_special_tokens=False,
-        )
-    )
-    input_ids = tokenizer.encode_plus(prompt, return_tensors="pt").input_ids
     output = model.generate(
-        input_ids.to(model.device),
-        do_sample=True,
-        temperature=1,
-        top_p=0.9,
-        top_k=100,
-        num_beams=1,
-        num_return_sequences=1,
-        max_length=256,
+        get_generation_config(
+            prompt,
+            tokenizer=tokenizer,
+            temperature=1,
+            top_p=0.9,
+            top_k=100,
+            max_new_tokens=256,
+        ),
     )
 
-    # remove input tokens
-    pure_output_ids = output[0][len(input_ids[0]) :]
-
-    return ", ".join(
-        [
-            token
-            for token in tokenizer.batch_decode(
-                pure_output_ids, skip_special_tokens=True
-            )
-            if token.strip() != ""
-        ]
-    )
+    return output
 
 
 class V2UI:
     model_name: str | None = None
-    model: AutoModelForCausalLM
-    tokenizer: PreTrainedTokenizerBase
+    model: V2Model
+    tokenizer: DartTokenizer
 
     input_components: list[Component] = []
     generate_btn: gr.Button
@@ -139,25 +120,25 @@ class V2UI:
         character_tags: str,
         general_tags: str,
         rating_option: str,
-        # aspect_ratio_option: str,
+        aspect_ratio_option: str,
         length_option: str,
         identity_option: str,
-        image_size: str,  # this is from image generation config
+        # image_size: str,  # this is from image generation config
         *args,
     ) -> UpsamplingOutput:
         if self.model_name is None or self.model_name != model_name:
-            models = prepare_models(ALL_MODELS[model_name]["repo"])
+            models = prepare_models(ALL_MODELS[model_name])
            self.model = models["model"]
            self.tokenizer = models["tokenizer"]
            self.model_name = model_name
 
        # normalize tags
-        copyright_tags = normalize_tags(self.tokenizer, copyright_tags)
-        character_tags = normalize_tags(self.tokenizer, character_tags)
-        general_tags = normalize_tags(self.tokenizer, general_tags)
+        # copyright_tags = normalize_tags(self.tokenizer, copyright_tags)
+        # character_tags = normalize_tags(self.tokenizer, character_tags)
+        # general_tags = normalize_tags(self.tokenizer, general_tags)
 
        rating_tag = RATING_OPTIONS[rating_option]
-        aspect_ratio_tag = IMAGE_SIZE_OPTIONS[image_size]
+        aspect_ratio_tag = ASPECT_RATIO_OPTIONS[aspect_ratio_option]
        length_tag = LENGTH_OPTIONS[length_option]
        identity_tag = IDENTITY_OPTIONS[identity_option]
 
@@ -212,11 +193,11 @@ class V2UI:
             choices=list(RATING_OPTIONS.keys()),
             value="general",
         )
-        # input_aspect_ratio = gr.Radio(
-        #     label="Aspect ratio",
-        #     choices=["ultra_wide", "wide", "square", "tall", "ultra_tall"],
-        #     value="tall",
-        # )
+        input_aspect_ratio = gr.Radio(
+            label="Aspect ratio",
+            choices=["ultra_wide", "wide", "square", "tall", "ultra_tall"],
+            value="tall",
+        )
         input_length = gr.Radio(
             label="Length",
             choices=list(LENGTH_OPTIONS.keys()),
@@ -242,7 +223,7 @@ class V2UI:
             input_character,
             input_general,
             input_rating,
-            # input_aspect_ratio,
+            input_aspect_ratio,
             input_length,
             input_identity,
         ]
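
Taken together, v2.py swaps the transformers backend for dartrs: prepare_models() now loads a DartTokenizer plus the model class named in ALL_MODELS, and generate_tags() passes its sampling parameters through get_generation_config(). A condensed sketch assembled from those two functions (the prompt placeholder stands in for the string compose_prompt() builds):

from dartrs.v2 import MixtralModel
from dartrs.dartrs import DartTokenizer
from dartrs.utils import get_generation_config

# Condensed from prepare_models() and generate_tags() in this diff.
repo = "p1atdev/dart-v2-mixtral-160m-sft-8"
tokenizer = DartTokenizer.from_pretrained(repo)
model = MixtralModel.from_pretrained(repo)

prompt = "..."  # in the app this string comes from compose_prompt()

output = model.generate(
    get_generation_config(
        prompt,
        tokenizer=tokenizer,
        temperature=1,
        top_p=0.9,
        top_k=100,
        max_new_tokens=256,
    ),
)
print(output)  # generate_tags() returns this value as-is
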