#!/usr/bin/env python from __future__ import annotations import argparse import functools import html import os import gradio as gr import huggingface_hub import numpy as np import onnxruntime as rt import pandas as pd import piexif import piexif.helper import PIL.Image from Utils import dbimutils TITLE = "WaifuDiffusion v1.4 Tags" DESCRIPTION = """ Demo for [SmilingWolf/wd-v1-4-vit-tagger](https://huggingface.co/SmilingWolf/wd-v1-4-vit-tagger) and [SmilingWolf/wd-v1-4-convnext-tagger](https://huggingface.co/SmilingWolf/wd-v1-4-convnext-tagger) with "ready to copy" prompt and a prompt analyzer. Modified from [NoCrypt/DeepDanbooru_string](https://huggingface.co/spaces/NoCrypt/DeepDanbooru_string) Modified from [hysts/DeepDanbooru](https://huggingface.co/spaces/hysts/DeepDanbooru) PNG Info code forked from [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) Example image by [ほし☆☆☆](https://www.pixiv.net/en/users/43565085) """ HF_TOKEN = os.environ["HF_TOKEN"] VIT_MODEL_REPO = "SmilingWolf/wd-v1-4-vit-tagger" CONV_MODEL_REPO = "SmilingWolf/wd-v1-4-convnext-tagger" MODEL_FILENAME = "model.onnx" LABEL_FILENAME = "selected_tags.csv" def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser() parser.add_argument("--score-slider-step", type=float, default=0.05) parser.add_argument("--score-threshold", type=float, default=0.35) parser.add_argument("--share", action="store_true") return parser.parse_args() def load_model(model_repo: str, model_filename: str) -> rt.InferenceSession: path = huggingface_hub.hf_hub_download( model_repo, model_filename, use_auth_token=HF_TOKEN ) model = rt.InferenceSession(path) return model def load_labels() -> list[str]: path = huggingface_hub.hf_hub_download( VIT_MODEL_REPO, LABEL_FILENAME, use_auth_token=HF_TOKEN ) df = pd.read_csv(path)["name"].tolist() return df def plaintext_to_html(text): text = ( "

" + "
\n".join([f"{html.escape(x)}" for x in text.split("\n")]) + "

" ) return text def predict( image: PIL.Image.Image, selected_model: str, score_threshold: float, models: dict, labels: list[str], ): rawimage = image model = models[selected_model] _, height, width, _ = model.get_inputs()[0].shape # Alpha to white image = image.convert("RGBA") new_image = PIL.Image.new("RGBA", image.size, "WHITE") new_image.paste(image, mask=image) image = new_image.convert("RGB") image = np.asarray(image) # PIL RGB to OpenCV BGR image = image[:, :, ::-1] image = dbimutils.make_square(image, height) image = dbimutils.smart_resize(image, height) image = image.astype(np.float32) image = np.expand_dims(image, 0) input_name = model.get_inputs()[0].name label_name = model.get_outputs()[0].name probs = model.run([label_name], {input_name: image})[0] labels = list(zip(labels, probs[0].astype(float))) # First 4 labels are actually ratings: pick one with argmax ratings_names = labels[:4] rating = dict(ratings_names) # Everything else is tags: pick any where prediction confidence > threshold tags_names = labels[4:] res = [x for x in tags_names if x[1] > score_threshold] res = dict(res) b = dict(sorted(res.items(), key=lambda item: item[1], reverse=True)) a = ( ", ".join(list(b.keys())) .replace("_", " ") .replace("(", "\(") .replace(")", "\)") ) c = ", ".join(list(b.keys())) items = rawimage.info geninfo = "" if "exif" in rawimage.info: exif = piexif.load(rawimage.info["exif"]) exif_comment = (exif or {}).get("Exif", {}).get(piexif.ExifIFD.UserComment, b"") try: exif_comment = piexif.helper.UserComment.load(exif_comment) except ValueError: exif_comment = exif_comment.decode("utf8", errors="ignore") items["exif comment"] = exif_comment geninfo = exif_comment for field in [ "jfif", "jfif_version", "jfif_unit", "jfif_density", "dpi", "exif", "loop", "background", "timestamp", "duration", ]: items.pop(field, None) geninfo = items.get("parameters", geninfo) info = f"""

PNG Info

""" for key, text in items.items(): info += ( f"""

{plaintext_to_html(str(key))}

{plaintext_to_html(str(text))}

""".strip() + "\n" ) if len(info) == 0: message = "Nothing found in the image." info = f"

{message}

" return (a, c, rating, res, info) def main(): args = parse_args() vit_model = load_model(VIT_MODEL_REPO, MODEL_FILENAME) conv_model = load_model(CONV_MODEL_REPO, MODEL_FILENAME) labels = load_labels() models = {"ViT": vit_model, "ConvNext": conv_model} func = functools.partial(predict, models=models, labels=labels) gr.Interface( fn=func, inputs=[ gr.Image(type="pil", label="Input"), gr.Radio(["ViT", "ConvNext"], value="ViT", label="Model"), gr.Slider( 0, 1, step=args.score_slider_step, value=args.score_threshold, label="Score Threshold", ), ], outputs=[ gr.Textbox(label="Output (string)"), gr.Textbox(label="Output (raw string)"), gr.Label(label="Rating"), gr.Label(label="Output (label)"), gr.HTML(), ], examples=[["power.jpg", "ViT", 0.5]], title=TITLE, description=DESCRIPTION, allow_flagging="never", ).launch( enable_queue=True, share=args.share, ) if __name__ == "__main__": main()