#!/usr/bin/env python
"""Gradio demo: Canny-edge-guided Stable Diffusion (SD 1.5 plus an SDXL path)."""
import math

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image
from diffusers import (
    AutoencoderKL,
    ControlNetModel,
    DiffusionPipeline,
    StableDiffusionControlNetImg2ImgPipeline,
    StableDiffusionControlNetPipeline,
    StableDiffusionXLControlNetPipeline,
    UniPCMultistepScheduler,
)
from diffusers.utils import load_image

# --- SD 1.5 pipelines --------------------------------------------------------

# Canny ControlNet + Realistic Vision for the base text-to-image pass.
canny_controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float16
)
canny_pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V3.0_VAE",
    controlnet=canny_controlnet,
    torch_dtype=torch.float16,
    use_safetensors=True,
)
canny_pipe.scheduler = UniPCMultistepScheduler.from_config(canny_pipe.scheduler.config)
canny_pipe.enable_model_cpu_offload()
canny_pipe.enable_xformers_memory_efficient_attention()

# Tile ControlNet + img2img for the optional high-resolution refinement pass.
tile_controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/control_v11f1e_sd15_tile", torch_dtype=torch.float16
)
canny_pipe_img2img = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V3.0_VAE",
    controlnet=tile_controlnet,
    torch_dtype=torch.float16,
    use_safetensors=True,
)
canny_pipe_img2img.enable_model_cpu_offload()
canny_pipe_img2img.enable_xformers_memory_efficient_attention()

# --- SDXL pipelines ----------------------------------------------------------

controlnet_xl = ControlNetModel.from_pretrained(
    "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
)
vae_xl = AutoencoderKL.from_pretrained(
    "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
)
pipe_xl = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=controlnet_xl,
    vae=vae_xl,
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
)
pipe_xl.scheduler = UniPCMultistepScheduler.from_config(pipe_xl.scheduler.config)
pipe_xl.enable_xformers_memory_efficient_attention()
pipe_xl.enable_model_cpu_offload()

# The refiner shares the base model's second text encoder and VAE to save memory.
refiner = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=pipe_xl.text_encoder_2,
    vae=pipe_xl.vae,
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
)
refiner.enable_xformers_memory_efficient_attention()
refiner.enable_model_cpu_offload()


def resize_image_output(im, width, height):
    """Resize a PIL image back to the requested (width, height)."""
    arr = np.array(im)
    arr = cv2.resize(arr, (width, height), interpolation=cv2.INTER_CUBIC)
    return Image.fromarray(arr)


def resize_image(im, max_size=590000):
    """Scale a numpy image so its pixel count lies between 262144 (512 * 512)
    and max_size, then snap both sides down to multiples of 8, as the
    diffusion models require."""
    height, width, _ = im.shape
    min_size = 262144
    if height * width > max_size:
        scale_ratio = math.sqrt((height * width) / max_size)
        new_height = int(height / scale_ratio)
        new_width = int(width / scale_ratio)
    elif height * width < min_size:
        scale_ratio = math.sqrt((height * width) / min_size)
        new_height = int(height / scale_ratio)
        new_width = int(width / scale_ratio)
    else:
        new_height, new_width = height, width
    new_height = (new_height // 8) * 8
    new_width = (new_width // 8) * 8
    # cv2.resize takes (width, height), i.e. (columns, rows).
    return cv2.resize(im, (new_width, new_height), interpolation=cv2.INTER_CUBIC)


def process_canny_tile(
    input_image,
    control_image,
    x,
    y,
    prompt,
    a_prompt,
    n_prompt,
    num_samples,
    image_resolution,
    ddim_steps,
    guess_mode,
    strength_conditioning,
    scale,
    seed,
    eta,
    low_threshold,
    high_threshold,
):
    """Tile-ControlNet img2img pass used to refine an upscaled result.
    The denoising settings are deliberately conservative (strength=0.3)
    so the pass adds detail without repainting the image."""
    return canny_pipe_img2img(
        prompt="",
        image=input_image,
        control_image=control_image,
        num_inference_steps=20,
        guidance_scale=4,
        strength=0.3,
        guess_mode=True,
        negative_prompt=n_prompt,
        num_images_per_prompt=1,
        eta=eta,
        generator=torch.Generator(device="cpu").manual_seed(seed),
    )
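
# A minimal sketch of how the tile pass above is intended to be chained:
# upscale the base result conventionally first, then let the tile ControlNet
# restore detail. The helper name and the 2x factor are illustrative
# assumptions, not part of the original app.
def upscale_then_refine(pil_image, n_prompt, seed, eta=0.0, factor=2):
    w, h = pil_image.size
    upscaled = pil_image.resize((w * factor, h * factor), Image.LANCZOS)
    out = process_canny_tile(
        upscaled, upscaled,          # same image serves as input and control
        h * factor, w * factor,
        "", "", n_prompt,
        1, None, 20, False,          # one sample, 20 steps, no guess mode
        1.0, 4, seed, eta, 100, 200,  # strength, scale, seed, eta, thresholds
    )
    return out.images[0]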
def process_canny(
    input_image,
    x,
    y,
    prompt,
    a_prompt,
    n_prompt,
    num_samples,
    image_resolution,
    ddim_steps,
    guess_mode,
    strength,
    scale,
    seed,
    eta,
    low_threshold,
    high_threshold,
):
    """Text-to-image pass conditioned on a Canny edge map (SD 1.5)."""
    return canny_pipe(
        prompt=", ".join([prompt, a_prompt]),
        image=input_image,
        height=x,
        width=y,
        num_inference_steps=ddim_steps,
        guidance_scale=scale,
        negative_prompt=n_prompt,
        num_images_per_prompt=num_samples,
        eta=eta,
        controlnet_conditioning_scale=strength,
        generator=torch.Generator(device="cpu").manual_seed(seed),
    )


def process_canny_sdxl(
    input_image,
    x,
    y,
    prompt,
    a_prompt,
    n_prompt,
    num_samples,
    image_resolution,
    ddim_steps,
    guess_mode,
    strength,
    scale,
    seed,
    eta,
    low_threshold,
    high_threshold,
):
    """SDXL variant: the base ControlNet pipeline emits latents, which the
    refiner then re-denoises from the 80% mark of its schedule."""
    latents = pipe_xl(
        prompt=", ".join([prompt, a_prompt]),
        image=input_image,
        height=x,
        width=y,
        num_inference_steps=ddim_steps,
        guidance_scale=scale,
        negative_prompt=n_prompt,
        num_images_per_prompt=num_samples,
        eta=eta,
        controlnet_conditioning_scale=strength,
        generator=torch.Generator(device="cpu").manual_seed(seed),
        output_type="latent",
    ).images
    return refiner(
        prompt=prompt,
        num_inference_steps=ddim_steps,
        num_images_per_prompt=num_samples,
        denoising_start=0.8,
        image=latents,
    )


def process(
    image,
    prompt,
    a_prompt,
    n_prompt,
    ddim_steps,
    strength,
    scale,
    seed,
    eta,
    low_threshold,
    high_threshold,
):
    # Gradio sliders may deliver floats; the pipelines need ints here.
    seed = int(seed)
    ddim_steps = int(ddim_steps)

    image = load_image(image)
    image = np.array(image)
    height_orig, width_orig, _ = image.shape
    image = resize_image(image)
    height, width, _ = image.shape

    # Build the 3-channel Canny edge map the ControlNet expects.
    image = cv2.Canny(image, low_threshold, high_threshold)
    image = image[:, :, None]
    image = np.concatenate([image, image, image], axis=2)
    image = Image.fromarray(image)

    result = process_canny(
        image,
        height,
        width,
        prompt,
        a_prompt,
        n_prompt,
        1,
        None,
        ddim_steps,
        False,
        float(strength),
        scale,
        seed,
        eta,
        low_threshold,
        high_threshold,
    )
    im = result.images[0]
    im = resize_image_output(im, width_orig, height_orig)

    highres = False  # optional tile-based refinement pass, disabled by default
    if highres:
        result_upscaled = process_canny_tile(
            im,
            im,
            height_orig,
            width_orig,
            prompt,
            a_prompt,
            n_prompt,
            1,
            None,
            ddim_steps,
            False,
            strength,
            scale,
            seed,
            eta,
            low_threshold,
            high_threshold,
        )
        im = result_upscaled.images[0]
    return im
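
# A minimal headless sketch of the full flow, bypassing the Gradio UI.
# "input.jpg", the prompts, and every numeric value below are illustrative
# assumptions; the function is defined here for reference and never called
# by the app itself.
def run_headless_example():
    image = Image.open("input.jpg")
    result = process(
        image,
        prompt="a photo of a modern living room",
        a_prompt="best quality, extremely detailed",
        n_prompt="lowres, bad anatomy, worst quality, low quality",
        ddim_steps=20,
        strength=1.0,
        scale=7.5,
        seed=42,
        eta=0.0,
        low_threshold=100,
        high_threshold=200,
    )
    result.save("output.png")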
demo = gr.Blocks().queue()
with demo:
    with gr.Row():
        gr.Markdown("## Control Stable Diffusion with Canny Edge Maps")
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="pil", label="Input Image")
            input_prompt = gr.Textbox(label="Prompt")
            run_button = gr.Button(value="Run")
            with gr.Accordion("Advanced Options", open=False):
                strength = gr.Slider(
                    label="Control Strength",
                    minimum=0.0,
                    maximum=2.0,
                    value=1.0,
                    step=0.01,
                )
                low_threshold = gr.Slider(
                    label="Canny low threshold",
                    minimum=1,
                    maximum=255,
                    value=100,
                    step=1,
                )
                high_threshold = gr.Slider(
                    label="Canny high threshold",
                    minimum=1,
                    maximum=255,
                    value=200,
                    step=1,
                )
                ddim_steps = gr.Slider(
                    label="Steps", minimum=1, maximum=100, value=20, step=1
                )
                scale = gr.Slider(
                    label="Guidance Scale",
                    minimum=0.1,
                    maximum=30.0,
                    value=7.5,  # default value was 9.0
                    step=0.1,
                )
                seed = gr.Slider(
                    label="Seed",
                    minimum=-1,
                    maximum=2147483647,
                    step=1,
                    randomize=True,
                )
                eta = gr.Number(label="eta (DDIM)", value=0.0)
                a_prompt = gr.Textbox(
                    label="Added Prompt", value="best quality, extremely detailed"
                )
                n_prompt = gr.Textbox(
                    label="Negative Prompt",
                    value="longbody, lowres, bad anatomy, bad hands, "
                    "missing fingers, extra digit, fewer digits, cropped, "
                    "worst quality, low quality",
                )
        with gr.Column():
            result = gr.Image(label="Output", type="pil")
    ips = [
        input_image,
        input_prompt,
        a_prompt,
        n_prompt,
        ddim_steps,
        strength,
        scale,
        seed,
        eta,
        low_threshold,
        high_threshold,
    ]
    run_button.click(fn=process, inputs=ips, outputs=[result])

demo.launch()
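
# Hedged launch variants: demo.launch() above uses Gradio's defaults
# (local-only, blocking). For LAN exposure or a temporary public URL,
# these real Gradio options exist and could replace the call above:
#
#     demo.launch(server_name="0.0.0.0", server_port=7860)  # serve on the LAN
#     demo.launch(share=True)                               # tunnel a public URL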