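# Gradio demo: Stable Diffusion text-to-image / img2img with paint-with-words sketch control,
# k-diffusion samplers, and user-uploaded textual-inversion embeddings and LoRA weights.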
import random
import tempfile
import time
import gradio as gr
import numpy as np
import torch
import math
import re
from diffusers import (
AutoencoderKL,
DDIMScheduler,
UNet2DConditionModel,
)
from modules.model import (
CrossAttnProcessor,
StableDiffusionPipeline,
)
from torchvision import transforms
from transformers import CLIPTokenizer, CLIPTextModel
from PIL import Image
from pathlib import Path
from safetensors.torch import load_file
import modules.safe as _
from modules.lora import LoRANetwork
models = [
# format: name, model_path, clip_skip
("AbyssOrangeMix2", "Korakoe/AbyssOrangeMix2-HF", 2),
("Basil Mix", "nuigurumi/basil_mix", 2),
("Pastal Mix", "andite/pastel-mix", 2),
("ACertainModel", "JosephusCheung/ACertainModel", 2),
]
base_name, base_model, clip_skip = models[0]
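# Sampler table: (display label, k-diffusion sampling function name, extra options such as the
# Karras sigma schedule or discard_next_to_last_sigma) forwarded to the custom pipeline.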
samplers_k_diffusion = [
("Euler a", "sample_euler_ancestral", {}),
("Euler", "sample_euler", {}),
("LMS", "sample_lms", {}),
("Heun", "sample_heun", {}),
("DPM2", "sample_dpm_2", {"discard_next_to_last_sigma": True}),
("DPM2 a", "sample_dpm_2_ancestral", {"discard_next_to_last_sigma": True}),
("DPM++ 2S a", "sample_dpmpp_2s_ancestral", {}),
("DPM++ 2M", "sample_dpmpp_2m", {}),
("DPM++ SDE", "sample_dpmpp_sde", {}),
("LMS Karras", "sample_lms", {"scheduler": "karras"}),
("DPM2 Karras", "sample_dpm_2", {"scheduler": "karras", "discard_next_to_last_sigma": True}),
("DPM2 a Karras", "sample_dpm_2_ancestral", {"scheduler": "karras", "discard_next_to_last_sigma": True}),
("DPM++ 2S a Karras", "sample_dpmpp_2s_ancestral", {"scheduler": "karras"}),
("DPM++ 2M Karras", "sample_dpmpp_2m", {"scheduler": "karras"}),
("DPM++ SDE Karras", "sample_dpmpp_sde", {"scheduler": "karras"}),
]
# samplers_diffusers = [
# ("DDIMScheduler", "diffusers.schedulers.DDIMScheduler", {})
# ("DDPMScheduler", "diffusers.schedulers.DDPMScheduler", {})
# ("DEISMultistepScheduler", "diffusers.schedulers.DEISMultistepScheduler", {})
# ]
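# Time the Space build, then load the base model components and assemble the custom
# StableDiffusionPipeline (fp16 weights, VAE swapped for stabilityai/sd-vae-ft-ema).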
start_time = time.time()
scheduler = DDIMScheduler.from_pretrained(
base_model,
subfolder="scheduler",
)
vae = AutoencoderKL.from_pretrained(
"stabilityai/sd-vae-ft-ema",
torch_dtype=torch.float16
)
text_encoder = CLIPTextModel.from_pretrained(
base_model,
subfolder="text_encoder",
torch_dtype=torch.float16,
)
tokenizer = CLIPTokenizer.from_pretrained(
base_model,
subfolder="tokenizer",
torch_dtype=torch.float16,
)
unet = UNet2DConditionModel.from_pretrained(
base_model,
subfolder="unet",
torch_dtype=torch.float16,
)
pipe = StableDiffusionPipeline(
text_encoder=text_encoder,
tokenizer=tokenizer,
unet=unet,
vae=vae,
scheduler=scheduler,
)
unet.set_attn_processor(CrossAttnProcessor())
pipe.set_clip_skip(clip_skip)
if torch.cuda.is_available():
pipe = pipe.to("cuda")
def get_model_list():
return models
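# Per-model caches so switching models in the UI downloads each UNet (and builds its
# LoRA wrapper) only once.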
unet_cache = {
base_name: unet
}
lora_cache = {
base_name: LoRANetwork(text_encoder, unet)
}
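# Resolve a model by its display name: lazily load its UNet and LoRANetwork on first use,
# then reset the attention processor and any previously applied LoRA weights.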
def get_model(name):
keys = [k[0] for k in models]
if name not in unet_cache:
if name not in keys:
raise ValueError(name)
else:
unet = UNet2DConditionModel.from_pretrained(
models[keys.index(name)][1],
subfolder="unet",
torch_dtype=torch.float16,
)
if torch.cuda.is_available():
unet.to("cuda")
unet_cache[name] = unet
lora_cache[name] = LoRANetwork(lora_cache[base_name].text_encoder_loras, unet)
g_unet = unet_cache[name]
g_lora = lora_cache[name]
g_unet.set_attn_processor(CrossAttnProcessor())
g_lora.reset()
return g_unet, g_lora
# precache on huggingface
# for model in get_model_list():
# get_model(model[0])
def error_str(error, title="Error"):
return (
f"""#### {title}
{error}"""
if error
else ""
)
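# Textual-inversion bookkeeping: remember the base embedding-table size and the original
# tokenizer hook so uploaded embeddings can be stripped out again after each generation.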
te_base_weight_length = text_encoder.get_input_embeddings().weight.data.shape[0]
original_prepare_for_tokenization = tokenizer.prepare_for_tokenization
def make_token_names(embs):
all_tokens = []
for name, vec in embs.items():
tokens = [f'emb-{name}-{i}' for i in range(len(vec))]
all_tokens.append(tokens)
return all_tokens
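# Map each uploaded embedding keyword to its placeholder tokens and substitute them into
# prompts; restore_all reverts the tokenizer and embedding table to their base state.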
def setup_tokenizer(embs):
reg_match = [re.compile(fr"(?:^|(?<=\s|,)){k}(?=,|\s|$)") for k in embs.keys()]
clip_keywords = [' '.join(s) for s in make_token_names(embs)]
def parse_prompt(prompt: str):
for m, v in zip(reg_match, clip_keywords):
prompt = m.sub(v, prompt)
        return prompt

    # Assumed wiring: route prompts through prepare_for_tokenization so embedding keywords
    # expand to their placeholder tokens, and return the flat token list to register.
    def prepare_for_tokenization(text, is_split_into_words=False, **kwargs):
        return original_prepare_for_tokenization(parse_prompt(text), is_split_into_words, **kwargs)
    tokenizer.prepare_for_tokenization = prepare_for_tokenization
    return [t for sublist in make_token_names(embs) for t in sublist]
def restore_all():
    global tokenizer
tokenizer.prepare_for_tokenization = original_prepare_for_tokenization
embeddings = text_encoder.get_input_embeddings()
text_encoder.get_input_embeddings().weight.data = embeddings.weight.data[:te_base_weight_length]
tokenizer = CLIPTokenizer.from_pretrained(
base_model,
subfolder="tokenizer",
torch_dtype=torch.float16,
)
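# Human-readable byte count, used for the VRAM usage log after each generation.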
def convert_size(size_bytes):
if size_bytes == 0:
return "0B"
size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
i = int(math.floor(math.log(size_bytes, 1024)))
p = math.pow(1024, i)
s = round(size_bytes / p, 2)
return "%s %s" % (s, size_name[i])
def inference(
prompt,
guidance,
steps,
width=512,
height=512,
seed=0,
neg_prompt="",
state=None,
g_strength=0.4,
img_input=None,
i2i_scale=0.5,
hr_enabled=False,
hr_method="Latent",
hr_scale=1.5,
hr_denoise=0.8,
sampler="DPM++ 2M Karras",
embs=None,
model=None,
lora_state=None,
lora_scale=None,
):
global pipe, unet, tokenizer, text_encoder
if seed is None or seed == 0:
seed = random.randint(0, 2147483647)
start_time = time.time()
restore_all()
generator = torch.Generator("cuda").manual_seed(int(seed))
local_unet, local_lora = get_model(model)
if lora_state is not None and lora_state != "":
local_lora.load(lora_state, lora_scale)
local_lora.to(local_unet.device, dtype=local_unet.dtype)
pipe.setup_unet(local_unet)
sampler_name, sampler_opt = None, None
for label, funcname, options in samplers_k_diffusion:
if label == sampler:
sampler_name, sampler_opt = funcname, options
if embs is not None and len(embs) > 0:
ti_embs = {}
for name, file in embs.items():
if str(file).endswith(".pt"):
loaded_learned_embeds = torch.load(file, map_location="cpu")
else:
loaded_learned_embeds = load_file(file, device="cpu")
loaded_learned_embeds = loaded_learned_embeds["string_to_param"]["*"]
ti_embs[name] = loaded_learned_embeds
if len(ti_embs) > 0:
tokens = setup_tokenizer(ti_embs)
added_tokens = tokenizer.add_tokens(tokens)
delta_weight = torch.cat([val for val in ti_embs.values()], dim=0)
assert added_tokens == delta_weight.shape[0]
text_encoder.resize_token_embeddings(len(tokenizer))
text_encoder.get_input_embeddings().weight.data[-delta_weight.shape[0]:] = delta_weight
config = {
"negative_prompt": neg_prompt,
"num_inference_steps": int(steps),
"guidance_scale": guidance,
"generator": generator,
"sampler_name": sampler_name,
"sampler_opt": sampler_opt,
"pww_state": state,
"pww_attn_weight": g_strength,
}
if img_input is not None:
ratio = min(height / img_input.height, width / img_input.width)
img_input = img_input.resize(
(int(img_input.width * ratio), int(img_input.height * ratio)), Image.LANCZOS
)
result = pipe.img2img(prompt, image=img_input, strength=i2i_scale, **config)
elif hr_enabled:
result = pipe.txt2img(
prompt,
width=width,
height=height,
upscale=True,
upscale_x=hr_scale,
upscale_denoising_strength=hr_denoise,
**config,
**latent_upscale_modes[hr_method],
)
else:
result = pipe.txt2img(prompt, width=width, height=height, **config)
# restore
if embs is not None and len(embs) > 0:
restore_all()
end_time = time.time()
vram_free, vram_total = torch.cuda.mem_get_info()
print(f"done: res={width}x{height}, step={steps}, time={round(end_time-start_time, 2)}s, vram_alloc={convert_size(vram_total-vram_free)}/{convert_size(vram_total)}")
return gr.Image.update(result[0][0], label=f"Initial Seed: {seed}")
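# Paint-with-words preview: assign each token a stable random color for the mixed mask image.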
color_list = []
def get_color(n):
for _ in range(n - len(color_list)):
color_list.append(tuple(np.random.random(size=3) * 256))
return color_list
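# Composite every token's mask into one RGBA preview, highlighting the currently selected token.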
def create_mixed_img(current, state, w=512, h=512):
w, h = int(w), int(h)
image_np = np.full([h, w, 4], 255)
colors = get_color(len(state))
idx = 0
for key, item in state.items():
if item["map"] is not None:
m = item["map"] < 255
alpha = 150
if current == key:
alpha = 200
image_np[m] = colors[idx] + (alpha,)
idx += 1
return image_np
# width.change(apply_new_res, inputs=[width, height, global_stats], outputs=[global_stats, sp, rendered])
def apply_new_res(w, h, state):
w, h = int(w), int(h)
for key, item in state.items():
if item["map"] is not None:
item["map"] = resize(item["map"], w, h)
update_img = gr.Image.update(value=create_mixed_img("", state, w, h))
return state, update_img
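# Parse the comma-separated token list, keeping the mask/weight of tokens already in the state.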
def detect_text(text, state, width, height):
if text is None or text == "":
return None, None, None, None
t = text.split(",")
new_state = {}
for item in t:
item = item.strip()
if item == "":
continue
if item in state:
new_state[item] = {
"map": state[item]["map"],
"weight": state[item]["weight"],
"mask_outsides": state[item]["mask_outsides"],
}
else:
new_state[item] = {
"map": None,
"weight": 0.5,
"mask_outsides": False
}
update = gr.Radio.update(choices=[key for key in new_state.keys()], value=None)
update_img = gr.update(value=create_mixed_img("", new_state, width, height))
update_sketch = gr.update(value=None, interactive=False)
return new_state, update_sketch, update, update_img
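# Resize and center-crop a drawn or uploaded mask to the target canvas size.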
def resize(img, w, h):
trs = transforms.Compose(
[
transforms.ToPILImage(),
transforms.Resize(min(h, w)),
transforms.CenterCrop((h, w)),
]
)
result = np.array(trs(img), dtype=np.uint8)
return result
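# On token selection change, clear the sketchpad and restore that token's saved weight and mask flag.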
def switch_canvas(entry, state, width, height):
    if entry is None:
        return None, 0.5, False, create_mixed_img("", state, width, height)
return (
gr.update(value=None, interactive=True),
gr.update(value=state[entry]["weight"] if entry in state else 0.5),
gr.update(value=state[entry]["mask_outsides"] if entry in state else False),
create_mixed_img(entry, state, width, height),
)
def apply_canvas(selected, draw, state, w, h):
if selected in state:
w, h = int(w), int(h)
state[selected]["map"] = resize(draw, w, h)
return state, gr.Image.update(value=create_mixed_img(selected, state, w, h))
def apply_weight(selected, weight, state):
if selected in state:
state[selected]["weight"] = weight
return state
def apply_option(selected, mask, state):
if selected in state:
state[selected]["mask_outsides"] = mask
return state
# inputs: sp2, radio, width, height, strength2, mask_outsides2, global_stats
def apply_image(image, selected, w, h, strength, mask, state):
    if selected in state:
        state[selected] = {
            "map": resize(image, w, h),
            "weight": strength,
            "mask_outsides": mask,
        }
return state, gr.Image.update(value=create_mixed_img(selected, state, w, h))
# [ti_state, lora_state, ti_vals, lora_vals, uploads]
def add_net(files, ti_state, lora_state):
if files is None:
return ti_state, "", lora_state, None
for file in files:
item = Path(file.name)
stripedname = str(item.stem).strip()
if item.suffix == ".pt":
state_dict = torch.load(file.name, map_location="cpu")
else:
state_dict = load_file(file.name, device="cpu")
if any("lora" in k for k in state_dict.keys()):
lora_state = file.name
else:
ti_state[stripedname] = file.name
return (
ti_state,
lora_state,
gr.Text.update(f"{[key for key in ti_state.keys()]}"),
gr.Text.update(f"{lora_state}"),
gr.Files.update(value=None),
)
# [ti_state, lora_state, ti_vals, lora_vals, uploads]
def clean_states(ti_state, lora_state):
return (
dict(),
None,
gr.Text.update(f""),
gr.Text.update(f""),
gr.File.update(value=None),
)
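# Interpolation settings for the hires-fix latent upscale step, keyed by the "Upscale method" dropdown.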
latent_upscale_modes = {
"Latent": {"upscale_method": "bilinear", "upscale_antialias": False},
"Latent (antialiased)": {"upscale_method": "bilinear", "upscale_antialias": True},
"Latent (bicubic)": {"upscale_method": "bicubic", "upscale_antialias": False},
"Latent (bicubic antialiased)": {
"upscale_method": "bicubic",
"upscale_antialias": True,
},
"Latent (nearest)": {"upscale_method": "nearest", "upscale_antialias": False},
"Latent (nearest-exact)": {
"upscale_method": "nearest-exact",
"upscale_antialias": False,
},
}
css = """
.finetuned-diffusion-div div{
display:inline-flex;
align-items:center;
gap:.8rem;
font-size:1.75rem;
padding-top:2rem;
}
.finetuned-diffusion-div div h1{
font-weight:900;
margin-bottom:7px
}
.finetuned-diffusion-div p{
margin-bottom:10px;
font-size:94%
}
.box {
float: left;
height: 20px;
width: 20px;
margin-bottom: 15px;
border: 1px solid black;
clear: both;
}
a{
text-decoration:underline
}
.tabs{
margin-top:0;
margin-bottom:0
}
#gallery{
min-height:20rem
}
.no-border {
border: none !important;
}
"""
with gr.Blocks(css=css) as demo:
gr.HTML(
f"""
<div class="finetuned-diffusion-div">
<div>
<h1>Demo for diffusion models</h1>
</div>
<p>Hso @ nyanko.sketch2img.gradio</p>
</div>
"""
)
global_stats = gr.State(value={})
with gr.Row():
with gr.Column(scale=55):
model = gr.Dropdown(
choices=[k[0] for k in get_model_list()],
label="Model",
value=base_name,
)
image_out = gr.Image(height=512)
# gallery = gr.Gallery(
# label="Generated images", show_label=False, elem_id="gallery"
# ).style(grid=[1], height="auto")
with gr.Column(scale=45):
with gr.Group():
with gr.Row():
with gr.Column(scale=70):
prompt = gr.Textbox(
label="Prompt",
value="loli cat girl, blue eyes, flat chest, solo, long messy silver hair, blue capelet, cat ears, cat tail, upper body",
show_label=True,
max_lines=4,
placeholder="Enter prompt.",
)
neg_prompt = gr.Textbox(
label="Negative Prompt",
value="bad quality, low quality, jpeg artifact, cropped",
show_label=True,
max_lines=4,
placeholder="Enter negative prompt.",
)
generate = gr.Button(value="Generate").style(
rounded=(False, True, True, False)
)
with gr.Tab("Options"):
with gr.Group():
# n_images = gr.Slider(label="Images", value=1, minimum=1, maximum=4, step=1)
with gr.Row():
guidance = gr.Slider(
label="Guidance scale", value=7.5, maximum=15
)
steps = gr.Slider(
label="Steps", value=25, minimum=2, maximum=50, step=1
)
with gr.Row():
width = gr.Slider(
label="Width", value=512, minimum=64, maximum=1024, step=64
)
height = gr.Slider(
label="Height", value=512, minimum=64, maximum=1024, step=64
)
sampler = gr.Dropdown(
value="DPM++ 2M Karras",
label="Sampler",
choices=[s[0] for s in samplers_k_diffusion],
)
seed = gr.Number(label="Seed (0 = random)", value=0)
with gr.Tab("Image to image"):
with gr.Group():
inf_image = gr.Image(
label="Image", height=256, tool="editor", type="pil"
)
inf_strength = gr.Slider(
label="Transformation strength",
minimum=0,
maximum=1,
step=0.01,
value=0.5,
)
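# Build the upscaler checkbox label, showing source and target resolutions when enabled.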
def res_cap(g, w, h, x):
if g:
return f"Enable upscaler: {w}x{h} to {int(w*x)}x{int(h*x)}"
else:
return "Enable upscaler"
with gr.Tab("Hires fix"):
with gr.Group():
hr_enabled = gr.Checkbox(label="Enable upscaler", value=False)
hr_method = gr.Dropdown(
[key for key in latent_upscale_modes.keys()],
value="Latent",
label="Upscale method",
)
hr_scale = gr.Slider(
label="Upscale factor",
minimum=1.0,
maximum=3,
step=0.1,
value=1.5,
)
hr_denoise = gr.Slider(
label="Denoising strength",
minimum=0.0,
maximum=1.0,
step=0.1,
value=0.8,
)
hr_scale.change(
lambda g, x, w, h: gr.Checkbox.update(
label=res_cap(g, w, h, x)
),
inputs=[hr_enabled, hr_scale, width, height],
outputs=hr_enabled,
)
hr_enabled.change(
lambda g, x, w, h: gr.Checkbox.update(
label=res_cap(g, w, h, x)
),
inputs=[hr_enabled, hr_scale, width, height],
outputs=hr_enabled,
)
with gr.Tab("Embeddings/Loras"):
ti_state = gr.State(dict())
lora_state = gr.State()
with gr.Group():
with gr.Row():
with gr.Column(scale=90):
ti_vals = gr.Text(label="Loaded embeddings")
with gr.Row():
with gr.Column(scale=90):
lora_vals = gr.Text(label="Loaded loras")
with gr.Row():
uploads = gr.Files(label="Upload new embeddings/lora")
with gr.Column():
lora_scale = gr.Slider(
label="Lora scale",
minimum=0,
maximum=2,
step=0.01,
value=1.0,
)
btn = gr.Button(value="Upload")
btn_del = gr.Button(value="Reset")
btn.click(
add_net,
inputs=[uploads, ti_state, lora_state],
outputs=[ti_state, lora_state, ti_vals, lora_vals, uploads],
)
btn_del.click(
clean_states,
inputs=[ti_state, lora_state],
outputs=[ti_state, lora_state, ti_vals, lora_vals, uploads],
)
# error_output = gr.Markdown()
gr.HTML(
f"""
<div class="finetuned-diffusion-div">
<div>
<h1>Paint with words</h1>
</div>
<p>
Uses the following formula: w = scale * token_weight_matrix * log(1 + sigma) * max(qk).
</p>
</div>
"""
)
with gr.Row():
with gr.Column(scale=55):
rendered = gr.Image(
invert_colors=True,
source="canvas",
interactive=False,
image_mode="RGBA",
)
with gr.Column(scale=45):
with gr.Group():
with gr.Row():
with gr.Column(scale=70):
g_strength = gr.Slider(
label="Weight scaling",
minimum=0,
maximum=0.8,
step=0.01,
value=0.4,
)
text = gr.Textbox(
lines=2,
interactive=True,
label="Token to Draw: (Separate by comma)",
)
radio = gr.Radio([], label="Tokens")
sk_update = gr.Button(value="Update").style(
rounded=(False, True, True, False)
)
# g_strength.change(lambda b: gr.update(f"Scaled additional attn: $w = {b} \log (1 + \sigma) \std (Q^T K)$."), inputs=g_strength, outputs=[g_output])
with gr.Tab("SketchPad"):
sp = gr.Image(
image_mode="L",
tool="sketch",
source="canvas",
interactive=False,
)
mask_outsides = gr.Checkbox(
label="Mask other areas",
value=False
)
strength = gr.Slider(
label="Token strength",
minimum=0,
maximum=0.8,
step=0.01,
value=0.5,
)
sk_update.click(
detect_text,
inputs=[text, global_stats, width, height],
outputs=[global_stats, sp, radio, rendered],
)
radio.change(
switch_canvas,
inputs=[radio, global_stats, width, height],
outputs=[sp, strength, mask_outsides, rendered],
)
sp.edit(
apply_canvas,
inputs=[radio, sp, global_stats, width, height],
outputs=[global_stats, rendered],
)
strength.change(
apply_weight,
inputs=[radio, strength, global_stats],
outputs=[global_stats],
)
mask_outsides.change(
apply_option,
inputs=[radio, mask_outsides, global_stats],
outputs=[global_stats],
)
with gr.Tab("UploadFile"):
sp2 = gr.Image(
image_mode="L",
source="upload",
shape=(512, 512),
)
mask_outsides2 = gr.Checkbox(
label="Mask other areas",
value=False
)
strength2 = gr.Slider(
label="Token strength",
minimum=0,
maximum=0.8,
step=0.01,
value=0.5,
)
apply_style = gr.Button(value="Apply")
apply_style.click(
apply_image,
inputs=[sp2, radio, width, height, strength2, mask_outsides2, global_stats],
outputs=[global_stats, rendered],
)
width.change(
apply_new_res,
inputs=[width, height, global_stats],
outputs=[global_stats, rendered],
)
height.change(
apply_new_res,
inputs=[width, height, global_stats],
outputs=[global_stats, rendered],
)
# color_stats = gr.State(value={})
# text.change(detect_color, inputs=[sp, text, color_stats], outputs=[color_stats, rendered])
# sp.change(detect_color, inputs=[sp, text, color_stats], outputs=[color_stats, rendered])
inputs = [
prompt,
guidance,
steps,
width,
height,
seed,
neg_prompt,
global_stats,
g_strength,
inf_image,
inf_strength,
hr_enabled,
hr_method,
hr_scale,
hr_denoise,
sampler,
ti_state,
model,
lora_state,
lora_scale,
]
outputs = [image_out]
prompt.submit(inference, inputs=inputs, outputs=outputs)
generate.click(inference, inputs=inputs, outputs=outputs)
print(f"Space built in {time.time() - start_time:.2f} seconds")
# demo.launch(share=True)
demo.launch(enable_queue=True, server_name="0.0.0.0", server_port=7860)