Spaces:

multimodalart
/

diffusion

Runtime error

App Files Files Community

diffusion / app.py

apolinario

Initial diffusion

bc5a411 over 2 years ago

raw

history blame

5.23 kB

	import gc
	import math
	import sys

	from IPython import display
	import torch
	from torchvision import utils as tv_utils
	from torchvision.transforms import functional as TF
	from tqdm.notebook import trange, tqdm
	import gradio as gr
	sys.path.append('/content/v-diffusion-pytorch')
	from huggingface_hub import hf_hub_download

	from CLIP import clip
	from diffusion import get_model, sampling, utils

	# Fetch the CC12M classifier-free-guidance checkpoint via huggingface_hub;
	# the returned value is handed to torch.load below.
	cc12m_model = hf_hub_download(
	    repo_id="multimodalart/crowsonkb-v-diffusion-cc12m-1-cfg",
	    filename="cc12m_1_cfg.pth",
	)

	# Build the network and read the spatial size it reports (last two entries
	# of model.shape); run_all uses these to size the initial noise tensor.
	model = get_model('cc12m_1_cfg')()
	_, side_y, side_x = model.shape

	# Deserialize the weights onto the CPU first, then convert to half
	# precision, move to the GPU, switch to eval mode and disable gradients.
	model.load_state_dict(torch.load(cc12m_model, map_location='cpu'))
	model = model.half()
	model = model.cuda()
	model = model.eval()
	model = model.requires_grad_(False)

	# The CLIP model named by the diffusion model is loaded on the CPU; only the
	# first element of clip.load's result is kept.
	clip_model = clip.load(model.clip_model, jit=False, device='cpu')[0]

	def run_all(prompt, steps, n_images, weight):
	    """Generate images for a text prompt using classifier-free guidance.

	    Args:
	        prompt: Text that is tokenized with ``clip.tokenize`` and encoded by
	            the module-level ``clip_model``.
	        steps: Number of sampling timesteps. Rounded to ``int`` because UI
	            sliders may deliver floats.
	        n_images: Number of images generated in one batch. Rounded to
	            ``int`` for the same reason.
	        weight: Guidance scale applied in ``cfg_model_fn``: 0 returns the
	            unconditional prediction, 1 the conditional one, and larger
	            values extrapolate beyond the conditional prediction.

	    Returns:
	        A list containing one ``utils.to_pil_image`` result per sample
	        returned by ``sampling.plms_sample``.

	    Raises:
	        ValueError: If ``steps`` or ``n_images`` is below 1 after rounding.
	    """
	    import random

	    # torch.randn / torch.linspace need integer sizes; slider values can be
	    # floats (e.g. 2.0), which would otherwise raise a TypeError.
	    steps = int(round(steps))
	    n_images = int(round(n_images))
	    if steps < 1 or n_images < 1:
	        raise ValueError("steps and n_images must both be at least 1")

	    # A fresh seed per request, within the signed 32-bit range.
	    seed = random.randint(0, 2147483647)

	    # clip_model is not frozen where it is loaded in this file, so encode
	    # under no_grad to avoid attaching autograd history to the embedding.
	    with torch.no_grad():
	        target_embed = clip_model.encode_text(clip.tokenize(prompt)).float().cuda()

	    def cfg_model_fn(x, t):
	        """Run the model on an unconditional/conditional pair and blend them."""
	        n = x.shape[0]
	        # Duplicate the batch: first half is paired with a zero embedding
	        # (unconditional), second half with the prompt embedding.
	        x_in = x.repeat([2, 1, 1, 1])
	        t_in = t.repeat([2])
	        clip_embed_repeat = target_embed.repeat([n, 1])
	        clip_embed_in = torch.cat([torch.zeros_like(clip_embed_repeat), clip_embed_repeat])
	        v_uncond, v_cond = model(x_in, t_in, clip_embed_in).chunk(2, dim=0)
	        return v_uncond + (v_cond - v_uncond) * weight

	    # Release memory left over from earlier requests before allocating.
	    gc.collect()
	    torch.cuda.empty_cache()

	    torch.manual_seed(seed)
	    x = torch.randn([n_images, 3, side_y, side_x], device='cuda')
	    # steps + 1 evenly spaced points from 1 to 0, dropping the final 0.
	    t = torch.linspace(1, 0, steps + 1, device='cuda')[:-1]
	    step_list = utils.get_spliced_ddpm_cosine_schedule(t)
	    outs = sampling.plms_sample(cfg_model_fn, x, step_list, {})
	    return [utils.to_pil_image(out) for out in outs]


	##################### START GRADIO HERE ############################
	#image = gr.outputs.Image(type="pil", label="Your result")
	# Output component that displays the list of images returned by run_all.
	gallery = gr.Gallery(css={"height": "256px","width":"256px"})
	iface = gr.Interface(
	    fn=run_all,
	    # Order must match run_all's parameters: prompt, steps, n_images, weight.
	    inputs=[
	        gr.inputs.Textbox(label="Prompt - try adding increments to your prompt such as 'oil on canvas', 'a painting', 'a book cover'",default="chalk pastel drawing of a dog wearing a funny hat"),
	        gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate",default=50,maximum=250,minimum=1,step=1),
	        # step=1 keeps the image count integral; when no step is given the
	        # slider chooses its own from the range, which for 1-4 is fractional
	        # (NOTE(review): confirm against the installed gradio version).
	        gr.inputs.Slider(label="Number of images in parallel", default=2, maximum=4, minimum=1, step=1),
	        gr.inputs.Slider(label="Weight", default=5, maximum=15, minimum=0),
	    ],
	    outputs=gallery,
	    title="Generate images from text with V-Diffusion CC12M",
	)
	# Start serving the UI.
	iface.launch()