Spaces:

josuelmet
/

Boomerang_Stable_Diffusion

Runtime error

App Files Files Community

Boomerang_Stable_Diffusion / boomerang.py

josuelmet

Update boomerang.py

a3f6bed about 2 years ago

raw

history blame contribute delete

10.1 kB

	import inspect
	from PIL import Image
	import os
	import torch
	from torch import autocast
	from torchvision import transforms as T
	from types import MethodType
	from typing import List, Optional, Tuple, Union

	from diffusers import StableDiffusionPipeline
	from diffusers.models.unet_2d_condition import UNet2DConditionOutput
	from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput

	# When COLAB is True the "fp16" revision is requested with float16 weights;
	# otherwise from_pretrained is called without those two arguments.
	COLAB = False
	# The access token is read from the HF_TOKEN_SD environment variable
	# (os.environ.get yields None when it is not set).
	pipe = StableDiffusionPipeline.from_pretrained(
	    "CompVis/stable-diffusion-v1-4",
	    use_auth_token=os.environ.get('HF_TOKEN_SD'),
	    **({"revision": "fp16", "torch_dtype": torch.float16} if COLAB else {}),
	)

	#pipe = pipe.to('cuda')




	# Replacement forward pass for the U-Net
	def forward(
	    self,
	    sample: torch.FloatTensor,
	    timestep: Union[torch.Tensor, float, int],
	    encoder_hidden_states: torch.Tensor,
	    return_dict: bool = True,
	) -> Union[UNet2DConditionOutput, Tuple]:
	    """
	    Run the U-Net on `sample` at `timestep`, conditioned on `encoder_hidden_states`.

	    Args:
	        sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
	        timestep (`torch.Tensor` or `float` or `int`): timestep(s). A plain number or a 0-dim
	            tensor is turned into a 1-element tensor; the result is expanded to the batch size.
	        encoder_hidden_states (`torch.Tensor`): conditioning passed to the mid block and to
	            every down/up block whose `attentions` attribute exists and is not None
	        return_dict (`bool`, optional, defaults to `True`):
	            Whether to return a `UNet2DConditionOutput` instead of a plain tuple.

	    Returns:
	        `UNet2DConditionOutput` if `return_dict` is True, otherwise a 1-tuple whose only
	        element is the output sample tensor.
	    """
	    # 0. re-center the input when the model config asks for it
	    if self.config.center_input_sample:
	        sample = 2 * sample - 1.0

	    # 1. time embedding
	    if torch.is_tensor(timestep):
	        timesteps = timestep
	        if timesteps.dim() == 0:
	            # 0-dim tensor: cast to float32, add a leading axis, move to the sample's device
	            timesteps = timesteps.to(dtype=torch.float32)
	            timesteps = timesteps[None].to(device=sample.device)
	    else:
	        # plain Python number: wrap it in a 1-element long tensor on the sample's device
	        timesteps = torch.tensor([timestep], dtype=torch.long, device=sample.device)

	    # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
	    timesteps = timesteps.expand(sample.shape[0])

	    # the projected timesteps are cast to the sample's dtype before being embedded
	    projected = self.time_proj(timesteps)
	    emb = self.time_embedding(projected.to(sample.dtype))

	    # 2. pre-process
	    sample = self.conv_in(sample)

	    # 3. down path: collect every residual for the skip connections
	    skip_connections = (sample,)
	    for down_block in self.down_blocks:
	        block_kwargs = {"hidden_states": sample, "temb": emb}
	        if getattr(down_block, "attentions", None) is not None:
	            block_kwargs["encoder_hidden_states"] = encoder_hidden_states
	        sample, block_residuals = down_block(**block_kwargs)
	        skip_connections += block_residuals

	    # 4. mid
	    sample = self.mid_block(sample, emb, encoder_hidden_states=encoder_hidden_states)

	    # 5. up path: each block consumes as many trailing residuals as it has resnets
	    for up_block in self.up_blocks:
	        num_resnets = len(up_block.resnets)
	        block_residuals = skip_connections[-num_resnets:]
	        skip_connections = skip_connections[:-num_resnets]

	        block_kwargs = {
	            "hidden_states": sample,
	            "temb": emb,
	            "res_hidden_states_tuple": block_residuals,
	        }
	        if getattr(up_block, "attentions", None) is not None:
	            block_kwargs["encoder_hidden_states"] = encoder_hidden_states
	        sample = up_block(**block_kwargs)

	    # 6. post-process
	    # the output norm is applied to `sample` as-is, with no float32 cast around it
	    sample = self.conv_out(self.conv_act(self.conv_norm_out(sample)))

	    if return_dict:
	        return UNet2DConditionOutput(sample=sample)
	    return (sample,)


	def safety_forward(self, clip_input, images):
	    """Pass-through stand-in for a safety checker's forward.

	    Returns `images` untouched together with a constant ``False`` flag;
	    `clip_input` is accepted but never read.
	    """
	    flagged = False
	    return (images, flagged)


	# Replacement for the Stable Diffusion pipeline's call method
	@torch.no_grad()
	def call(
	    self,
	    prompt: Union[str, List[str]],
	    height: Optional[int] = 512,
	    width: Optional[int] = 512,
	    num_inference_steps: Optional[int] = 50,
	    guidance_scale: Optional[float] = 7.5,
	    eta: Optional[float] = 0.0,
	    generator: Optional[torch.Generator] = None,
	    latents: Optional[torch.FloatTensor] = None,
	    output_type: Optional[str] = "pil",
	    return_dict: bool = True,
	    percent_noise: float = 0.7,
	    **kwargs,
	):
	    """Run text-conditioned denoising, skipping the earliest (noisiest) scheduler steps.

	    Args:
	        prompt: a single prompt or a list of prompts; the batch size is 1 or ``len(prompt)``.
	        height, width: output size in pixels; both must be divisible by 8. The latents
	            have shape ``(batch, self.unet.in_channels, height // 8, width // 8)``.
	        num_inference_steps: passed to ``self.scheduler.set_timesteps``.
	        guidance_scale: classifier-free guidance is used only when this is > 1.0.
	        eta: forwarded to ``self.scheduler.step`` only if its signature has an ``eta``
	            parameter; ignored otherwise.
	        generator: random generator used when the initial latents are sampled here.
	        latents: optional starting latents; must have exactly the expected latents shape.
	            When omitted, they are drawn with ``torch.randn``.
	        output_type: ``"pil"`` converts the result with ``self.numpy_to_pil``; any other
	            value returns the images as they come back from the safety checker.
	        return_dict: return a ``StableDiffusionPipelineOutput`` if True, else a tuple.
	        percent_noise: every scheduler timestep ``t`` with ``t - 1 > 1000 * percent_noise``
	            is skipped, so denoising starts part-way through the schedule.
	        **kwargs: accepted and ignored.

	    Returns:
	        ``StableDiffusionPipelineOutput(images=..., nsfw_content_detected=...)`` or, when
	        ``return_dict`` is False, the tuple ``(images, has_nsfw_concept)``.

	    Raises:
	        ValueError: if ``prompt`` is neither ``str`` nor ``list``, if ``height`` or ``width``
	            is not divisible by 8, or if supplied ``latents`` have an unexpected shape.
	    """
	    if isinstance(prompt, str):
	        batch_size = 1
	    elif isinstance(prompt, list):
	        batch_size = len(prompt)
	    else:
	        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

	    if height % 8 != 0 or width % 8 != 0:
	        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

	    # get prompt text embeddings
	    text_input = self.tokenizer(
	        prompt,
	        padding="max_length",
	        max_length=self.tokenizer.model_max_length,
	        truncation=True,
	        return_tensors="pt",
	    )
	    text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]

	    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
	    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
	    # corresponds to doing no classifier free guidance.
	    do_classifier_free_guidance = guidance_scale > 1.0
	    if do_classifier_free_guidance:
	        # unconditional (empty-prompt) embeddings, padded to the same token length
	        max_length = text_input.input_ids.shape[-1]
	        uncond_input = self.tokenizer(
	            [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
	        )
	        uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]

	        # For classifier free guidance, we need to do two forward passes.
	        # Here we concatenate the unconditional and text embeddings into a single batch
	        # to avoid doing two forward passes
	        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

	    # get the initial random noise unless the user supplied it

	    # Unlike in other pipelines, latents need to be generated in the target device
	    # for 1-to-1 results reproducibility with the CompVis implementation.
	    # However this currently doesn't work in `mps`.
	    latents_device = "cpu" if self.device.type == "mps" else self.device
	    latents_shape = (batch_size, self.unet.in_channels, height // 8, width // 8)
	    if latents is None:
	        latents = torch.randn(
	            latents_shape,
	            generator=generator,
	            device=latents_device,
	        )
	    elif latents.shape != latents_shape:
	        raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
	    # noise drawn on the CPU (the "mps" case) and user-supplied latents both end up on the
	    # pipeline device
	    latents = latents.to(self.device)

	    # set timesteps
	    self.scheduler.set_timesteps(num_inference_steps)

	    # NOTE: no LMSDiscreteScheduler-specific sigma scaling is applied anywhere in this function.

	    # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
	    # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
	    # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
	    # and should be between [0, 1]
	    accepts_eta = "eta" in inspect.signature(self.scheduler.step).parameters
	    extra_step_kwargs = {"eta": eta} if accepts_eta else {}

	    for t in self.progress_bar(self.scheduler.timesteps):
	        # skip the steps that lie above the requested noise level
	        if t - 1 > 1000 * percent_noise:
	            continue

	        # expand the latents if we are doing classifier free guidance
	        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents

	        # predict the noise residual
	        noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

	        # perform guidance
	        if do_classifier_free_guidance:
	            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
	            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

	        # compute the previous noisy sample x_t -> x_t-1
	        latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

	    # scale and decode the image latents with vae
	    latents = 1 / 0.18215 * latents
	    image = self.vae.decode(latents).sample

	    # map the decoded values from [-1, 1] to [0, 1], then move channels last as a NumPy array
	    image = (image / 2 + 0.5).clamp(0, 1)
	    image = image.cpu().permute(0, 2, 3, 1).numpy()

	    # run safety checker
	    safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(self.device)
	    image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_checker_input.pixel_values)

	    if output_type == "pil":
	        image = self.numpy_to_pil(image)

	    if not return_dict:
	        return (image, has_nsfw_concept)

	    return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)


	# Install the overrides defined above on the loaded pipeline.
	# NOTE(review): only the U-Net forward patch is treated as COLAB-specific here; the
	# safety-checker and __call__ patches are applied unconditionally — confirm against the
	# original file's indentation.
	if COLAB:
	    # bind the custom forward to this particular U-Net instance
	    pipe.unet.forward = MethodType(forward, pipe.unet)

	# bind the pass-through safety checker to this pipeline's checker instance
	pipe.safety_checker.forward = MethodType(safety_forward, pipe.safety_checker)

	# __call__ is looked up on the class, so the replacement is set on the pipeline's type
	type(pipe).__call__ = call