Stable-Diffusion-1.5-LCM-ONNX-RKNN2 / run_onnx-lcm.py

Upload 38 files

1df274b verified 4 months ago

29.2 kB


	import argparse
	import json
	import time

	import PIL
	from diffusers import StableDiffusionPipeline
	from diffusers.pipelines.pipeline_utils import DiffusionPipeline
	from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
	from diffusers.schedulers import (
	LCMScheduler
	)
	from diffusers.schedulers.scheduling_utils import SchedulerMixin

	import gc
	import inspect

	import logging

	logging.basicConfig()
	logger = logging.getLogger(__name__)
	logger.setLevel(logging.INFO)

	import numpy as np
	import os

	import torch # Only used for `torch.from_tensor` in `pipe.scheduler.step()`
	from transformers import CLIPFeatureExtractor, CLIPTokenizer
	from typing import Callable, List, Optional, Union, Tuple
	from PIL import Image

	# from rknnlite.api import RKNNLite

	# class RKNN2Model:
	# """ Wrapper for running RKNPU2 models """

	# def __init__(self, model_path):

	# logger.info(f"Loading {model_path}")

	# start = time.time()
	# assert os.path.exists(model_path) and model_path.endswith(".rknn")
	# self.rknnlite = RKNNLite()
	# self.rknnlite.load_rknn(model_path)
	# self.rknnlite.init_runtime(core_mask=RKNNLite.NPU_CORE_AUTO) # Multi-core will cause kernel crash
	# load_time = time.time() - start
	# logger.info(f"Done. Took {load_time:.1f} seconds.")
	# self.modelname = model_path.split("/")[-1]
	# self.inference_time = 0

	# def __call__(self, **kwargs) -> List[np.ndarray]:
	# np.savez(f"{self.modelname}_input_{self.inference_time}.npz", **kwargs)
	# #print(kwargs)
	# input_list = [value for key, value in kwargs.items()]
	# for i, input in enumerate(input_list):
	# if isinstance(input, np.ndarray):
	# print(f"input {i} shape: {input.shape}")
	# results = self.rknnlite.inference(inputs=input_list)
	# for res in results:
	# print(f"output shape: {res.shape}")
	# return results

	import onnxruntime as ort

	class RKNN2Model:
	""" Wrapper for running ONNX models """

	def __init__(self, model_dir):
	logger.info(f"Loading {model_dir}")
	start = time.time()
	self.config = json.load(open(os.path.join(model_dir, "config.json")))
	assert os.path.exists(model_dir) and os.path.exists(os.path.join(model_dir, "model.onnx"))
	self.session = ort.InferenceSession(os.path.join(model_dir, "model.onnx"))
	load_time = time.time() - start
	logger.info(f"Done. Took {load_time:.1f} seconds.")
	self.modelname = model_dir.split("/")[-1]
	self.inference_time = 0

	def __call__(self, **kwargs) -> List[np.ndarray]:
	# np.savez(f"onnx_out/{self.modelname}_input_{self.inference_time}.npz", **kwargs)
	self.inference_time += 1
	results = self.session.run(None, kwargs)
	results_list = []
	for res in results:
	results_list.append(res)
	return results

	class RKNN2StableDiffusionPipeline(DiffusionPipeline):
	""" RKNN2 version of
	`diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline`
	"""

	def __init__(
	self,
	text_encoder: RKNN2Model,
	unet: RKNN2Model,
	vae_decoder: RKNN2Model,
	scheduler: LCMScheduler,
	tokenizer: CLIPTokenizer,
	force_zeros_for_empty_prompt: Optional[bool] = True,
	feature_extractor: Optional[CLIPFeatureExtractor] = None,
	text_encoder_2: Optional[RKNN2Model] = None,
	tokenizer_2: Optional[CLIPTokenizer] = None

	):
	super().__init__()

	# Register non-Core ML components of the pipeline similar to the original pipeline
	self.register_modules(
	tokenizer=tokenizer,
	scheduler=scheduler,
	feature_extractor=feature_extractor,
	)
	self.force_zeros_for_empty_prompt = force_zeros_for_empty_prompt
	self.safety_checker = None

	# Register Core ML components of the pipeline
	self.text_encoder = text_encoder
	self.text_encoder_2 = text_encoder_2
	self.tokenizer_2 = tokenizer_2
	self.unet = unet
	self.vae_decoder = vae_decoder

	VAE_DECODER_UPSAMPLE_FACTOR = 8

	# In PyTorch, users can determine the tensor shapes dynamically by default
	# In CoreML, tensors have static shapes unless flexible shapes were used during export
	# See https://coremltools.readme.io/docs/flexible-inputs
	latent_h, latent_w = 32, 32 # hallo1: FIXME: hardcoded value
	self.height = latent_h * VAE_DECODER_UPSAMPLE_FACTOR
	self.width = latent_w * VAE_DECODER_UPSAMPLE_FACTOR
	self.vae_scale_factor = VAE_DECODER_UPSAMPLE_FACTOR
	logger.info(
	f"Stable Diffusion configured to generate {self.height}x{self.width} images"
	)

	@staticmethod
	def postprocess(
	image: np.ndarray,
	output_type: str = "pil",
	do_denormalize: Optional[List[bool]] = None,
	):
	def numpy_to_pil(images: np.ndarray):
	"""
	Convert a numpy image or a batch of images to a PIL image.
	"""
	if images.ndim == 3:
	images = images[None, ...]
	images = (images * 255).round().astype("uint8")
	if images.shape[-1] == 1:
	# special case for grayscale (single channel) images
	pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
	else:
	pil_images = [Image.fromarray(image) for image in images]

	return pil_images

	def denormalize(images: np.ndarray):
	"""
	Denormalize an image array to [0,1].
	"""
	return np.clip(images / 2 + 0.5, 0, 1)

	if not isinstance(image, np.ndarray):
	raise ValueError(
	f"Input for postprocessing is in incorrect format: {type(image)}. We only support np array"
	)
	if output_type not in ["latent", "np", "pil"]:
	deprecation_message = (
	f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
	"`pil`, `np`, `pt`, `latent`"
	)
	logger.warning(deprecation_message)
	output_type = "np"

	if output_type == "latent":
	return image

	if do_denormalize is None:
	raise ValueError("do_denormalize is required for postprocessing")

	image = np.stack(
	[denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])], axis=0
	)
	image = image.transpose((0, 2, 3, 1))

	if output_type == "pil":
	image = numpy_to_pil(image)

	return image

	def _encode_prompt(
	self,
	prompt: Union[str, List[str]],
	num_images_per_prompt: int,
	do_classifier_free_guidance: bool,
	negative_prompt: Optional[Union[str, list]],
	prompt_embeds: Optional[np.ndarray] = None,
	negative_prompt_embeds: Optional[np.ndarray] = None,
	):
	r"""
	Encodes the prompt into text encoder hidden states.

	Args:
	prompt (`Union[str, List[str]]`):
	prompt to be encoded
	num_images_per_prompt (`int`):
	number of images that should be generated per prompt
	do_classifier_free_guidance (`bool`):
	whether to use classifier free guidance or not
	negative_prompt (`Optional[Union[str, list]]`):
	The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
	if `guidance_scale` is less than `1`).
	prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
	Pre-generated text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not
	provided, text embeddings will be generated from `prompt` input argument.
	negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
	Pre-generated negative text embeddings. Can be used to easily tweak text inputs, e.g. prompt
	weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
	argument.
	"""
	if isinstance(prompt, str):
	batch_size = 1
	elif isinstance(prompt, list):
	batch_size = len(prompt)
	else:
	batch_size = prompt_embeds.shape[0]

	if prompt_embeds is None:
	# get prompt text embeddings
	text_inputs = self.tokenizer(
	prompt,
	padding="max_length",
	max_length=self.tokenizer.model_max_length,
	truncation=True,
	return_tensors="np",
	)
	text_input_ids = text_inputs.input_ids
	untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids

	if not np.array_equal(text_input_ids, untruncated_ids):
	removed_text = self.tokenizer.batch_decode(
	untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
	)
	logger.warning(
	"The following part of your input was truncated because CLIP can only handle sequences up to"
	f" {self.tokenizer.model_max_length} tokens: {removed_text}"
	)

	prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]

	prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0)

	# get unconditional embeddings for classifier free guidance
	if do_classifier_free_guidance and negative_prompt_embeds is None:
	uncond_tokens: List[str]
	if negative_prompt is None:
	uncond_tokens = [""] * batch_size
	elif type(prompt) is not type(negative_prompt):
	raise TypeError(
	f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
	f" {type(prompt)}."
	)
	elif isinstance(negative_prompt, str):
	uncond_tokens = [negative_prompt] * batch_size
	elif batch_size != len(negative_prompt):
	raise ValueError(
	f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
	f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
	" the batch size of `prompt`."
	)
	else:
	uncond_tokens = negative_prompt

	max_length = prompt_embeds.shape[1]
	uncond_input = self.tokenizer(
	uncond_tokens,
	padding="max_length",
	max_length=max_length,
	truncation=True,
	return_tensors="np",
	)
	negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]

	if do_classifier_free_guidance:
	negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0)

	# For classifier free guidance, we need to do two forward passes.
	# Here we concatenate the unconditional and text embeddings into a single batch
	# to avoid doing two forward passes
	prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])

	return prompt_embeds

	# Copied from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L217
	def check_inputs(
	self,
	prompt: Union[str, List[str]],
	height: Optional[int],
	width: Optional[int],
	callback_steps: int,
	negative_prompt: Optional[str] = None,
	prompt_embeds: Optional[np.ndarray] = None,
	negative_prompt_embeds: Optional[np.ndarray] = None,
	):
	if height % 8 != 0 or width % 8 != 0:
	raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

	if (callback_steps is None) or (
	callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
	):
	raise ValueError(
	f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
	f" {type(callback_steps)}."
	)

	if prompt is not None and prompt_embeds is not None:
	raise ValueError(
	f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
	" only forward one of the two."
	)
	elif prompt is None and prompt_embeds is None:
	raise ValueError(
	"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
	)
	elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
	raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

	if negative_prompt is not None and negative_prompt_embeds is not None:
	raise ValueError(
	f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
	f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
	)

	if prompt_embeds is not None and negative_prompt_embeds is not None:
	if prompt_embeds.shape != negative_prompt_embeds.shape:
	raise ValueError(
	"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
	f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
	f" {negative_prompt_embeds.shape}."
	)

	# Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
	def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None):
	shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
	if isinstance(generator, list) and len(generator) != batch_size:
	raise ValueError(
	f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
	f" size of {batch_size}. Make sure the batch size matches the length of the generators."
	)

	if latents is None:
	if isinstance(generator, np.random.RandomState):
	latents = generator.randn(*shape).astype(dtype)
	elif isinstance(generator, torch.Generator):
	latents = torch.randn(*shape, generator=generator).numpy().astype(dtype)
	else:
	raise ValueError(
	f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got"
	f" {type(generator)}."
	)
	elif latents.shape != shape:
	raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")

	# scale the initial noise by the standard deviation required by the scheduler
	latents = latents * np.float64(self.scheduler.init_noise_sigma)

	return latents

	# Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264
	def __call__(
	self,
	prompt: Union[str, List[str]] = "",
	height: Optional[int] = None,
	width: Optional[int] = None,
	num_inference_steps: int = 4,
	original_inference_steps: int = None,
	guidance_scale: float = 8.5,
	num_images_per_prompt: int = 1,
	generator: Optional[Union[np.random.RandomState, torch.Generator]] = None,
	latents: Optional[np.ndarray] = None,
	prompt_embeds: Optional[np.ndarray] = None,
	output_type: str = "pil",
	return_dict: bool = True,
	callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
	callback_steps: int = 1,
	):
	r"""
	Function invoked when calling the pipeline for generation.

	Args:
	prompt (`Optional[Union[str, List[str]]]`, defaults to None):
	The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
	instead.
	height (`Optional[int]`, defaults to None):
	The height in pixels of the generated image.
	width (`Optional[int]`, defaults to None):
	The width in pixels of the generated image.
	num_inference_steps (`int`, defaults to 50):
	The number of denoising steps. More denoising steps usually lead to a higher quality image at the
	expense of slower inference.
	guidance_scale (`float`, defaults to 7.5):
	Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
	`guidance_scale` is defined as `w` of equation 2. of [Imagen
	Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
	1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
	usually at the expense of lower image quality.
	num_images_per_prompt (`int`, defaults to 1):
	The number of images to generate per prompt.
	generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):
	A np.random.RandomState to make generation deterministic.
	latents (`Optional[np.ndarray]`, defaults to `None`):
	Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
	generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
	tensor will ge generated by sampling using the supplied random `generator`.
	prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
	Pre-generated text embeddings. Can be used to easily tweak text inputs, e.g. prompt weighting. If not
	provided, text embeddings will be generated from `prompt` input argument.
	output_type (`str`, defaults to `"pil"`):
	The output format of the generate image. Choose between
	[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
	return_dict (`bool`, defaults to `True`):
	Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
	plain tuple.
	callback (Optional[Callable], defaults to `None`):
	A function that will be called every `callback_steps` steps during inference. The function will be
	called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
	callback_steps (`int`, defaults to 1):
	The frequency at which the `callback` function will be called. If not specified, the callback will be
	called at every step.
	guidance_rescale (`float`, defaults to 0.0):
	Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
	Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
	[Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
	Guidance rescale factor should fix overexposure when using zero terminal SNR.

	Returns:
	[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
	[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
	When returning a tuple, the first element is a list with the generated images, and the second element is a
	list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
	(nsfw) content, according to the `safety_checker`.
	"""
	height = height or self.unet.config["sample_size"] * self.vae_scale_factor
	width = width or self.unet.config["sample_size"] * self.vae_scale_factor

	# Don't need to get negative prompts due to LCM guided distillation
	negative_prompt = None
	negative_prompt_embeds = None

	# check inputs. Raise error if not correct
	self.check_inputs(
	prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
	)

	# define call parameters
	if isinstance(prompt, str):
	batch_size = 1
	elif isinstance(prompt, list):
	batch_size = len(prompt)
	else:
	batch_size = prompt_embeds.shape[0]

	if generator is None:
	generator = np.random.RandomState()

	prompt_embeds = self._encode_prompt(
	prompt,
	num_images_per_prompt,
	False,
	negative_prompt,
	prompt_embeds=prompt_embeds,
	negative_prompt_embeds=negative_prompt_embeds,
	)

	# set timesteps
	self.scheduler.set_timesteps(num_inference_steps, original_inference_steps=original_inference_steps)
	timesteps = self.scheduler.timesteps

	latents = self.prepare_latents(
	batch_size * num_images_per_prompt,
	self.unet.config["in_channels"],
	height,
	width,
	prompt_embeds.dtype,
	generator,
	latents,
	)

	bs = batch_size * num_images_per_prompt
	# get Guidance Scale Embedding
	w = np.full(bs, guidance_scale - 1, dtype=prompt_embeds.dtype)
	w_embedding = self.get_guidance_scale_embedding(
	w, embedding_dim=self.unet.config["time_cond_proj_dim"], dtype=prompt_embeds.dtype
	)

	# Adapted from diffusers to extend it for other runtimes than ORT
	timestep_dtype = np.int64

	num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
	for i, t in enumerate(self.progress_bar(timesteps)):
	timestep = np.array([t], dtype=timestep_dtype)
	noise_pred = self.unet(
	sample=latents,
	timestep=timestep,
	encoder_hidden_states=prompt_embeds,
	timestep_cond=w_embedding,
	)[0]

	# compute the previous noisy sample x_t -> x_t-1
	latents, denoised = self.scheduler.step(
	torch.from_numpy(noise_pred), t, torch.from_numpy(latents), return_dict=False
	)
	latents, denoised = latents.numpy(), denoised.numpy()

	# call the callback, if provided
	if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
	if callback is not None and i % callback_steps == 0:
	callback(i, t, latents)

	if output_type == "latent":
	image = denoised
	has_nsfw_concept = None
	else:
	denoised /= self.vae_decoder.config["scaling_factor"]
	# it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
	image = np.concatenate(
	[self.vae_decoder(latent_sample=denoised[i : i + 1])[0] for i in range(denoised.shape[0])]
	)
	# image, has_nsfw_concept = self.run_safety_checker(image)
	has_nsfw_concept = None # skip safety checker

	if has_nsfw_concept is None:
	do_denormalize = [True] * image.shape[0]
	else:
	do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

	image = self.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

	if not return_dict:
	return (image, has_nsfw_concept)

	return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)


	# Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264
	def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=None):
	"""
	See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

	Args:
	timesteps (`torch.Tensor`):
	generate embedding vectors at these timesteps
	embedding_dim (`int`, optional, defaults to 512):
	dimension of the embeddings to generate
	dtype:
	data type of the generated embeddings

	Returns:
	`torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
	"""
	w = w * 1000
	half_dim = embedding_dim // 2
	emb = np.log(10000.0) / (half_dim - 1)
	emb = np.exp(np.arange(half_dim, dtype=dtype) * -emb)
	emb = w[:, None] * emb[None, :]
	emb = np.concatenate([np.sin(emb), np.cos(emb)], axis=1)

	if embedding_dim % 2 == 1: # zero pad
	emb = np.pad(emb, [(0, 0), (0, 1)])

	assert emb.shape == (w.shape[0], embedding_dim)
	return emb

	def get_image_path(args, **override_kwargs):
	""" mkdir output folder and encode metadata in the filename
	"""
	out_folder = os.path.join(args.o, "_".join(args.prompt.replace("/", "_").rsplit(" ")))
	os.makedirs(out_folder, exist_ok=True)

	out_fname = f"randomSeed_{override_kwargs.get('seed', None) or args.seed}"

	out_fname += f"_LCM_"
	out_fname += f"_numInferenceSteps{override_kwargs.get('num_inference_steps', None) or args.num_inference_steps}"
	out_fname += "_onnx_"

	return os.path.join(out_folder, out_fname + ".png")


	def prepare_controlnet_cond(image_path, height, width):
	image = Image.open(image_path).convert("RGB")
	image = image.resize((height, width), resample=Image.LANCZOS)
	image = np.array(image).transpose(2, 0, 1) / 255.0
	return image


	def main(args):
	logger.info(f"Setting random seed to {args.seed}")

	# load scheduler from /scheduler/scheduler_config.json
	scheduler_config_path = os.path.join(args.i, "scheduler/scheduler_config.json")
	with open(scheduler_config_path, "r") as f:
	scheduler_config = json.load(f)
	user_specified_scheduler = LCMScheduler.from_config(scheduler_config)

	print("user_specified_scheduler", user_specified_scheduler)

	pipe = RKNN2StableDiffusionPipeline(
	text_encoder=RKNN2Model(os.path.join(args.i, "text_encoder")),
	unet=RKNN2Model(os.path.join(args.i, "unet")),
	vae_decoder=RKNN2Model(os.path.join(args.i, "vae_decoder")),
	scheduler=user_specified_scheduler,
	tokenizer=CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16"),
	)

	logger.info("Beginning image generation.")
	image = pipe(
	prompt=args.prompt,
	height=int(args.size.split("x")[0]),
	width=int(args.size.split("x")[1]),
	num_inference_steps=args.num_inference_steps,
	guidance_scale=args.guidance_scale,
	generator=np.random.RandomState(args.seed),
	)

	out_path = get_image_path(args)
	logger.info(f"Saving generated image to {out_path}")
	image["images"][0].save(out_path)


	if __name__ == "__main__":
	parser = argparse.ArgumentParser()

	parser.add_argument(
	"--prompt",
	required=True,
	help="The text prompt to be used for text-to-image generation.")
	parser.add_argument(
	"-i",
	required=True,
	help=("Path to model directory"))
	parser.add_argument("-o", required=True)
	parser.add_argument("--seed",
	default=93,
	type=int,
	help="Random seed to be able to reproduce results")
	parser.add_argument(
	"-s",
	"--size",
	default="256x256",
	type=str,
	help="Image size")
	parser.add_argument(
	"--num-inference-steps",
	default=4,
	type=int,
	help="The number of iterations the unet model will be executed throughout the reverse diffusion process")
	parser.add_argument(
	"--guidance-scale",
	default=7.5,
	type=float,
	help="Controls the influence of the text prompt on sampling process (0=random images)")

	args = parser.parse_args()
	main(args)