add code

Browse files

Files changed (10) hide show

.gitattributes +1 -0
lyraSD/__init__.py +1 -0
lyraSD/inference.py +89 -0
lyraSD/muse_trt/__init__.py +10 -0
lyraSD/muse_trt/libnvinfer_plugin.so +3 -0
lyraSD/muse_trt/models.py +815 -0
lyraSD/muse_trt/sd_img2img.py +365 -0
lyraSD/muse_trt/sd_text2img.py +290 -0
lyraSD/muse_trt/super.py +64 -0
lyraSD/muse_trt/utilities.py +536 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+lyraSD/muse_trt/libnvinfer_plugin.so filter=lfs diff=lfs merge=lfs -text

lyraSD/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .inference import LyraSD

lyraSD/inference.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import os
+from PIL import Image
+from .muse_trt import TRTStableDiffusionText2ImgPipeline
+from .muse_trt import TRTStableDiffusionImg2ImgPipeline
+import numpy as np
+class LyraSD(object):
+    def __init__(self, sd_mode, engine_dir,o_height=512, o_width=512, device="cuda:0"):
+        self.sd_mode = sd_mode
+        self.device = device
+        self.o_height = o_height
+        self.o_width = o_width
+        if self.sd_mode == "text2img":
+            self.pipeline = TRTStableDiffusionText2ImgPipeline(
+                engine_dir = engine_dir,
+                o_height = o_height,
+                o_width = o_width,
+                device=device
+            )
+        elif self.sd_mode == "img2img":
+            self.pipeline = TRTStableDiffusionImg2ImgPipeline(
+                engine_dir = engine_dir,
+                o_height = o_height,
+                o_width = o_width,
+                device=device
+            )
+        else:
+            raise ValueError("Invalid sd_mode: {}".format(self.sd_mode))
+    def inference(self, prompt,
+                  image=None,
+                  save_dir="./output",
+                  save_basename="sd-",
+                  negative_prompts='',
+                  strength=0.3,
+                  height=None,
+                  width =None,
+                  num_images_per_prompt=1,
+                  num_inference_steps=50,
+                  guidance_scale=7.5,
+                  use_super=False,
+                  ):
+        if self.sd_mode=="text2img" and prompt is None:
+            raise ValueError("prompt must be set on text2img mode")
+        if self.sd_mode=="img2img" and image is None:
+            raise ValueError("image must be set on img2img mode")
+        save_basename += f"{self.sd_mode}"
+        if height is None:
+            height = self.o_height
+        if width is None:
+            width = self.o_width
+        # this version model doen't support batch mode.
+        if not isinstance(prompt, list):
+            prompt = [prompt]
+        if len(prompt) > 1:
+            raise ValueError("current model dosen't support multi prompts")
+        if self.sd_mode=="text2img":
+            result_image = self.pipeline(prompt=prompt, negative_prompt=negative_prompts,
+                                    num_inference_steps= num_inference_steps,
+                                    num_images_per_prompt=num_images_per_prompt,
+                                    guidance_scale=guidance_scale,
+                                    height=height,
+                                    width=width,
+                                    use_super=use_super)
+        elif self.sd_mode=="img2img":
+            result_image = self.pipeline(prompt=prompt,
+                                    image=image,
+                                    negative_prompt=negative_prompts,
+                                    strength = strength,
+                                    num_inference_steps= num_inference_steps,
+                                    num_images_per_prompt=num_images_per_prompt,
+                                    guidance_scale=guidance_scale,
+                                    height=height,
+                                    width=width,
+                                    use_super=use_super)
+        for i in range(result_image.shape[0]):
+            result_image = Image.fromarray(np.uint8(result_image[i]))
+            result_image.save(os.path.join(save_dir, save_basename + "-{}.jpg".format(i)))
+        return result_image

lyraSD/muse_trt/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+# Package initialisation: load the bundled plugin shared library, then expose the pipeline classes.
+import ctypes
+import os
+# Directory containing this __init__.py; the plugin library is looked up next to it.
+current_workdir = os.path.dirname(__file__)
+# Load the shared library through ctypes before the pipeline modules below are imported.
+# NOTE(review): presumably this registers the custom TensorRT plugins the engines need - confirm.
+ctypes.cdll.LoadLibrary(os.path.join(current_workdir, "libnvinfer_plugin.so"))
+from .sd_img2img import TRTStableDiffusionImg2ImgPipeline
+from .sd_text2img import TRTStableDiffusionText2ImgPipeline
+from .super import SuperX4TRTInfer

lyraSD/muse_trt/libnvinfer_plugin.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:53cbcc8a47524652bb8e0399a2fbbcfc0b785f11bdc491bbb6a71e4b888ee124
+size 85198184

lyraSD/muse_trt/models.py ADDED Viewed

	@@ -0,0 +1,815 @@

+r"""models components"""
+from collections import OrderedDict
+from copy import deepcopy
+from typing import Any, Dict, Optional, Union
+import numpy as np
+import torch
+from cuda import cudart
+from diffusers import ControlNetModel
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from torch import nn
+from torch.nn import functional as F
+from transformers import CLIPTextModel
+class BaseModel():
+    def __init__(
+        self,
+        local_model_path=None,
+        hf_token=None,
+        text_maxlen=77,
+        embedding_dim=768,
+        fp16=False,
+        device='cuda',
+        verbose=True,
+        max_batch_size=16
+    ):
+        self.fp16 = fp16
+        self.device = device
+        self.verbose = verbose
+        self.hf_token = hf_token
+        self.local_model_path = local_model_path
+        # Defaults
+        self.text_maxlen = text_maxlen
+        self.embedding_dim = embedding_dim
+        self.min_batch = 1
+        self.max_batch = max_batch_size
+        self.min_latent_shape = 256 // 8  # min image resolution: 256x256
+        self.max_latent_shape = 1024 // 8  # max image resolution: 1024x1024
+    def get_model(self):
+        pass
+    def get_input_names(self):
+        pass
+    def get_output_names(self):
+        pass
+    def get_dynamic_axes(self):
+        return None
+    def get_sample_input(self, batch_size, image_height, image_width):
+        pass
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        return None
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        return None
+    def check_dims(self, batch_size, image_height, image_width):
+        assert batch_size >= self.min_batch and batch_size <= self.max_batch
+        assert image_height % 8 == 0 or image_width % 8 == 0
+        latent_height = image_height // 8
+        latent_width = image_width // 8
+        assert latent_height >= self.min_latent_shape and latent_height <= self.max_latent_shape
+        assert latent_width >= self.min_latent_shape and latent_width <= self.max_latent_shape
+        return (latent_height, latent_width)
+    def get_minmax_dims(self, batch_size, image_height, image_width, static_batch, static_shape):
+        min_batch = batch_size if static_batch else self.min_batch
+        max_batch = batch_size if static_batch else self.max_batch
+        latent_height = image_height // 8
+        latent_width = image_width // 8
+        min_latent_height = latent_height if static_shape else self.min_latent_shape
+        max_latent_height = latent_height if static_shape else self.max_latent_shape
+        min_latent_width = latent_width if static_shape else self.min_latent_shape
+        max_latent_width = latent_width if static_shape else self.max_latent_shape
+        return (min_batch, max_batch, min_latent_height, max_latent_height, min_latent_width, max_latent_width)
+class CLIP(BaseModel):
+    def get_model(self):
+        if self.hf_token is None and self.local_model_path is not None:
+            clip_model = CLIPTextModel.from_pretrained(
+                self.local_model_path, subfolder="text_encoder").to(self.device)
+        else:
+            clip_model = CLIPTextModel.from_pretrained(
+                "openai/clip-vit-large-patch14").to(self.device)
+        return clip_model
+    def get_input_names(self):
+        return ['input_ids']
+    def get_output_names(self):
+        return ['text_embeddings', 'pooler_output']
+    def get_dynamic_axes(self):
+        return {
+            'input_ids': {0: 'B'},
+            'text_embeddings': {0: 'B'}
+        }
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        self.check_dims(batch_size, image_height, image_width)
+        min_batch, max_batch, _, _, _, _ = self.get_minmax_dims(
+            batch_size, image_height, image_width, static_batch, static_shape)
+        return {
+            'input_ids': [(min_batch, self.text_maxlen), (batch_size, self.text_maxlen), (max_batch, self.text_maxlen)]
+        }
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        self.check_dims(batch_size, image_height, image_width)
+        return {
+            'input_ids': (batch_size, self.text_maxlen),
+            'text_embeddings': (batch_size, self.text_maxlen, self.embedding_dim)
+        }
+    def get_sample_input(self, batch_size, image_height, image_width):
+        self.check_dims(batch_size, image_height, image_width)
+        return torch.zeros(batch_size, self.text_maxlen, dtype=torch.int32, device=self.device)
+class UNet(BaseModel):
+    def get_model(self):
+        model_opts = {'revision': 'fp16',
+                      'torch_dtype': torch.float16} if self.fp16 else {}
+        print(model_opts)
+        if self.hf_token is None and self.local_model_path is not None:
+            unet_model = UNet2DConditionModel.from_pretrained(
+                self.local_model_path, subfolder="unet",
+                **model_opts
+            ).to(self.device)
+        else:
+            unet_model = UNet2DConditionModel.from_pretrained(
+                "CompVis/stable-diffusion-v1-4",
+                subfolder="unet",
+                use_auth_token=self.hf_token,
+                **model_opts).to(self.device)
+        return unet_model
+    def get_input_names(self):
+        return ['sample', 'timestep', 'encoder_hidden_states']
+    def get_output_names(self):
+        return ['latent']
+    def get_dynamic_axes(self):
+        return {
+            'sample': {0: '2B', 2: 'H', 3: 'W'},
+            'encoder_hidden_states': {0: '2B'},
+            'latent': {0: '2B', 2: 'H', 3: 'W'}
+        }
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        latent_height, latent_width = self.check_dims(
+            batch_size, image_height, image_width)
+        min_batch, max_batch, min_latent_height, max_latent_height, min_latent_width, max_latent_width = \
+            self.get_minmax_dims(batch_size, image_height,
+                                 image_width, static_batch, static_shape)
+        return {
+            'sample': [(2*min_batch, 4, min_latent_height, min_latent_width), (2*batch_size, 4, latent_height, latent_width), (2*max_batch, 4, max_latent_height, max_latent_width)],
+            'encoder_hidden_states': [(2*min_batch, self.text_maxlen, self.embedding_dim), (2*batch_size, self.text_maxlen, self.embedding_dim), (2*max_batch, self.text_maxlen, self.embedding_dim)]
+        }
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(
+            batch_size, image_height, image_width)
+        return {
+            'sample': (2*batch_size, 4, latent_height, latent_width),
+            'encoder_hidden_states': (2*batch_size, self.text_maxlen, self.embedding_dim),
+            'latent': (2*batch_size, 4, latent_height, latent_width)
+        }
+    def get_sample_input(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(
+            batch_size, image_height, image_width)
+        dtype = torch.float16 if self.fp16 else torch.float32
+        return (
+            torch.randn(2*batch_size, 4, latent_height, latent_width,
+                        dtype=torch.float32, device=self.device),
+            torch.tensor([1.], dtype=torch.float32, device=self.device),
+            torch.randn(2*batch_size, self.text_maxlen,
+                        self.embedding_dim, dtype=dtype, device=self.device)
+        )
+class VAEEncoderModule(nn.Module):
+    def __init__(self, local_model_path, device) -> None:
+        super().__init__()
+        self.vae = AutoencoderKL.from_pretrained(
+            local_model_path, subfolder="vae"
+        ).to(device)
+    def forward(self, x):
+        h = self.vae.encoder(x)
+        moments = self.vae.quant_conv(h)
+        return moments
+class VAEEncoder(BaseModel):
+    def get_model(self):
+        vae_encoder = VAEEncoderModule(self.local_model_path, self.device)
+        return vae_encoder
+    def get_input_names(self):
+        return ['images']
+    def get_output_names(self):
+        return ['latent']
+    def get_dynamic_axes(self):
+        return {
+            'images': {0: 'B', 2: '8H', 3: '8W'},
+            'latent': {0: 'B', 2: 'H', 3: 'W'}
+        }
+    def check_dims(self, batch_size, image_height, image_width):
+        assert batch_size >= self.min_batch and batch_size <= self.max_batch
+        assert image_height % 8 == 0 or image_width % 8 == 0
+        latent_height = image_height // 8
+        latent_width = image_width // 8
+        assert latent_height >= self.min_latent_shape and latent_height <= self.max_latent_shape
+        assert latent_width >= self.min_latent_shape and latent_width <= self.max_latent_shape
+        return (image_height, image_width)
+    def get_minmax_dims(self, batch_size, image_height, image_width, static_batch, static_shape):
+        min_batch = batch_size if static_batch else self.min_batch
+        max_batch = batch_size if static_batch else self.max_batch
+        min_image_height = image_height if static_shape else self.min_latent_shape
+        max_image_height = image_height if static_shape else self.max_latent_shape
+        min_image_width = image_width if static_shape else self.min_latent_shape
+        max_image_width = image_width if static_shape else self.max_latent_shape
+        return (min_batch, max_batch, min_image_height, max_image_height, min_image_width, max_image_width)
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        image_height, image_width = self.check_dims(
+            batch_size, image_height, image_width)
+        min_batch, max_batch, min_image_height, max_image_height, min_image_width, max_image_width = \
+            self.get_minmax_dims(batch_size, image_height,
+                                 image_width, static_batch, static_shape)
+        return {
+            'images': [(min_batch, 3, min_image_height, min_image_width), (batch_size, 3, image_height, image_width), (max_batch, 3, max_image_height, max_image_width)]
+        }
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        image_height, image_width = self.check_dims(
+            batch_size, image_height, image_width)
+        return {
+            'images': (batch_size, 3, image_height, image_width),
+            'latent': (batch_size, 8, image_height//8, image_width//8),
+        }
+    def get_sample_input(self, batch_size, image_height, image_width):
+        image_height, image_width = self.check_dims(
+            batch_size, image_height, image_width)
+        return torch.randn(batch_size, 3, image_height, image_width, dtype=torch.float32, device=self.device)
+    def optimize(self, onnx_graph, minimal_optimization=False):
+        enable_optimization = not minimal_optimization
+        # Decompose InstanceNormalization into primitive Ops
+        bRemoveInstanceNorm = enable_optimization
+        # Remove Cast Node to optimize Attention block
+        bRemoveCastNode = enable_optimization
+        # Insert GroupNormalization Plugin
+        bGroupNormPlugin = enable_optimization
+        opt = Optimizer(onnx_graph, verbose=self.verbose)
+        opt.info('VAE Encoder: original')
+        if bRemoveInstanceNorm:
+            num_instancenorm_replaced = opt.decompose_instancenorms()
+            opt.info('VAE Encoder: replaced ' +
+                     str(num_instancenorm_replaced)+' InstanceNorms')
+        if bRemoveCastNode:
+            num_casts_removed = opt.remove_casts()
+            opt.info('VAE Encoder: removed '+str(num_casts_removed)+' casts')
+        opt.cleanup()
+        opt.info('VAE Encoder: cleanup')
+        opt.fold_constants()
+        opt.info('VAE Encoder: fold constants')
+        opt.infer_shapes()
+        opt.info('VAE Encoder: shape inference')
+        if bGroupNormPlugin:
+            num_groupnorm_inserted = opt.insert_groupnorm_plugin()
+            opt.info('VAE Encoder: inserted '+str(num_groupnorm_inserted) +
+                     ' GroupNorm plugins')
+        onnx_opt_graph = opt.cleanup(return_onnx=True)
+        opt.info('VAE Encoder: final')
+        return onnx_opt_graph
+class VAEDecoder(BaseModel):
+    def get_model(self):
+        if self.hf_token is None and self.local_model_path is not None:
+            vae = AutoencoderKL.from_pretrained(
+                self.local_model_path, subfolder="vae"
+            ).to(self.device)
+        else:
+            vae = AutoencoderKL.from_pretrained(
+                "CompVis/stable-diffusion-v1-4",
+                subfolder="vae",
+                use_auth_token=self.hf_token).to(self.device)
+        vae.forward = vae.decode
+        return vae
+    def get_input_names(self):
+        return ['latent']
+    def get_output_names(self):
+        return ['images']
+    def get_dynamic_axes(self):
+        return {
+            'latent': {0: 'B', 2: 'H', 3: 'W'},
+            'images': {0: 'B', 2: '8H', 3: '8W'}
+        }
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        latent_height, latent_width = self.check_dims(
+            batch_size, image_height, image_width)
+        min_batch, max_batch, min_latent_height, max_latent_height, min_latent_width, max_latent_width = \
+            self.get_minmax_dims(batch_size, image_height,
+                                 image_width, static_batch, static_shape)
+        return {
+            'latent': [(min_batch, 4, min_latent_height, min_latent_width), (batch_size, 4, latent_height, latent_width), (max_batch, 4, max_latent_height, max_latent_width)]
+        }
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(
+            batch_size, image_height, image_width)
+        return {
+            'latent': (batch_size, 4, latent_height, latent_width),
+            'images': (batch_size, 3, image_height, image_width)
+        }
+    def get_sample_input(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(
+            batch_size, image_height, image_width)
+        return torch.randn(batch_size, 4, latent_height, latent_width, dtype=torch.float32, device=self.device)
+    def optimize(self, onnx_graph, minimal_optimization=False):
+        enable_optimization = not minimal_optimization
+        # Decompose InstanceNormalization into primitive Ops
+        bRemoveInstanceNorm = enable_optimization
+        # Remove Cast Node to optimize Attention block
+        bRemoveCastNode = enable_optimization
+        # Insert GroupNormalization Plugin
+        bGroupNormPlugin = enable_optimization
+        opt = Optimizer(onnx_graph, verbose=self.verbose)
+        opt.info('VAE Decoder: original')
+        if bRemoveInstanceNorm:
+            num_instancenorm_replaced = opt.decompose_instancenorms()
+            opt.info('VAE Decoder: replaced ' +
+                     str(num_instancenorm_replaced)+' InstanceNorms')
+        if bRemoveCastNode:
+            num_casts_removed = opt.remove_casts()
+            opt.info('VAE Decoder: removed '+str(num_casts_removed)+' casts')
+        opt.cleanup()
+        opt.info('VAE Decoder: cleanup')
+        opt.fold_constants()
+        opt.info('VAE Decoder: fold constants')
+        opt.infer_shapes()
+        opt.info('VAE Decoder: shape inference')
+        if bGroupNormPlugin:
+            num_groupnorm_inserted = opt.insert_groupnorm_plugin()
+            opt.info('VAE Decoder: inserted '+str(num_groupnorm_inserted) +
+                     ' GroupNorm plugins')
+        onnx_opt_graph = opt.cleanup(return_onnx=True)
+        opt.info('VAE Decoder: final')
+        return onnx_opt_graph
+class SuperModelX4(nn.Module):
+    def __init__(self, model_dir, scale=4, pre_pad=0):
+        super().__init__()
+        self.scale = scale
+        self.pre_pad = pre_pad
+        rrdb = RealESRGAN(model_dir=model_dir,
+                          model_name="RealESRGAN_x4plus_anime_6B").upsampler.model
+        self.rrdb = rrdb.eval()
+    def forward(self, x):
+        x = x / 255.
+        x = F.pad(x, (0, self.pre_pad, 0, self.pre_pad), 'reflect')
+        x = self.rrdb(x)
+        _, _, h, w = x.size()
+        x = x[:, :, 0:h-self.pre_pad * self.scale, 0:w-self.pre_pad*self.scale]
+        x = x.clamp(0, 1)
+        x = (x * 255).round()
+        return x
+class SuperResX4():
+    def __init__(
+        self,
+        local_model_path=None,
+        fp16=True,
+        device='cuda',
+        verbose=True,
+        max_batch_size=8
+    ):
+        self.fp16 = fp16
+        self.device = device
+        self.verbose = verbose
+        self.local_model_path = local_model_path
+        # Defaults
+        self.min_batch = 1
+        self.max_batch = max_batch_size
+        self.min_height = 64
+        self.max_height = 640
+        self.min_width = 64
+        self.max_width = 640
+    def get_model(self):
+        model = SuperModelX4(self.local_model_path, scale=4, pre_pad=0).to(device=self.device)
+        if self.fp16:
+            model = model.half()
+        return model
+    def get_input_names(self):
+        return ['input_image']
+    def get_output_names(self):
+        return ['output_image']
+    def get_dynamic_axes(self):
+        return {
+            'input_image': {0: 'B', },
+            'output_image': {0: 'B', }
+        }
+    def check_dims(self, batch_size, image_height, image_width):
+        assert batch_size >= self.min_batch and batch_size <= self.max_batch
+        return (image_height, image_width)
+    def get_minmax_dims(self, batch_size, image_height, image_width, static_batch, static_shape):
+        min_batch = batch_size if static_batch else self.min_batch
+        max_batch = batch_size if static_batch else self.max_batch
+        min_image_height = image_height if static_shape else self.min_height
+        max_image_height = image_height if static_shape else self.max_height
+        min_image_width = image_width if static_shape else self.min_width
+        max_image_width = image_width if static_shape else self.max_width
+        return (min_batch, max_batch, min_image_height, max_image_height, min_image_width, max_image_width)
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        image_height, image_width = self.check_dims(
+            batch_size, image_height, image_width)
+        min_batch, max_batch, min_image_height, max_image_height, min_image_width, max_image_width = \
+            self.get_minmax_dims(batch_size, image_height,
+                                 image_width, static_batch, static_shape)
+        return {
+            'input_image': [(min_batch, 3, min_image_height, min_image_width), (batch_size, 3, image_height, image_width), (max_batch, 3, max_image_height, max_image_width)]
+        }
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        image_height, image_width = self.check_dims(
+            batch_size, image_height, image_width)
+        return {
+            'input_image': (batch_size, 3, image_height, image_width),
+            'output_image': (batch_size, 3, image_height*4, image_width*4),
+        }
+    def get_sample_input(self, batch_size, image_height, image_width):
+        dtype = torch.float16 if self.fp16 else torch.float32
+        image_height, image_width = self.check_dims(
+            batch_size, image_height, image_width)
+        return torch.randn(batch_size, 3, image_height, image_width, dtype=dtype, device=self.device)
+    def optimize(self, onnx_graph, minimal_optimization=False):
+        enable_optimization = not minimal_optimization
+        # Decompose InstanceNormalization into primitive Ops
+        bRemoveInstanceNorm = enable_optimization
+        # Remove Cast Node to optimize Attention block
+        bRemoveCastNode = enable_optimization
+        # Insert GroupNormalization Plugin
+        bGroupNormPlugin = enable_optimization
+        opt = Optimizer(onnx_graph, verbose=self.verbose)
+        opt.info('SuperX4: original')
+        if bRemoveInstanceNorm:
+            num_instancenorm_replaced = opt.decompose_instancenorms()
+            opt.info('SuperX4: replaced ' +
+                     str(num_instancenorm_replaced)+' InstanceNorms')
+        if bRemoveCastNode:
+            num_casts_removed = opt.remove_casts()
+            opt.info('SuperX4: removed '+str(num_casts_removed)+' casts')
+        opt.cleanup()
+        opt.info('SuperX4: cleanup')
+        opt.fold_constants()
+        opt.info('SuperX4: fold constants')
+        opt.infer_shapes()
+        opt.info('SuperX4: shape inference')
+        if bGroupNormPlugin:
+            num_groupnorm_inserted = opt.insert_groupnorm_plugin()
+            opt.info('SuperX4: inserted '+str(num_groupnorm_inserted) +
+                     ' GroupNorm plugins')
+        onnx_opt_graph = opt.cleanup(return_onnx=True)
+        opt.info('SuperX4: final')
+        return onnx_opt_graph
+class FusedControlNetModule(nn.Module):
+    def __init__(self, base_model_dir, control_model_dir, fp16=True) -> None:
+        super().__init__()
+        self.device = 'cuda:0'
+        self.fp16 = fp16
+        model_opts = {'revision': 'fp16',
+                      'torch_dtype': torch.float16} if self.fp16 else {}
+        self.base = UNet2DConditionModel.from_pretrained(
+            base_model_dir, subfolder="unet",
+            **model_opts
+        ).eval().to(self.device)
+        self.control = ControlNetModel.from_pretrained(
+            control_model_dir,
+            **model_opts
+        ).eval().to(self.device)
+    def forward(self, sample, timestep, encoder_hidden_states, controlnet_cond):
+        controlnet_conditioning_scale: float = 1.0
+        down_block_res_samples, mid_block_res_sample = self.control(
+            sample,
+            timestep,
+            encoder_hidden_states=encoder_hidden_states,
+            controlnet_cond=controlnet_cond,
+            return_dict=False,
+        )
+        down_block_res_samples = [
+            down_block_res_sample * controlnet_conditioning_scale
+            for down_block_res_sample in down_block_res_samples
+        ]
+        mid_block_res_sample *= controlnet_conditioning_scale
+        # predict the noise residual
+        noise_pred = self.base(
+            sample,
+            timestep,
+            encoder_hidden_states=encoder_hidden_states,
+            down_block_additional_residuals=down_block_res_samples,
+            mid_block_additional_residual=mid_block_res_sample,
+        ).sample
+        return noise_pred
+class FusedControlNet(BaseModel):
+    def __init__(self, local_model_path=None, controlnet_model_path=None, hf_token=None, text_maxlen=77,
+                 embedding_dim=768, fp16=False, device='cuda', verbose=True, max_batch_size=16):
+        super().__init__(local_model_path, hf_token, text_maxlen, embedding_dim, fp16, device, verbose, max_batch_size)
+        # if controlnet_model_path is None:
+        #     raise ValueError("Must give controlnet_model_path for FusedControlNet to load control net")
+        self.controlnet_model_path = controlnet_model_path
+        self.min_height = 256
+        self.max_height = 1024
+        self.min_width = 256
+        self.max_width = 1024
+    def get_minmax_dims(self, batch_size, image_height, image_width, static_batch, static_shape):
+        r = list(super().get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape))
+        min_height = image_height if static_shape else self.min_height
+        max_height = image_height if static_shape else self.max_height
+        min_width = image_width if static_shape else self.min_width
+        max_width = image_width if static_shape else self.max_width
+        r.extend([min_height, max_height, min_width, max_width])
+        return r
+    def get_model(self):
+        model = FusedControlNetModule(
+            base_model_dir=self.local_model_path,
+            control_model_dir=self.controlnet_model_path,
+            fp16=self.fp16
+        )
+        return model
+    def get_input_names(self):
+        return ['sample', 'timestep', 'encoder_hidden_states', 'controlnet_cond']
+    def get_output_names(self):
+        return ['latent']
+    def get_dynamic_axes(self):
+        return {
+            'sample': {0: '2B', 2: 'H', 3: 'W'},
+            'encoder_hidden_states': {0: '2B'},
+            'controlnet_cond': {0: '2B', 2: '8H', 3: '8W'},  # controlnet_cond is 8X sample and lantent
+            'latent': {0: '2B', 2: 'H', 3: 'W'}
+        }
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        latent_height, latent_width = self.check_dims(
+            batch_size, image_height, image_width)
+        min_batch, max_batch, min_latent_height, max_latent_height, min_latent_width, max_latent_width, min_height, max_height, min_width, max_width = \
+            self.get_minmax_dims(batch_size, image_height,
+                                 image_width, static_batch, static_shape)
+        return {
+            'sample': [(2*min_batch, 4, min_latent_height, min_latent_width), (2*batch_size, 4, latent_height, latent_width), (2*max_batch, 4, max_latent_height, max_latent_width)],
+            'encoder_hidden_states': [(2*min_batch, self.text_maxlen, self.embedding_dim), (2*batch_size, self.text_maxlen, self.embedding_dim), (2*max_batch, self.text_maxlen, self.embedding_dim)],
+            'controlnet_cond': [(2*min_batch, 3, min_height, min_width), (2*batch_size, 3, image_height, image_width), (2*max_batch, 3, max_height, max_width)]
+        }
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(
+            batch_size, image_height, image_width)
+        return {
+            'sample': (2*batch_size, 4, latent_height, latent_width),
+            'encoder_hidden_states': (2*batch_size, self.text_maxlen, self.embedding_dim),
+            'controlnet_cond': (2*batch_size, 3, image_height, image_width),
+            'latent': (2*batch_size, 4, latent_height, latent_width)
+        }
+    def get_sample_input(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(
+            batch_size, image_height, image_width)
+        dtype = torch.float16 if self.fp16 else torch.float32
+        return (
+            torch.randn(2*batch_size, 4, latent_height, latent_width,
+                        dtype=torch.float32, device=self.device),   # sample
+            torch.tensor([1.], dtype=torch.float32, device=self.device),    # timestep
+            torch.randn(2*batch_size, self.text_maxlen,  # encoder_hidden_states
+                        self.embedding_dim, dtype=dtype, device=self.device),
+            torch.randn(2*batch_size, 3, image_height, image_width,
+                        dtype=torch.float32, device=self.device)    # controlnet_cond
+        )
+    def optimize(self, onnx_graph, minimal_optimization=False):
+        class_name = self.__class__.__name__
+        enable_optimization = not minimal_optimization
+        # Decompose InstanceNormalization into primitive Ops
+        bRemoveInstanceNorm = enable_optimization
+        # Remove Cast Node to optimize Attention block
+        bRemoveCastNode = enable_optimization
+        # Remove parallel Swish ops
+        bRemoveParallelSwish = enable_optimization
+        # Adjust the bias to be the second input to the Add ops
+        bAdjustAddNode = enable_optimization
+        # Change Resize node to take size instead of scale
+        bResizeFix = enable_optimization
+        # Common override for disabling all plugins below
+        bDisablePlugins = minimal_optimization
+        # Use multi-head attention Plugin
+        bMHAPlugin = True
+        # Use multi-head cross attention Plugin
+        bMHCAPlugin = True
+        # Insert GroupNormalization Plugin
+        bGroupNormPlugin = True
+        # Insert LayerNormalization Plugin
+        bLayerNormPlugin = True
+        # Insert Split+GeLU Plugin
+        bSplitGeLUPlugin = True
+        # Replace BiasAdd+ResidualAdd+SeqLen2Spatial with plugin
+        bSeqLen2SpatialPlugin = True
+        opt = Optimizer(onnx_graph, verbose=self.verbose)
+        opt.info(f'{class_name}: original')
+        if bRemoveInstanceNorm:
+            num_instancenorm_replaced = opt.decompose_instancenorms()
+            opt.info(f'{class_name}: replaced ' +
+                     str(num_instancenorm_replaced)+' InstanceNorms')
+        if bRemoveCastNode:
+            num_casts_removed = opt.remove_casts()
+            opt.info(f'{class_name}: removed '+str(num_casts_removed)+' casts')
+        if bRemoveParallelSwish:
+            num_parallel_swish_removed = opt.remove_parallel_swish()
+            opt.info(f'{class_name}: removed ' +
+                     str(num_parallel_swish_removed)+' parallel swish ops')
+        if bAdjustAddNode:
+            num_adjust_add = opt.adjustAddNode()
+            opt.info(f'{class_name}: adjusted '+str(num_adjust_add)+' adds')
+        if bResizeFix:
+            num_resize_fix = opt.resize_fix()
+            opt.info(f'{class_name}: fixed '+str(num_resize_fix)+' resizes')
+        opt.cleanup()
+        opt.info(f'{class_name}: cleanup')
+        opt.fold_constants()
+        opt.info(f'{class_name}: fold constants')
+        opt.infer_shapes()
+        opt.info(f'{class_name}: shape inference')
+        num_heads = 8
+        if bMHAPlugin and not bDisablePlugins:
+            num_fmha_inserted = opt.insert_fmha_plugin(num_heads)
+            opt.info(f'{class_name}: inserted '+str(num_fmha_inserted)+' fMHA plugins')
+        if bMHCAPlugin and not bDisablePlugins:
+            props = cudart.cudaGetDeviceProperties(0)[1]
+            sm = props.major * 10 + props.minor
+            num_fmhca_inserted = opt.insert_fmhca_plugin(num_heads, sm)
+            opt.info(f'{class_name}: inserted '+str(num_fmhca_inserted)+' fMHCA plugins')
+        if bGroupNormPlugin and not bDisablePlugins:
+            num_groupnorm_inserted = opt.insert_groupnorm_plugin()
+            opt.info(f'{class_name}: inserted '+str(num_groupnorm_inserted) +
+                     ' GroupNorm plugins')
+        if bLayerNormPlugin and not bDisablePlugins:
+            num_layernorm_inserted = opt.insert_layernorm_plugin()
+            opt.info(f'{class_name}: inserted '+str(num_layernorm_inserted) +
+                     ' LayerNorm plugins')
+        if bSplitGeLUPlugin and not bDisablePlugins:
+            num_splitgelu_inserted = opt.insert_splitgelu_plugin()
+            opt.info(f'{class_name}: inserted '+str(num_splitgelu_inserted) +
+                     ' SplitGeLU plugins')
+        if bSeqLen2SpatialPlugin and not bDisablePlugins:
+            num_seq2spatial_inserted = opt.insert_seq2spatial_plugin()
+            opt.info(f'{class_name}: inserted '+str(num_seq2spatial_inserted) +
+                     ' SeqLen2Spatial plugins')
+        onnx_opt_graph = opt.cleanup(return_onnx=True)
+        opt.info(f'{class_name}: final')
+        return onnx_opt_graph
+class ControlNetModule(nn.Module):
+    def __init__(self, control_model_dir, fp16=True) -> None:
+        super().__init__()
+        self.device = 'cuda:0'
+        self.fp16 = fp16
+        model_opts = {'revision': 'fp16',
+                      'torch_dtype': torch.float16} if self.fp16 else {}
+        self.control = ControlNetModel.from_pretrained(
+            control_model_dir,
+            **model_opts
+        ).eval().to(self.device)
+    def forward(self, sample, timestep, encoder_hidden_states, controlnet_cond):
+        controlnet_conditioning_scale: float = 1.0
+        down_block_res_samples, mid_block_res_sample = self.control(
+            sample,
+            timestep,
+            encoder_hidden_states=encoder_hidden_states,
+            controlnet_cond=controlnet_cond,
+            return_dict=False,
+        )
+        down_block_res_samples = [
+            down_block_res_sample * controlnet_conditioning_scale
+            for down_block_res_sample in down_block_res_samples
+        ]
+        mid_block_res_sample *= controlnet_conditioning_scale
+        # @vane: currently, only retun mid_blocks_res_sample: (B, 1280, height//8//8, width//8//8)
+        # down_block_res_samples is a tensor tuple that length is 12.
+        # it will be flatten to 12 nodes if we return the down_block_res_samples
+        return mid_block_res_sample
+class ControlNet(FusedControlNet):
+    def __init__(self, local_model_path=None, controlnet_model_path=None, hf_token=None, text_maxlen=77,
+                 embedding_dim=768, fp16=False, device='cuda', verbose=True, max_batch_size=16):
+        super().__init__(local_model_path, controlnet_model_path, hf_token,
+                         text_maxlen, embedding_dim, fp16, device, verbose, max_batch_size)
+    def get_model(self):
+        model = ControlNetModule(
+            control_model_dir=self.controlnet_model_path,
+            fp16=self.fp16
+        )
+        return model
+    def get_input_names(self):
+        return ['sample', 'timestep', 'encoder_hidden_states', 'controlnet_cond']
+    def get_output_names(self):
+        return ['mids']
+    def get_dynamic_axes(self):
+        return {
+            'sample': {0: '2B', 2: '8H', 3: '8W'},
+            'encoder_hidden_states': {0: '2B'},
+            'controlnet_cond': {0: '2B', 2: '16H', 3: '16W'},
+            'mids': {0: '2B', 2: 'H', 3: 'W'}
+        }
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(
+            batch_size, image_height, image_width)
+        return {
+            'sample': (2*batch_size, 4, latent_height, latent_width),
+            'encoder_hidden_states': (2*batch_size, self.text_maxlen, self.embedding_dim),
+            'controlnet_cond': (2*batch_size, 3, image_height, image_width),
+            'mids': (2*batch_size, 1280, latent_height//8, latent_width//8)
+        }

lyraSD/muse_trt/sd_img2img.py ADDED Viewed

	@@ -0,0 +1,365 @@

+r"""
+StableDiffusion Img2Img Pipeline by TensorRT.
+It also includes the SuperResolutionX4 TensorRT engine.
+Inspired by: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+            https://developer.nvidia.com/tensorrt
+"""
+import inspect
+import os
+from typing import List, Optional, Union
+import numpy as np
+import PIL.Image
+import tensorrt as trt
+import torch
+import time
+from diffusers import AutoencoderKL
+from diffusers.schedulers import DPMSolverMultistepScheduler
+from diffusers.models.vae import DiagonalGaussianDistribution
+from diffusers.utils import PIL_INTERPOLATION, randn_tensor
+from polygraphy import cuda
+from transformers import CLIPTokenizer
+from .models import CLIP, UNet, VAEDecoder, VAEEncoder
+from .super import SuperX4TRTInfer
+from .utilities import TRT_LOGGER, Engine
+def preprocess(image):
+    if isinstance(image, torch.Tensor):
+        return image
+    elif isinstance(image, PIL.Image.Image):
+        image = [image]
+    if isinstance(image[0], PIL.Image.Image):
+        w, h = image[0].size
+        w, h = map(lambda x: x - x % 8, (w, h))  # resize to integer multiple of 8
+        image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
+        image = np.concatenate(image, axis=0)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image.transpose(0, 3, 1, 2)
+        image = 2.0 * image - 1.0
+        image = torch.from_numpy(image)
+    elif isinstance(image[0], torch.Tensor):
+        image = torch.cat(image, dim=0)
+    return image
+class TRTStableDiffusionImg2ImgPipeline:
+    def __init__(self, engine_dir: str, o_height: int = 1300, o_width: int = 750, device: str = 'cuda:0'):
+        self.device = torch.device(device)
+        super().__init__()
+        self.vae = AutoencoderKL.from_pretrained(
+            os.path.join(engine_dir, 'vae'),
+            torch_dtype=torch.float16
+        ).to(self.device)
+        self.tokenizer = CLIPTokenizer.from_pretrained(
+            os.path.join(engine_dir, 'tokenizer')
+        )
+        self.scheduler = DPMSolverMultistepScheduler.from_pretrained(
+            os.path.join(engine_dir, 'scheduler')
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.trt_torch_models_cls = {
+            'clip': CLIP(),
+            'unet_fp16': UNet(),
+            'vae-encoder': VAEEncoder(),
+            'vae-decoder': VAEDecoder()
+        }
+        self.engine = {}
+        # Build engines
+        for model_name, _ in self.trt_torch_models_cls.items():
+            engine = Engine(model_name, engine_dir)
+            self.engine[model_name] = engine
+        # Separate iteration to activate engines
+        for model_name, _ in self.trt_torch_models_cls.items():
+            self.engine[model_name].activate()
+        self.stream = cuda.Stream()
+        self.super = SuperX4TRTInfer(
+            engine_dir,
+            model_name='superx4.plan',
+            fp16=True,
+            o_height=o_height,
+            o_width=o_width
+        )
+    def runEngine(self, model_name, feed_dict):
+        engine = self.engine[model_name]
+        return engine.infer(feed_dict, self.stream)
+    def _torch_decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        image = (image * 255).round()
+        return image
+    def _trt_decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        sample_inp = cuda.DeviceView(
+            ptr=latents.data_ptr(), shape=latents.shape, dtype=np.float32)
+        image = self.runEngine('vae-decoder', {"latent": sample_inp})['images']
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        image = (image * 255).round()
+        return image
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
+        return timesteps, num_inference_steps - t_start
+    def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
+            )
+        image = image.to(device=device, dtype=dtype)
+        batch_size = batch_size * num_images_per_prompt
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        if isinstance(generator, list):
+            init_latents = [
+                self.vae.encode(image[i: i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
+            ]
+            init_latents = torch.cat(init_latents, dim=0)
+        else:
+            init_latents = self.vae.encode(image).latent_dist.sample(generator)
+        init_latents = self.vae.config.scaling_factor * init_latents
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            raise ValueError(
+                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents], dim=0)
+        shape = init_latents.shape
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+        return latents
+    def _default_height_width(self, height, width, image):
+        if isinstance(image, list):
+            image = image[0]
+        if height is None:
+            if isinstance(image, PIL.Image.Image):
+                height = image.height
+            elif isinstance(image, torch.Tensor):
+                height = image.shape[3]
+            height = (height // 8) * 8  # round down to nearest multiple of 8
+        if width is None:
+            if isinstance(image, PIL.Image.Image):
+                width = image.width
+            elif isinstance(image, torch.Tensor):
+                width = image.shape[2]
+            width = (width // 8) * 8  # round down to nearest multiple of 8
+        return height, width
+    def _trt_encode_prompt(self, prompt, negative_prompt, num_images_per_prompt,):
+        # Tokenize input
+        text_input_ids = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids.type(torch.int32).to(self.device)
+        # CLIP text encoder
+        text_input_ids_inp = cuda.DeviceView(
+            ptr=text_input_ids.data_ptr(), shape=text_input_ids.shape, dtype=np.int32
+        )
+        text_embeddings = self.runEngine('clip', {"input_ids": text_input_ids_inp})['text_embeddings']
+        # Duplicate text embeddings for each generation per prompt
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        max_length = text_input_ids.shape[-1]
+        uncond_input_ids = self.tokenizer(
+            negative_prompt,
+            padding="max_length",
+            max_length=max_length,
+            truncation=True,
+            return_tensors="pt",
+        ).input_ids.type(torch.int32).to(self.device)
+        uncond_input_ids_inp = cuda.DeviceView(
+            ptr=uncond_input_ids.data_ptr(), shape=uncond_input_ids.shape, dtype=np.int32)
+        uncond_embeddings = self.runEngine('clip', {"input_ids": uncond_input_ids_inp})['text_embeddings']
+        # Duplicate unconditional embeddings for each generation per prompt
+        seq_len = uncond_embeddings.shape[1]
+        uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+        uncond_embeddings = uncond_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        # Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance
+        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+        text_embeddings = text_embeddings.to(dtype=torch.float16)
+        return text_embeddings
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: Union[torch.Tensor, PIL.Image.Image] = None,
+        strength: float = 0.8,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        use_super: bool = True,
+    ):
+        """Run image-to-image generation and return the resulting image array.
+
+        The prompt is encoded with the 'clip' engine, ``image`` is encoded and
+        noised by ``prepare_latents``, the 'unet_fp16' engine denoises the
+        latents over the selected timesteps, and the 'vae-decoder' engine
+        decodes them. With ``use_super`` the decoded array is cast to float16,
+        moved to channels-first and passed through ``self.super.infer``, whose
+        result is returned instead.
+
+        NOTE(review): ``prompt_embeds`` is only read to derive the batch size
+        when ``prompt`` is neither a str nor a list, and is then overwritten
+        by the encoded prompt; the ``latents`` argument is overwritten by
+        ``prepare_latents`` without being read.
+        """
+        # 1. Default height and width to unet
+        height, width = self._default_height_width(height, width, image)
+        # 2. Define call parameters and Allocate the cuda buffers for TRT Engine bindings.
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        # Allocate buffers for TensorRT engine bindings
+        # NOTE(review): shapes are derived from batch_size alone, not from
+        # batch_size * num_images_per_prompt -- confirm this is intended.
+        for model_name, obj in self.trt_torch_models_cls.items():
+            self.engine[model_name].allocate_buffers(
+                shape_dict=obj.get_shape_dict(batch_size, height, width),
+                device=self.device
+            )
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # `runtime` is not referenced inside the block.
+        with trt.Runtime(TRT_LOGGER) as runtime:
+            torch.cuda.synchronize()
+            # 3. Encode input prompt. TRT Clip model.
+            prompt_embeds = self._trt_encode_prompt(
+                prompt, negative_prompt, num_images_per_prompt
+            )
+            # 4. Convert the input image to a tensor.
+            image = preprocess(image)
+            # 5. Prepare timesteps.
+            self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+            timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, self.device)
+            latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+            # 6. Prepare latent variables. This uses the VAE encoder (currently the torch model, not a TRT engine).
+            latents = self.prepare_latents(
+                image,
+                latent_timestep,
+                batch_size,
+                num_images_per_prompt,
+                prompt_embeds.dtype,
+                self.device,
+                generator,
+            )
+            # 7. Prepare extra step kwargs and cast latents/prompt_embeds to fixed dtypes.
+            #   Per the original note, the dtypes must be exactly these so that the TRT engine does not produce NaNs.
+            extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+            latents = latents.to(dtype=torch.float32)
+            prompt_embeds = prompt_embeds.to(dtype=torch.float16)
+            # 8. Denoising loop
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                # NOTE(review): prompt_embeds always holds the unconditional and
+                # text halves, even when the latents are not doubled here.
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                # predict the noise residual
+                dtype = np.float16
+                # The timestep view below is declared float32, so convert if needed.
+                if t.dtype != torch.float32:
+                    timestep_float = t.float()
+                else:
+                    timestep_float = t
+                # Wrap the torch tensors' device pointers for the engine.
+                sample_inp = cuda.DeviceView(
+                    ptr=latent_model_input.data_ptr(), shape=latent_model_input.shape, dtype=np.float32
+                )
+                timestep_inp = cuda.DeviceView(
+                    ptr=timestep_float.data_ptr(), shape=timestep_float.shape, dtype=np.float32
+                )
+                embeddings_inp = cuda.DeviceView(
+                    ptr=prompt_embeds.data_ptr(), shape=prompt_embeds.shape, dtype=dtype
+                )
+                noise_pred = self.engine['unet_fp16'].infer(
+                    {"sample": sample_inp, "timestep": timestep_inp, "encoder_hidden_states": embeddings_inp},
+                    self.stream)['latent']
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+            # 9. Use VAE-Decoder to decode the latents
+            image = self._trt_decode_latents(latents)
+        # 10. SuperX4 Resolution, Optional.
+        if use_super:
+            image = self.super.infer(np.transpose(image.astype(np.float16), (0, 3, 1, 2)))
+        return image

lyraSD/muse_trt/sd_text2img.py ADDED Viewed

	@@ -0,0 +1,290 @@

+r"""
+StableDiffusion Text2Img Pipeline by TensorRT.
+It also includes the SuperResolutionX4 TensorRT engine.
+Inspired by: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+            https://developer.nvidia.com/tensorrt
+"""
+import inspect
+import os
+from typing import List, Optional, Union
+import numpy as np
+import tensorrt as trt
+import torch
+from diffusers import AutoencoderKL
+from diffusers.schedulers import DPMSolverMultistepScheduler
+from diffusers.utils import PIL_INTERPOLATION, randn_tensor
+from polygraphy import cuda
+from transformers import CLIPTokenizer
+from .models import CLIP, UNet, VAEDecoder
+from .super import SuperX4TRTInfer
+from .utilities import TRT_LOGGER, Engine
+class TRTStableDiffusionText2ImgPipeline:
+    def __init__(self, engine_dir: str, o_height: int = 512, o_width: int = 512, device: str = 'cuda:0'):
+        self.device = torch.device(device)
+        super().__init__()
+        self.vae = AutoencoderKL.from_pretrained(
+            os.path.join(engine_dir, 'vae'),
+            torch_dtype=torch.float16
+        ).to(self.device)
+        self.tokenizer = CLIPTokenizer.from_pretrained(
+            os.path.join(engine_dir, 'tokenizer')
+        )
+        self.scheduler = DPMSolverMultistepScheduler.from_pretrained(
+            os.path.join(engine_dir, 'scheduler')
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.trt_torch_models_cls = {
+            'clip': CLIP(),
+            'unet_fp16': UNet(),
+            'vae-decoder': VAEDecoder()
+        }
+        self.engine = {}
+        # Build engines
+        for model_name, _ in self.trt_torch_models_cls.items():
+            engine = Engine(model_name, engine_dir)
+            self.engine[model_name] = engine
+        # Separate iteration to activate engines
+        for model_name, _ in self.trt_torch_models_cls.items():
+            self.engine[model_name].activate()
+        self.stream = cuda.Stream()
+        self.super = SuperX4TRTInfer(
+            engine_dir,
+            model_name='superx4.plan',
+            fp16=True,
+            o_height=o_height,
+            o_width=o_width
+        )
+    def runEngine(self, model_name, feed_dict):
+        engine = self.engine[model_name]
+        return engine.infer(feed_dict, self.stream)
+    def _torch_decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        image = (image * 255).round()
+        return image
+    def _trt_decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        sample_inp = cuda.DeviceView(
+            ptr=latents.data_ptr(), shape=latents.shape, dtype=np.float32)
+        image = self.runEngine('vae-decoder', {"latent": sample_inp})['images']
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        image = (image * 255).round()
+        return image
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
+        return timesteps, num_inference_steps - t_start
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+    def _trt_encode_prompt(self, prompt, negative_prompt, num_images_per_prompt,):
+        # Tokenize input
+        text_input_ids = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids.type(torch.int32).to(self.device)
+        # CLIP text encoder
+        text_input_ids_inp = cuda.DeviceView(
+            ptr=text_input_ids.data_ptr(), shape=text_input_ids.shape, dtype=np.int32
+        )
+        text_embeddings = self.runEngine('clip', {"input_ids": text_input_ids_inp})['text_embeddings']
+        # Duplicate text embeddings for each generation per prompt
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        max_length = text_input_ids.shape[-1]
+        uncond_input_ids = self.tokenizer(
+            negative_prompt,
+            padding="max_length",
+            max_length=max_length,
+            truncation=True,
+            return_tensors="pt",
+        ).input_ids.type(torch.int32).to(self.device)
+        uncond_input_ids_inp = cuda.DeviceView(
+            ptr=uncond_input_ids.data_ptr(), shape=uncond_input_ids.shape, dtype=np.int32)
+        uncond_embeddings = self.runEngine('clip', {"input_ids": uncond_input_ids_inp})['text_embeddings']
+        # Duplicate unconditional embeddings for each generation per prompt
+        seq_len = uncond_embeddings.shape[1]
+        uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+        uncond_embeddings = uncond_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        # Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance
+        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+        text_embeddings = text_embeddings.to(dtype=torch.float16)
+        return text_embeddings
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        use_super: bool = True,
+    ):
+        # 1. Default height and width to unet
+        assert height is not None, "height can not be None"
+        assert width is not None, "width can not be None"
+        # 2. Define call parameters and Allocate the cuda buffers for TRT Engine bindings.
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        # Allocate buffers for TensorRT engine bindings
+        for model_name, obj in self.trt_torch_models_cls.items():
+            self.engine[model_name].allocate_buffers(
+                shape_dict=obj.get_shape_dict(batch_size, height, width),
+                device=self.device
+            )
+        do_classifier_free_guidance = guidance_scale > 1.0
+        with trt.Runtime(TRT_LOGGER) as runtime:
+            torch.cuda.synchronize()
+            # 3. Encode input prompt. TRT Clip model.
+            prompt_embeds = self._trt_encode_prompt(
+                prompt, negative_prompt, num_images_per_prompt
+            )
+            # 4. Prepare timesteps.
+            self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+            timesteps = self.scheduler.timesteps
+            # 5. Prepare latent variables. It will use VAE-Enoder(currently the encoder is torch model, not trt)
+            num_channels_latents = 4
+            latents = self.prepare_latents(
+                batch_size*num_images_per_prompt,
+                num_channels_latents,
+                height,
+                width,
+                prompt_embeds.dtype,
+                self.device,
+                generator,
+                latents
+            )
+            # 6. Prepare extra step kwargs and Set lantens/controlnet_conditioning_image/prompt_embeds to special dtype.
+            #   The dytpe must be equal to the following to ensure that the NAN can not be issued in trt engine.
+            extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+            latents = latents.to(dtype=torch.float32)
+            prompt_embeds = prompt_embeds.to(dtype=torch.float16)
+            # 7. Denoising loop
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                # predict the noise residual
+                dtype = np.float16
+                if t.dtype != torch.float32:
+                    timestep_float = t.float()
+                else:
+                    timestep_float = t
+                sample_inp = cuda.DeviceView(
+                    ptr=latent_model_input.data_ptr(), shape=latent_model_input.shape, dtype=np.float32
+                )
+                timestep_inp = cuda.DeviceView(
+                    ptr=timestep_float.data_ptr(), shape=timestep_float.shape, dtype=np.float32
+                )
+                embeddings_inp = cuda.DeviceView(
+                    ptr=prompt_embeds.data_ptr(), shape=prompt_embeds.shape, dtype=dtype
+                )
+                noise_pred = self.engine['unet_fp16'].infer(
+                    {"sample": sample_inp, "timestep": timestep_inp, "encoder_hidden_states": embeddings_inp},
+                    self.stream)['latent']
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+            # 8. Use VAE-Decoder to decode the latents
+            image = self._trt_decode_latents(latents)
+        # 9. SuperX4 Resolution, Optional.
+        if use_super:
+            image = self.super.infer(np.transpose(image.astype(np.float16), (0, 3, 1, 2)))
+            image = np.transpose(image, (0,3,1,2))
+        return image

lyraSD/muse_trt/super.py ADDED Viewed

	@@ -0,0 +1,64 @@

+r"""use tensorrt engine to infer, a useful pipeline"""
+import os
+import numpy as np
+from polygraphy import cuda
+from polygraphy.backend.common import bytes_from_path
+from polygraphy.backend.trt import engine_from_bytes
+class SuperX4TRTInfer:
+    def __init__(self, engine_dir,
+                 model_name='superx4.plan',
+                 o_height=None,
+                 o_width=None,
+                 fp16=True,
+                 ) -> None:
+        engine_path = os.path.join(engine_dir, model_name)
+        self.engine = engine_from_bytes(bytes_from_path(engine_path))
+        self.context = self.engine.create_execution_context()
+        self.o_height = o_height
+        self.o_width = o_width
+        self.fp = fp16
+        self.dtype = np.float16 if fp16 else np.float32
+        self.stream = cuda.Stream()
+    def infer(self, x):
+        batch_size, channel, height, width = x.shape
+        if self.o_height is None or self.o_width is None:
+            o_height = height*4
+            o_width = width*4
+        else:
+            o_height = self.o_height
+            o_width = self.o_width
+        h_output = np.empty([batch_size, channel, o_height, o_width], dtype=self.dtype)
+        # allocate device memory
+        d_input = cuda.wrapper().malloc(1 * x.nbytes)
+        d_output = cuda.wrapper().malloc(1*h_output.nbytes)
+        bindings = [int(d_input), int(d_output)]
+        # transfer input data to device
+        cuda.wrapper().memcpy(d_input, x.ctypes.data, x.nbytes, cuda.MemcpyKind.HostToDevice, self.stream.ptr)
+        # execute model
+        noerror = self.context.execute_async_v2(bindings, self.stream.ptr)
+        if not noerror:
+            raise ValueError(f"ERROR: inference failed.")
+        # transfer predictions back
+        cuda.wrapper().memcpy(h_output.ctypes.data, d_output, h_output.nbytes, cuda.MemcpyKind.DeviceToHost, self.stream.ptr)
+        cuda.wrapper().free(d_input)
+        cuda.wrapper().free(d_output)
+        return h_output
+    def teardown(self):
+        del self.engine
+        self.stream.free()
+        del self.stream

lyraSD/muse_trt/utilities.py ADDED Viewed

	@@ -0,0 +1,536 @@

+r"""utils components"""
+from collections import OrderedDict
+from copy import copy
+import numpy as np
+import os
+import math
+from PIL import Image
+from polygraphy.backend.common import bytes_from_path
+from polygraphy.backend.trt import CreateConfig, Profile
+from polygraphy.backend.trt import engine_from_bytes, engine_from_network, network_from_onnx_path, save_engine
+from polygraphy.backend.trt import util as trt_util
+from polygraphy import cuda
+import random
+from scipy import integrate
+import tensorrt as trt
+import torch
+# Module-level TensorRT logger, created with WARNING as its severity threshold.
+TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+class Engine():
+    def __init__(
+        self,
+        model_name,
+        engine_dir,
+        memory_pool_size=None
+    ):
+        self.engine_path = os.path.join(engine_dir, model_name+'.plan')
+        self.engine = None
+        self.context = None
+        self.buffers = OrderedDict()
+        self.tensors = OrderedDict()
+        self.memory_pool_size = memory_pool_size
+    def __del__(self):
+        [buf.free() for buf in self.buffers.values() if isinstance(buf, cuda.DeviceArray)]
+        del self.engine
+        del self.context
+        del self.buffers
+        del self.tensors
+    def build(self, onnx_path, fp16, input_profile=None, enable_preview=False):
+        print(f"Building TensorRT engine for {onnx_path}: {self.engine_path}")
+        p = Profile()
+        if input_profile:
+            for name, dims in input_profile.items():
+                assert len(dims) == 3
+                p.add(name, min=dims[0], opt=dims[1], max=dims[2])
+        preview_features = []
+        if enable_preview:
+            trt_version = [int(i) for i in trt.__version__.split(".")]
+            # FASTER_DYNAMIC_SHAPES_0805 should only be used for TRT 8.5.1 or above.
+            if trt_version[0] > 8 or \
+                    (trt_version[0] == 8 and (trt_version[1] > 5 or (trt_version[1] == 5 and trt_version[2] >= 1))):
+                preview_features = [trt.PreviewFeature.FASTER_DYNAMIC_SHAPES_0805]
+        if self.memory_pool_size is not None:
+            memory_pool_limits = {trt.MemoryPoolType.WORKSPACE: (self.memory_pool_size*(2 ** 30))}
+            print(memory_pool_limits)
+        else:
+            memory_pool_limits = None
+        engine = engine_from_network(
+            network_from_onnx_path(onnx_path),
+            config=CreateConfig(
+                fp16=fp16, profiles=[p], preview_features=preview_features, memory_pool_limits=memory_pool_limits
+            )
+        )
+        save_engine(engine, path=self.engine_path)
+    def activate(self):
+        print(f"Loading TensorRT engine: {self.engine_path}")
+        self.engine = engine_from_bytes(bytes_from_path(self.engine_path))
+        self.context = self.engine.create_execution_context()
+    def allocate_buffers(self, shape_dict=None, device='cuda'):
+        for idx in range(trt_util.get_bindings_per_profile(self.engine)):
+            binding = self.engine[idx]
+            if shape_dict and binding in shape_dict:
+                shape = shape_dict[binding]
+            else:
+                shape = self.engine.get_binding_shape(binding)
+            dtype = trt_util.np_dtype_from_trt(self.engine.get_binding_dtype(binding))
+            if self.engine.binding_is_input(binding):
+                self.context.set_binding_shape(idx, shape)
+            # Workaround to convert np dtype to torch
+            np_type_tensor = np.empty(shape=[], dtype=dtype)
+            torch_type_tensor = torch.from_numpy(np_type_tensor)
+            tensor = torch.empty(tuple(shape), dtype=torch_type_tensor.dtype).to(device=device)
+            self.tensors[binding] = tensor
+            self.buffers[binding] = cuda.DeviceView(ptr=tensor.data_ptr(), shape=shape, dtype=dtype)
+    def infer(self, feed_dict, stream):
+        start_binding, end_binding = trt_util.get_active_profile_bindings(self.context)
+        # shallow copy of ordered dict
+        device_buffers = copy(self.buffers)
+        for name, buf in feed_dict.items():
+            assert isinstance(buf, cuda.DeviceView)
+            device_buffers[name] = buf
+        bindings = [0] * start_binding + [buf.ptr for buf in device_buffers.values()]
+        noerror = self.context.execute_async_v2(bindings=bindings, stream_handle=stream.ptr)
+        if not noerror:
+            raise ValueError(f"ERROR: inference failed.")
+        return self.tensors
+class LMSDiscreteScheduler():
+    def __init__(
+        self,
+        device='cuda',
+        beta_start=0.00085,
+        beta_end=0.012,
+        num_train_timesteps=1000,
+    ):
+        self.num_train_timesteps = num_train_timesteps
+        self.order = 4
+        self.beta_start = beta_start
+        self.beta_end = beta_end
+        betas = (torch.linspace(beta_start**0.5, beta_end**0.5, self.num_train_timesteps, dtype=torch.float32) ** 2)
+        alphas = 1.0 - betas
+        self.alphas_cumprod = torch.cumprod(alphas, dim=0)
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
+        self.sigmas = torch.from_numpy(sigmas)
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = self.sigmas.max()
+        self.device = device
+    def set_timesteps(self, steps):
+        self.num_inference_steps = steps
+        timesteps = np.linspace(0, self.num_train_timesteps - 1, steps, dtype=float)[::-1].copy()
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+        sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+        self.sigmas = torch.from_numpy(sigmas).to(device=self.device)
+        # Move all timesteps to correct device beforehand
+        self.timesteps = torch.from_numpy(timesteps).to(device=self.device).float()
+        self.derivatives = []
+    def scale_model_input(self, sample: torch.FloatTensor, idx, *args, **kwargs) -> torch.FloatTensor:
+        return sample * self.latent_scales[idx]
+    def configure(self):
+        order = self.order
+        self.lms_coeffs = []
+        self.latent_scales = [1./((sigma**2 + 1) ** 0.5) for sigma in self.sigmas]
+        def get_lms_coefficient(order, t, current_order):
+            """
+            Compute a linear multistep coefficient.
+            """
+            def lms_derivative(tau):
+                prod = 1.0
+                for k in range(order):
+                    if current_order == k:
+                        continue
+                    prod *= (tau - self.sigmas[t - k]) / (self.sigmas[t - current_order] - self.sigmas[t - k])
+                return prod
+            integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0]
+            return integrated_coeff
+        for step_index in range(self.num_inference_steps):
+            order = min(step_index + 1, order)
+            self.lms_coeffs.append([get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)])
+    def step(self, output, latents, idx, timestep):
+        # compute the previous noisy sample x_t -> x_t-1
+        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+        sigma = self.sigmas[idx]
+        pred_original_sample = latents - sigma * output
+        # 2. Convert to an ODE derivative
+        derivative = (latents - pred_original_sample) / sigma
+        self.derivatives.append(derivative)
+        if len(self.derivatives) > self.order:
+            self.derivatives.pop(0)
+        # 3. Compute previous sample based on the derivatives path
+        prev_sample = latents + sum(
+            coeff * derivative for coeff, derivative in zip(self.lms_coeffs[idx], reversed(self.derivatives))
+        )
+        return prev_sample
+class DPMScheduler():
+    def __init__(
+        self,
+        beta_start=0.00085,
+        beta_end=0.012,
+        num_train_timesteps=1000,
+        solver_order=2,
+        predict_epsilon=True,
+        thresholding=False,
+        dynamic_thresholding_ratio=0.995,
+        sample_max_value=1.0,
+        algorithm_type="dpmsolver++",
+        solver_type="midpoint",
+        lower_order_final=True,
+        device='cuda',
+    ):
+        # this schedule is very specific to the latent diffusion model.
+        self.betas = (
+            torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        )
+        self.device = device
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+        # Currently we only support VP-type noise schedule
+        self.alpha_t = torch.sqrt(self.alphas_cumprod)
+        self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
+        self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+        self.algorithm_type = algorithm_type
+        self.predict_epsilon = predict_epsilon
+        self.thresholding = thresholding
+        self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
+        self.sample_max_value = sample_max_value
+        self.lower_order_final = lower_order_final
+        # settings for DPM-Solver
+        if algorithm_type not in ["dpmsolver", "dpmsolver++"]:
+            raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")
+        if solver_type not in ["midpoint", "heun"]:
+            raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}")
+        # setable values
+        self.num_inference_steps = None
+        self.solver_order = solver_order
+        self.num_train_timesteps = num_train_timesteps
+        self.solver_type = solver_type
+        self.first_order_first_coef = []
+        self.first_order_second_coef = []
+        self.second_order_first_coef = []
+        self.second_order_second_coef = []
+        self.second_order_third_coef = []
+        self.third_order_first_coef = []
+        self.third_order_second_coef = []
+        self.third_order_third_coef = []
+        self.third_order_fourth_coef = []
+    def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
+        return sample
+    def configure(self):
+        lower_order_nums = 0
+        for step_index in range(self.num_inference_steps):
+            step_idx = step_index
+            timestep = self.timesteps[step_idx]
+            prev_timestep = 0 if step_idx == len(self.timesteps) - 1 else self.timesteps[step_idx + 1]
+            self.dpm_solver_first_order_coefs_precompute(timestep, prev_timestep)
+            timestep_list = [self.timesteps[step_index - 1], timestep]
+            self.multistep_dpm_solver_second_order_coefs_precompute(timestep_list, prev_timestep)
+            timestep_list = [self.timesteps[step_index - 2], self.timesteps[step_index - 1], timestep]
+            self.multistep_dpm_solver_third_order_coefs_precompute(timestep_list, prev_timestep)
+            if lower_order_nums < self.solver_order:
+                lower_order_nums += 1
+    def dpm_solver_first_order_coefs_precompute(self, timestep, prev_timestep):
+        lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep]
+        alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep]
+        sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep]
+        h = lambda_t - lambda_s
+        if self.algorithm_type == "dpmsolver++":
+            self.first_order_first_coef.append(sigma_t / sigma_s)
+            self.first_order_second_coef.append(alpha_t * (torch.exp(-h) - 1.0))
+        elif self.algorithm_type == "dpmsolver":
+            self.first_order_first_coef.append(alpha_t / alpha_s)
+            self.first_order_second_coef.append(sigma_t * (torch.exp(h) - 1.0))
+    def multistep_dpm_solver_second_order_coefs_precompute(self, timestep_list, prev_timestep):
+        t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2]
+        lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1]
+        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
+        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+        h = lambda_t - lambda_s0
+        if self.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2211.01095 for detailed derivations
+            if self.solver_type == "midpoint":
+                self.second_order_first_coef.append(sigma_t / sigma_s0)
+                self.second_order_second_coef.append((alpha_t * (torch.exp(-h) - 1.0)))
+                self.second_order_third_coef.append(0.5 * (alpha_t * (torch.exp(-h) - 1.0)))
+            elif self.solver_type == "heun":
+                self.second_order_first_coef.append(sigma_t / sigma_s0)
+                self.second_order_second_coef.append((alpha_t * (torch.exp(-h) - 1.0)))
+                self.second_order_third_coef.append(alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0))
+        elif self.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            if self.solver_type == "midpoint":
+                self.second_order_first_coef.append(alpha_t / alpha_s0)
+                self.second_order_second_coef.append((sigma_t * (torch.exp(h) - 1.0)))
+                self.second_order_third_coef.append(0.5 * (sigma_t * (torch.exp(h) - 1.0)))
+            elif self.solver_type == "heun":
+                self.second_order_first_coef.append(alpha_t / alpha_s0)
+                self.second_order_second_coef.append((sigma_t * (torch.exp(h) - 1.0)))
+                self.second_order_third_coef.append((sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)))
+    def multistep_dpm_solver_third_order_coefs_precompute(self, timestep_list, prev_timestep):
+        t, s0 = prev_timestep, timestep_list[-1]
+        lambda_t, lambda_s0 = (
+            self.lambda_t[t],
+            self.lambda_t[s0]
+        )
+        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
+        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+        h = lambda_t - lambda_s0
+        if self.algorithm_type == "dpmsolver++":
+            self.third_order_first_coef.append(sigma_t / sigma_s0)
+            self.third_order_second_coef.append(alpha_t * (torch.exp(-h) - 1.0))
+            self.third_order_third_coef.append(alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0))
+            self.third_order_fourth_coef.append(alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5))
+        elif self.algorithm_type == "dpmsolver":
+            self.third_order_first_coef.append(alpha_t / alpha_s0)
+            self.third_order_second_coef.append(sigma_t * (torch.exp(h) - 1.0))
+            self.third_order_third_coef.append(sigma_t * ((torch.exp(h) - 1.0) / h - 1.0))
+            self.third_order_fourth_coef.append(sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5))
+    def set_timesteps(self, num_inference_steps):
+        self.num_inference_steps = num_inference_steps
+        timesteps = (
+            np.linspace(0, self.num_train_timesteps - 1, num_inference_steps + 1)
+            .round()[::-1][:-1]
+            .copy()
+            .astype(np.int32)
+        )
+        self.timesteps = torch.from_numpy(timesteps).to(self.device)
+        self.model_outputs = [
+            None,
+        ] * self.solver_order
+        self.lower_order_nums = 0
+    def convert_model_output(
+        self, model_output, timestep, sample
+    ):
+        # DPM-Solver++ needs to solve an integral of the data prediction model.
+        if self.algorithm_type == "dpmsolver++":
+            if self.predict_epsilon:
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                x0_pred = (sample - sigma_t * model_output) / alpha_t
+            else:
+                x0_pred = model_output
+            if self.thresholding:
+                # Dynamic thresholding in https://arxiv.org/abs/2205.11487
+                dynamic_max_val = torch.quantile(
+                    torch.abs(x0_pred).reshape((x0_pred.shape[0], -1)), self.dynamic_thresholding_ratio, dim=1
+                )
+                dynamic_max_val = torch.maximum(
+                    dynamic_max_val,
+                    self.sample_max_value * torch.ones_like(dynamic_max_val).to(dynamic_max_val.device),
+                )[(...,) + (None,) * (x0_pred.ndim - 1)]
+                x0_pred = torch.clamp(x0_pred, -dynamic_max_val, dynamic_max_val) / dynamic_max_val
+            return x0_pred
+        # DPM-Solver needs to solve an integral of the noise prediction model.
+        elif self.algorithm_type == "dpmsolver":
+            if self.predict_epsilon:
+                return model_output
+            else:
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                epsilon = (sample - alpha_t * model_output) / sigma_t
+                return epsilon
+    def dpm_solver_first_order_update(
+        self,
+        idx,
+        model_output,
+        sample
+    ):
+        first_coef = self.first_order_first_coef[idx]
+        second_coef = self.first_order_second_coef[idx]
+        if self.algorithm_type == "dpmsolver++":
+            x_t = first_coef * sample - second_coef * model_output
+        elif self.algorithm_type == "dpmsolver":
+            x_t = first_coef * sample - second_coef * model_output
+        return x_t
+    def multistep_dpm_solver_second_order_update(
+        self,
+        idx,
+        model_output_list,
+        timestep_list,
+        prev_timestep,
+        sample
+    ):
+        t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2]
+        m0, m1 = model_output_list[-1], model_output_list[-2]
+        lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1]
+        h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
+        r0 = h_0 / h
+        D0, D1 = m0, (1.0 / r0) * (m0 - m1)
+        first_coef = self.second_order_first_coef[idx]
+        second_coef = self.second_order_second_coef[idx]
+        third_coef = self.second_order_third_coef[idx]
+        if self.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2211.01095 for detailed derivations
+            if self.solver_type == "midpoint":
+                x_t = (
+                    first_coef * sample
+                    - second_coef * D0
+                    - third_coef * D1
+                )
+            elif self.solver_type == "heun":
+                x_t = (
+                    first_coef * sample
+                    - second_coef * D0
+                    + third_coef * D1
+                )
+        elif self.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            if self.solver_type == "midpoint":
+                x_t = (
+                    first_coef * sample
+                    - second_coef * D0
+                    - third_coef * D1
+                )
+            elif self.solver_type == "heun":
+                x_t = (
+                    first_coef * sample
+                    - second_coef * D0
+                    - third_coef * D1
+                )
+        return x_t
+    def multistep_dpm_solver_third_order_update(
+        self,
+        idx,
+        model_output_list,
+        timestep_list,
+        prev_timestep,
+        sample
+    ):
+        t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3]
+        m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
+        lambda_t, lambda_s0, lambda_s1, lambda_s2 = (
+            self.lambda_t[t],
+            self.lambda_t[s0],
+            self.lambda_t[s1],
+            self.lambda_t[s2],
+        )
+        h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
+        r0, r1 = h_0 / h, h_1 / h
+        D0 = m0
+        D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
+        D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
+        D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
+        first_coef = self.third_order_first_coef[idx]
+        second_coef = self.third_order_second_coef[idx]
+        third_coef = self.third_order_third_coef[idx]
+        fourth_coef = self.third_order_fourth_coef[idx]
+        if self.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            x_t = (
+                first_coef * sample
+                - second_coef * D0
+                + third_coef * D1
+                - fourth_coef * D2
+            )
+        elif self.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            x_t = (
+                first_coef * sample
+                - second_coef * D0
+                - third_coef * D1
+                - fourth_coef * D2
+            )
+        return x_t
+    def step(self, output, latents, step_index, timestep):
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+        prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1]
+        lower_order_final = (
+            (step_index == len(self.timesteps) - 1) and self.lower_order_final and len(self.timesteps) < 15
+        )
+        lower_order_second = (
+            (step_index == len(self.timesteps) - 2) and self.lower_order_final and len(self.timesteps) < 15
+        )
+        output = self.convert_model_output(output, timestep, latents)
+        for i in range(self.solver_order - 1):
+            self.model_outputs[i] = self.model_outputs[i + 1]
+        self.model_outputs[-1] = output
+        if self.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
+            prev_sample = self.dpm_solver_first_order_update(step_index, output, latents)
+        elif self.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
+            timestep_list = [self.timesteps[step_index - 1], timestep]
+            prev_sample = self.multistep_dpm_solver_second_order_update(
+                step_index, self.model_outputs, timestep_list, prev_timestep, latents
+            )
+        else:
+            timestep_list = [self.timesteps[step_index - 2], self.timesteps[step_index - 1], timestep]
+            prev_sample = self.multistep_dpm_solver_third_order_update(
+                step_index, self.model_outputs, timestep_list, prev_timestep, latents
+            )
+        if self.lower_order_nums < self.solver_order:
+            self.lower_order_nums += 1
+        return prev_sample
+def save_image(images, image_path_dir, image_name_prefix):
+    """
+    Save the generated images to png files.
+    """
+    images = ((images + 1) * 255 / 2).clamp(0, 255).detach().permute(0, 2, 3, 1).round().type(torch.uint8).cpu().numpy()
+    for i in range(images.shape[0]):
+        image_path = os.path.join(image_path_dir, image_name_prefix+str(i+1)+'-'+str(random.randint(1000, 9999))+'.png')
+        print(f"Saving image {i+1} / {images.shape[0]} to: {image_path}")
+        Image.fromarray(images[i]).save(image_path)