ai-lab-pipeline-video-image

Runtime error

App Files Files Community

kadirnar commited on Mar 14, 2023

Commit

8bf6bdc

1 Parent(s): bc13541

add controlnet inpaint model

Browse files

Files changed (5) hide show

app.py +6 -1
diffusion_webui/controlnet_inpaint/__init__.py +0 -0
diffusion_webui/controlnet_inpaint/controlnet_inpaint_app.py +163 -0
diffusion_webui/controlnet_inpaint/pipeline_stable_diffusion_controlnet_inpaint.py +607 -0
diffusion_webui/helpers.py +15 -0

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ from diffusion_webui.helpers import (
     stable_diffusion_controlnet_seg_app,
     stable_diffusion_img2img_app,
     stable_diffusion_inpaint_app,
     stable_diffusion_text2img_app,
 )
@@ -56,7 +57,11 @@ with app:
                 with gr.Tab("Scribble"):
                     stable_diffusion_controlnet_scribble_app()
             with gr.Tab("Keras Diffusion"):
                 keras_diffusion_app = keras_stable_diffusion_app()
-app.launch(debug=True)

     stable_diffusion_controlnet_seg_app,
     stable_diffusion_img2img_app,
     stable_diffusion_inpaint_app,
+    stable_diffusion_inpiant_controlnet_canny_app,
     stable_diffusion_text2img_app,
 )
                 with gr.Tab("Scribble"):
                     stable_diffusion_controlnet_scribble_app()
+            with gr.Tab("ControlNet Inpaint"):
+                with gr.Tab("Inpaint Canny"):
+                    stable_diffusion_inpiant_controlnet_canny_app()
             with gr.Tab("Keras Diffusion"):
                 keras_diffusion_app = keras_stable_diffusion_app()
+app.launch(debug=True, enable_queue=True)

diffusion_webui/controlnet_inpaint/__init__.py ADDED Viewed

File without changes

diffusion_webui/controlnet_inpaint/controlnet_inpaint_app.py ADDED Viewed

	@@ -0,0 +1,163 @@

+import gradio as gr
+import numpy as np
+import torch
+from diffusers import UniPCMultistepScheduler
+from PIL import Image
+from diffusion_webui.controlnet.controlnet_canny import controlnet_canny
+from diffusion_webui.controlnet_inpaint.pipeline_stable_diffusion_controlnet_inpaint import (
+    StableDiffusionControlNetInpaintPipeline,
+)
+stable_inpaint_model_list = [
+    "stabilityai/stable-diffusion-2-inpainting",
+    "runwayml/stable-diffusion-inpainting",
+]
+controlnet_model_list = [
+    "lllyasviel/sd-controlnet-canny",
+]
+prompt_list = [
+    "a red panda sitting on a bench",
+]
+negative_prompt_list = [
+    "bad, ugly",
+]
+def load_img(image_path: str):
+    image = Image.open(image_path)
+    image = np.array(image)
+    image = Image.fromarray(image)
+    return image
+def stable_diffusion_inpiant_controlnet_canny(
+    normal_image_path: str,
+    stable_model_path: str,
+    controlnet_model_path: str,
+    prompt: str,
+    negative_prompt: str,
+    controlnet_conditioning_scale: str,
+    guidance_scale: int,
+    num_inference_steps: int,
+):
+    pil_image = Image.open(normal_image_path)
+    normal_image = pil_image["image"].convert("RGB").resize((512, 512))
+    mask_image = pil_image["mask"].convert("RGB").resize((512, 512))
+    # normal_image = load_img(normal_image_path)
+    # mask_image = load_img(mask_image_path)
+    controlnet, control_image = controlnet_canny(
+        image_path=normal_image_path,
+        controlnet_model_path=controlnet_model_path,
+    )
+    pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
+        pretrained_model_name_or_path=stable_model_path,
+        controlnet=controlnet,
+        torch_dtype=torch.float16,
+    )
+    pipe.to("cuda")
+    pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+    pipe.enable_xformers_memory_efficient_attention()
+    generator = torch.manual_seed(0)
+    output = pipe(
+        prompt=prompt,
+        negative_prompt=negative_prompt,
+        num_inference_steps=num_inference_steps,
+        generator=generator,
+        image=normal_image,
+        control_image=control_image,
+        controlnet_conditioning_scale=controlnet_conditioning_scale,
+        guidance_scale=guidance_scale,
+        mask_image=mask_image,
+    ).images
+    return output[0]
+def stable_diffusion_inpiant_controlnet_canny_app():
+    with gr.Blocks():
+        with gr.Row():
+            with gr.Column():
+                inpaint_image_file = gr.Image(
+                    source="upload",
+                    tool="sketch",
+                    elem_id="image_upload",
+                    type="filepath",
+                    label="Upload",
+                )
+                inpaint_model_id = gr.Dropdown(
+                    choices=stable_inpaint_model_list,
+                    value=stable_inpaint_model_list[0],
+                    label="Inpaint Model Id",
+                )
+                inpaint_controlnet_model_id = gr.Dropdown(
+                    choices=controlnet_model_list,
+                    value=controlnet_model_list[0],
+                    label="ControlNet Model Id",
+                )
+                inpaint_prompt = gr.Textbox(
+                    lines=1, value=prompt_list[0], label="Prompt"
+                )
+                inpaint_negative_prompt = gr.Textbox(
+                    lines=1,
+                    value=negative_prompt_list[0],
+                    label="Negative Prompt",
+                )
+                with gr.Accordion("Advanced Options", open=False):
+                    controlnet_conditioning_scale = gr.Slider(
+                        minimum=0.1,
+                        maximum=1,
+                        step=0.1,
+                        value=0.5,
+                        label="ControlNet Conditioning Scale",
+                    )
+                    inpaint_guidance_scale = gr.Slider(
+                        minimum=0.1,
+                        maximum=15,
+                        step=0.1,
+                        value=7.5,
+                        label="Guidance Scale",
+                    )
+                    inpaint_num_inference_step = gr.Slider(
+                        minimum=1,
+                        maximum=100,
+                        step=1,
+                        value=50,
+                        label="Num Inference Step",
+                    )
+                inpaint_predict = gr.Button(value="Generator")
+            with gr.Column():
+                output_image = gr.Image(label="Outputs")
+        inpaint_predict.click(
+            fn=stable_diffusion_inpiant_controlnet_canny,
+            inputs=[
+                inpaint_image_file,
+                inpaint_model_id,
+                inpaint_controlnet_model_id,
+                inpaint_prompt,
+                inpaint_negative_prompt,
+                controlnet_conditioning_scale,
+                inpaint_guidance_scale,
+                inpaint_num_inference_step,
+            ],
+            outputs=output_image,
+        )

diffusion_webui/controlnet_inpaint/pipeline_stable_diffusion_controlnet_inpaint.py ADDED Viewed

	@@ -0,0 +1,607 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import PIL.Image
+import torch
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import *
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> # !pip install opencv-python transformers accelerate
+        >>> from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel, UniPCMultistepScheduler
+        >>> from diffusers.utils import load_image
+        >>> import numpy as np
+        >>> import torch
+        >>> import cv2
+        >>> from PIL import Image
+        >>> # download an image
+        >>> image = load_image(
+        ...     "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+        ... )
+        >>> image = np.array(image)
+        >>> mask_image = load_image(
+        ...     "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+        ... )
+        >>> mask_image = np.array(mask_image)
+        >>> # get canny image
+        >>> canny_image = cv2.Canny(image, 100, 200)
+        >>> canny_image = canny_image[:, :, None]
+        >>> canny_image = np.concatenate([canny_image, canny_image, canny_image], axis=2)
+        >>> canny_image = Image.fromarray(canny_image)
+        >>> # load control net and stable diffusion v1-5
+        >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+        >>> pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-inpainting", controlnet=controlnet, torch_dtype=torch.float16
+        ... )
+        >>> # speed up diffusion process with faster scheduler and memory optimization
+        >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+        >>> # remove following line if xformers is not installed
+        >>> pipe.enable_xformers_memory_efficient_attention()
+        >>> pipe.enable_model_cpu_offload()
+        >>> # generate image
+        >>> generator = torch.manual_seed(0)
+        >>> image = pipe(
+        ...     "futuristic-looking doggo",
+        ...     num_inference_steps=20,
+        ...     generator=generator,
+        ...     image=image,
+        ...     control_image=canny_image,
+        ...     mask_image=mask_image
+        ... ).images[0]
+        ```
+"""
+def prepare_mask_and_masked_image(image, mask):
+    """
+    Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
+    converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
+    ``image`` and ``1`` for the ``mask``.
+    The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
+    binarized (``mask > 0.5``) and cast to ``torch.float32`` too.
+    Args:
+        image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
+            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
+            ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
+        mask (_type_): The mask to apply to the image, i.e. regions to inpaint.
+            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
+            ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
+    Raises:
+        ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask
+        should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
+        TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
+            (ot the other way around).
+    Returns:
+        tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
+            dimensions: ``batch x channels x height x width``.
+    """
+    if isinstance(image, torch.Tensor):
+        if not isinstance(mask, torch.Tensor):
+            raise TypeError(
+                f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not"
+            )
+        # Batch single image
+        if image.ndim == 3:
+            assert (
+                image.shape[0] == 3
+            ), "Image outside a batch should be of shape (3, H, W)"
+            image = image.unsqueeze(0)
+        # Batch and add channel dim for single mask
+        if mask.ndim == 2:
+            mask = mask.unsqueeze(0).unsqueeze(0)
+        # Batch single mask or add channel dim
+        if mask.ndim == 3:
+            # Single batched mask, no channel dim or single mask not batched but channel dim
+            if mask.shape[0] == 1:
+                mask = mask.unsqueeze(0)
+            # Batched masks no channel dim
+            else:
+                mask = mask.unsqueeze(1)
+        assert (
+            image.ndim == 4 and mask.ndim == 4
+        ), "Image and Mask must have 4 dimensions"
+        assert (
+            image.shape[-2:] == mask.shape[-2:]
+        ), "Image and Mask must have the same spatial dimensions"
+        assert (
+            image.shape[0] == mask.shape[0]
+        ), "Image and Mask must have the same batch size"
+        # Check image is in [-1, 1]
+        if image.min() < -1 or image.max() > 1:
+            raise ValueError("Image should be in [-1, 1] range")
+        # Check mask is in [0, 1]
+        if mask.min() < 0 or mask.max() > 1:
+            raise ValueError("Mask should be in [0, 1] range")
+        # Binarize mask
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+        # Image as float32
+        image = image.to(dtype=torch.float32)
+    elif isinstance(mask, torch.Tensor):
+        raise TypeError(
+            f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not"
+        )
+    else:
+        # preprocess image
+        if isinstance(image, (PIL.Image.Image, np.ndarray)):
+            image = [image]
+        if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+            image = [np.array(i.convert("RGB"))[None, :] for i in image]
+            image = np.concatenate(image, axis=0)
+        elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+            image = np.concatenate([i[None, :] for i in image], axis=0)
+        image = image.transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+        # preprocess mask
+        if isinstance(mask, (PIL.Image.Image, np.ndarray)):
+            mask = [mask]
+        if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
+            mask = np.concatenate(
+                [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0
+            )
+            mask = mask.astype(np.float32) / 255.0
+        elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
+            mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+        mask = torch.from_numpy(mask)
+    masked_image = image * (mask < 0.5)
+    return mask, masked_image
+class StableDiffusionControlNetInpaintPipeline(
+    StableDiffusionControlNetPipeline
+):
+    r"""
+    Pipeline for text-guided image inpainting using Stable Diffusion with ControlNet guidance.
+    This model inherits from [`StableDiffusionControlNetPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        controlnet ([`ControlNetModel`]):
+            Provides additional conditioning to the unet during the denoising process
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+    def prepare_mask_latents(
+        self,
+        mask,
+        masked_image,
+        batch_size,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        do_classifier_free_guidance,
+    ):
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(
+            mask,
+            size=(
+                height // self.vae_scale_factor,
+                width // self.vae_scale_factor,
+            ),
+        )
+        mask = mask.to(device=device, dtype=dtype)
+        masked_image = masked_image.to(device=device, dtype=dtype)
+        # encode the mask image into latents space so we can concatenate it to the latents
+        if isinstance(generator, list):
+            masked_image_latents = [
+                self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(
+                    generator=generator[i]
+                )
+                for i in range(batch_size)
+            ]
+            masked_image_latents = torch.cat(masked_image_latents, dim=0)
+        else:
+            masked_image_latents = self.vae.encode(
+                masked_image
+            ).latent_dist.sample(generator=generator)
+        masked_image_latents = (
+            self.vae.config.scaling_factor * masked_image_latents
+        )
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        if mask.shape[0] < batch_size:
+            if not batch_size % mask.shape[0] == 0:
+                raise ValueError(
+                    "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                    f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                    " of masks that you pass is divisible by the total requested batch size."
+                )
+            mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+        if masked_image_latents.shape[0] < batch_size:
+            if not batch_size % masked_image_latents.shape[0] == 0:
+                raise ValueError(
+                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                    f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                    " Make sure the number of images that you pass is divisible by the total requested batch size."
+                )
+            masked_image_latents = masked_image_latents.repeat(
+                batch_size // masked_image_latents.shape[0], 1, 1, 1
+            )
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+        masked_image_latents = (
+            torch.cat([masked_image_latents] * 2)
+            if do_classifier_free_guidance
+            else masked_image_latents
+        )
+        # aligning device to prevent device errors when concating it with the latent model input
+        masked_image_latents = masked_image_latents.to(
+            device=device, dtype=dtype
+        )
+        return mask, masked_image_latents
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+        control_image: Union[
+            torch.FloatTensor,
+            PIL.Image.Image,
+            List[torch.FloatTensor],
+            List[PIL.Image.Image],
+        ] = None,
+        mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[
+            Union[torch.Generator, List[torch.Generator]]
+        ] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[
+            Callable[[int, int, torch.FloatTensor], None]
+        ] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        controlnet_conditioning_scale: float = 1.0,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
+                instead.
+            image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            control_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]`):
+                The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
+                the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. PIL.Image.Image` can
+                also be accepted as an image. The control image is automatically resized to fit the output image.
+            mask_image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
+                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
+                `self.processor` in
+                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+            controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+                The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original unet.
+        Examples:
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 0. Default height and width to unet
+        height, width = self._default_height_width(height, width, control_image)
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            control_image,
+            height,
+            width,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+        # 4. Prepare image
+        control_image = self.prepare_image(
+            control_image,
+            width,
+            height,
+            batch_size * num_images_per_prompt,
+            num_images_per_prompt,
+            device,
+            self.controlnet.dtype,
+        )
+        if do_classifier_free_guidance:
+            control_image = torch.cat([control_image] * 2)
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+        # 6. Prepare latent variables
+        num_channels_latents = self.controlnet.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+        # EXTRA: prepare mask latents
+        mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask,
+            masked_image,
+            batch_size * num_images_per_prompt,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            do_classifier_free_guidance,
+        )
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        # 8. Denoising loop
+        num_warmup_steps = (
+            len(timesteps) - num_inference_steps * self.scheduler.order
+        )
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = (
+                    torch.cat([latents] * 2)
+                    if do_classifier_free_guidance
+                    else latents
+                )
+                latent_model_input = self.scheduler.scale_model_input(
+                    latent_model_input, t
+                )
+                down_block_res_samples, mid_block_res_sample = self.controlnet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    controlnet_cond=control_image,
+                    return_dict=False,
+                )
+                down_block_res_samples = [
+                    down_block_res_sample * controlnet_conditioning_scale
+                    for down_block_res_sample in down_block_res_samples
+                ]
+                mid_block_res_sample *= controlnet_conditioning_scale
+                # predict the noise residual
+                latent_model_input = torch.cat(
+                    [latent_model_input, mask, masked_image_latents], dim=1
+                )
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    down_block_additional_residuals=down_block_res_samples,
+                    mid_block_additional_residual=mid_block_res_sample,
+                ).sample
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (
+                        noise_pred_text - noise_pred_uncond
+                    )
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(
+                    noise_pred, t, latents, **extra_step_kwargs
+                ).prev_sample
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or (
+                    (i + 1) > num_warmup_steps
+                    and (i + 1) % self.scheduler.order == 0
+                ):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+        # If we do sequential model offloading, let's offload unet and controlnet
+        # manually for max memory savings
+        if (
+            hasattr(self, "final_offload_hook")
+            and self.final_offload_hook is not None
+        ):
+            self.unet.to("cpu")
+            self.controlnet.to("cpu")
+            torch.cuda.empty_cache()
+        if output_type == "latent":
+            image = latents
+            has_nsfw_concept = None
+        elif output_type == "pil":
+            # 8. Post-processing
+            image = self.decode_latents(latents)
+            # 9. Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(
+                image, device, prompt_embeds.dtype
+            )
+            # 10. Convert to PIL
+            image = self.numpy_to_pil(image)
+        else:
+            # 8. Post-processing
+            image = self.decode_latents(latents)
+            # 9. Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(
+                image, device, prompt_embeds.dtype
+            )
+        # Offload last model to CPU
+        if (
+            hasattr(self, "final_offload_hook")
+            and self.final_offload_hook is not None
+        ):
+            self.final_offload_hook.offload()
+        if not return_dict:
+            return (image, has_nsfw_concept)
+        return StableDiffusionPipelineOutput(
+            images=image, nsfw_content_detected=has_nsfw_concept
+        )

diffusion_webui/helpers.py CHANGED Viewed

@@ -1,33 +1,48 @@
 from diffusion_webui.controlnet.controlnet_canny import (
     stable_diffusion_controlnet_canny_app,
 )
 from diffusion_webui.controlnet.controlnet_depth import (
     stable_diffusion_controlnet_depth_app,
 )
 from diffusion_webui.controlnet.controlnet_hed import (
     stable_diffusion_controlnet_hed_app,
 )
 from diffusion_webui.controlnet.controlnet_mlsd import (
     stable_diffusion_controlnet_mlsd_app,
 )
 from diffusion_webui.controlnet.controlnet_pose import (
     stable_diffusion_controlnet_pose_app,
 )
 from diffusion_webui.controlnet.controlnet_scribble import (
     stable_diffusion_controlnet_scribble_app,
 )
 from diffusion_webui.controlnet.controlnet_seg import (
     stable_diffusion_controlnet_seg_app,
 )
 from diffusion_webui.stable_diffusion.img2img_app import (
     stable_diffusion_img2img_app,
 )
 from diffusion_webui.stable_diffusion.inpaint_app import (
     stable_diffusion_inpaint_app,
 )
 from diffusion_webui.stable_diffusion.keras_txt2img import (
     keras_stable_diffusion_app,
 )
 from diffusion_webui.stable_diffusion.text2img_app import (
     stable_diffusion_text2img_app,
 )

 from diffusion_webui.controlnet.controlnet_canny import (
+    stable_diffusion_controlnet_canny,
     stable_diffusion_controlnet_canny_app,
 )
 from diffusion_webui.controlnet.controlnet_depth import (
+    stable_diffusion_controlnet_depth,
     stable_diffusion_controlnet_depth_app,
 )
 from diffusion_webui.controlnet.controlnet_hed import (
+    stable_diffusion_controlnet_hed,
     stable_diffusion_controlnet_hed_app,
 )
 from diffusion_webui.controlnet.controlnet_mlsd import (
+    stable_diffusion_controlnet_mlsd,
     stable_diffusion_controlnet_mlsd_app,
 )
 from diffusion_webui.controlnet.controlnet_pose import (
+    stable_diffusion_controlnet_pose,
     stable_diffusion_controlnet_pose_app,
 )
 from diffusion_webui.controlnet.controlnet_scribble import (
+    stable_diffusion_controlnet_scribble,
     stable_diffusion_controlnet_scribble_app,
 )
 from diffusion_webui.controlnet.controlnet_seg import (
+    stable_diffusion_controlnet_seg,
     stable_diffusion_controlnet_seg_app,
 )
+from diffusion_webui.controlnet_inpaint.controlnet_inpaint_app import (
+    stable_diffusion_inpiant_controlnet_canny,
+    stable_diffusion_inpiant_controlnet_canny_app,
+)
 from diffusion_webui.stable_diffusion.img2img_app import (
+    stable_diffusion_img2img,
     stable_diffusion_img2img_app,
 )
 from diffusion_webui.stable_diffusion.inpaint_app import (
+    stable_diffusion_inpaint,
     stable_diffusion_inpaint_app,
 )
 from diffusion_webui.stable_diffusion.keras_txt2img import (
+    keras_stable_diffusion,
     keras_stable_diffusion_app,
 )
 from diffusion_webui.stable_diffusion.text2img_app import (
+    stable_diffusion_text2img,
     stable_diffusion_text2img_app,
 )