Zafaflahfksdf committed
Commit • da3eeba • 1 Parent(s): ca1233a
Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes.
- .gitignore +142 -0
- LICENSE +201 -0
- README.md +149 -12
- README_DEV.md +61 -0
- fast_sam/__init__.py +9 -0
- fast_sam/fast_sam_wrapper.py +90 -0
- ia_check_versions.py +74 -0
- ia_config.py +115 -0
- ia_devices.py +10 -0
- ia_file_manager.py +71 -0
- ia_get_dataset_colormap.py +416 -0
- ia_logging.py +14 -0
- ia_sam_manager.py +182 -0
- ia_threading.py +55 -0
- ia_ui_gradio.py +30 -0
- ia_ui_items.py +110 -0
- iasam_app.py +809 -0
- images/inpaint_anything_explanation_image_1.png +0 -0
- images/inpaint_anything_ui_image_1.png +0 -0
- images/sample_input_image.png +0 -0
- images/sample_mask_image.png +0 -0
- images/sample_seg_color_image.png +0 -0
- inpalib/__init__.py +18 -0
- inpalib/masklib.py +106 -0
- inpalib/samlib.py +256 -0
- javascript/inpaint-anything.js +458 -0
- lama_cleaner/__init__.py +19 -0
- lama_cleaner/benchmark.py +109 -0
- lama_cleaner/const.py +173 -0
- lama_cleaner/file_manager/__init__.py +1 -0
- lama_cleaner/file_manager/file_manager.py +265 -0
- lama_cleaner/file_manager/storage_backends.py +46 -0
- lama_cleaner/file_manager/utils.py +67 -0
- lama_cleaner/helper.py +292 -0
- lama_cleaner/installer.py +12 -0
- lama_cleaner/model/__init__.py +0 -0
- lama_cleaner/model/base.py +298 -0
- lama_cleaner/model/controlnet.py +289 -0
- lama_cleaner/model/ddim_sampler.py +193 -0
- lama_cleaner/model/fcf.py +1733 -0
- lama_cleaner/model/instruct_pix2pix.py +83 -0
- lama_cleaner/model/lama.py +51 -0
- lama_cleaner/model/ldm.py +333 -0
- lama_cleaner/model/manga.py +91 -0
- lama_cleaner/model/mat.py +1935 -0
- lama_cleaner/model/opencv2.py +28 -0
- lama_cleaner/model/paint_by_example.py +79 -0
- lama_cleaner/model/pipeline/__init__.py +3 -0
- lama_cleaner/model/pipeline/pipeline_stable_diffusion_controlnet_inpaint.py +585 -0
- lama_cleaner/model/plms_sampler.py +225 -0
.gitignore
ADDED
@@ -0,0 +1,142 @@
*.pth
*.pt
*.pyc
src/
outputs/
models/
models
.DS_Store
ia_config.ini
.eslintrc
.eslintrc.json
pyproject.toml

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
LICENSE
ADDED
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.

"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:

(a) You must give any other recipients of the Work or Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.

You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
README.md
CHANGED
@@ -1,12 +1,149 @@
(The previous 12-line Space front matter — `---`, `title:`, and further lines not shown in this view — is replaced by the following file.)

---
title: _
app_file: iasam_app.py
sdk: gradio
sdk_version: 3.50.2
---
# Inpaint Anything (Inpainting with Segment Anything)

Inpaint Anything performs stable diffusion inpainting on a browser UI using any mask selected from the output of [Segment Anything](https://github.com/facebookresearch/segment-anything).

Using Segment Anything enables users to specify masks by simply pointing to the desired areas, instead of manually filling them in. This can increase the efficiency and accuracy of the mask creation process, leading to potentially higher-quality inpainting results while saving time and effort.

[Extension version for AUTOMATIC1111's Web UI](https://github.com/Uminosachi/sd-webui-inpaint-anything)

![Explanation image](images/inpaint_anything_explanation_image_1.png)

## Installation

Please follow these steps to install the software:

* Create a new conda environment:

```bash
conda create -n inpaint python=3.10
conda activate inpaint
```

* Clone the software repository:

```bash
git clone https://github.com/Uminosachi/inpaint-anything.git
cd inpaint-anything
```

* For the CUDA environment, install the following packages:

```bash
pip install -r requirements.txt
```

* If you are using macOS, please install the packages from the following file instead:

```bash
pip install -r requirements_mac.txt
```

## Running the application

```bash
python iasam_app.py
```

* Open http://127.0.0.1:7860/ in your browser.
* Note: If you have a privacy protection extension enabled in your web browser, such as DuckDuckGo, you may not be able to retrieve the mask from your sketch.

### Options

* `--save-seg`: Save the segmentation image generated by SAM.
* `--offline`: Execute inpainting using an offline network.
* `--sam-cpu`: Perform the Segment Anything operation on CPU.

## Downloading the Model

* Launch this application.
* Click on the `Download model` button, located next to the [Segment Anything Model ID](https://github.com/facebookresearch/segment-anything#model-checkpoints). This includes the [SAM 2](https://github.com/facebookresearch/segment-anything-2), [Segment Anything in High Quality Model ID](https://github.com/SysCV/sam-hq), [Fast Segment Anything](https://github.com/CASIA-IVA-Lab/FastSAM), and [Faster Segment Anything (MobileSAM)](https://github.com/ChaoningZhang/MobileSAM).
* Please note that SAM is available in three sizes: Base, Large, and Huge. Remember, larger sizes consume more VRAM.
* Wait for the download to complete.
* The downloaded model file will be stored in the `models` directory of this application's repository.

## Usage

* Drag and drop your image onto the input image area.
* Outpainting can be achieved via the `Padding options`: configure the scale and balance, then click on the `Run Padding` button.
* The `Anime Style` checkbox enhances segmentation mask detection, particularly in anime style images, at the expense of a slight reduction in mask quality.
* Click on the `Run Segment Anything` button.
* Use sketching to point to the area you want to inpaint. You can undo and adjust the pen size.
* Hover over either the SAM image or the mask image and press the `S` key for Fullscreen mode, or the `R` key to Reset zoom.
* Click on the `Create mask` button. The mask will appear in the selected mask image area.

### Mask Adjustment

* `Expand mask region` button: Use this to slightly expand the area of the mask for broader coverage.
* `Trim mask by sketch` button: Clicking this will exclude the sketched area from the mask.
* `Add mask by sketch` button: Clicking this will add the sketched area to the mask.

### Inpainting Tab

* Enter your desired Prompt and Negative Prompt, then choose the Inpainting Model ID.
* Click on the `Run Inpainting` button (**Please note that it may take some time to download the model for the first time**).
* In the Advanced options, you can adjust the Sampler, Sampling Steps, Guidance Scale, and Seed.
* If you enable the `Mask area Only` option, modifications will be confined to the designated mask area.
* Adjust the iteration slider to perform inpainting multiple times with different seeds.
* The inpainting process is powered by [diffusers](https://github.com/huggingface/diffusers).

#### Tips

* You can directly drag and drop the inpainted image into the input image field on the Web UI. (useful with Chrome and Edge browsers)

#### Model Cache
* The inpainting model, which is saved in HuggingFace's cache and includes `inpaint` (case-insensitive) in its repo_id, will also be added to the Inpainting Model ID dropdown list.
* If there's a specific model you'd like to use, you can cache it in advance using the following Python commands:
```bash
python
```
```python
from diffusers import StableDiffusionInpaintPipeline
pipe = StableDiffusionInpaintPipeline.from_pretrained("Uminosachi/dreamshaper_5-inpainting")
exit()
```
* The model downloaded by diffusers is typically stored in your home directory. You can find it at `/home/username/.cache/huggingface/hub` for Linux and macOS users, or at `C:\Users\username\.cache\huggingface\hub` for Windows users.
* When executing inpainting, if the following error is output to the console, try deleting the corresponding model from the cache folder mentioned above:
```
An error occurred while trying to fetch model name...
```

### Cleaner Tab

* Choose the Cleaner Model ID.
* Click on the `Run Cleaner` button (**Please note that it may take some time to download the model for the first time**).
* The cleaning process is performed using [Lama Cleaner](https://github.com/Sanster/lama-cleaner).

### Mask only Tab

* Gives the ability to save just the mask, without any other processing, so it can then be used in other graphics applications.
* `Get mask as alpha of image` button: Save the mask as an RGBA image, with the mask put into the alpha channel of the input image.
* `Get mask` button: Save the mask as an RGB image.

![UI image](images/inpaint_anything_ui_image_1.png)

## Auto-saving images

* The inpainted image will be automatically saved in the folder that matches the current date within the `outputs` directory.

## Development

With the [Inpaint Anything library](README_DEV.md), you can perform segmentation and create masks using sketches from other applications.

## License

The source code is licensed under the [Apache 2.0 license](LICENSE).

## References

* Ravi, N., Gabeur, V., Hu, Y.-T., Hu, R., Ryali, C., Ma, T., Khedr, H., Rädel, R., Rolland, C., Gustafson, L., Mintun, E., Pan, J., Alwala, K. V., Carion, N., Wu, C.-Y., Girshick, R., Dollár, P., & Feichtenhofer, C. (2024). [SAM 2: Segment Anything in Images and Videos](https://ai.meta.com/research/publications/sam-2-segment-anything-in-images-and-videos/). arXiv preprint.
* Kirillov, A., Mintun, E., Ravi, N., Mao, H., Rolland, C., Gustafson, L., Xiao, T., Whitehead, S., Berg, A. C., Lo, W-Y., Dollár, P., & Girshick, R. (2023). [Segment Anything](https://arxiv.org/abs/2304.02643). arXiv:2304.02643.
* Ke, L., Ye, M., Danelljan, M., Liu, Y., Tai, Y-W., Tang, C-K., & Yu, F. (2023). [Segment Anything in High Quality](https://arxiv.org/abs/2306.01567). arXiv:2306.01567.
* Zhao, X., Ding, W., An, Y., Du, Y., Yu, T., Li, M., Tang, M., & Wang, J. (2023). [Fast Segment Anything](https://arxiv.org/abs/2306.12156). arXiv:2306.12156 [cs.CV].
* Zhang, C., Han, D., Qiao, Y., Kim, J. U., Bae, S-H., Lee, S., & Hong, C. S. (2023). [Faster Segment Anything: Towards Lightweight SAM for Mobile Applications](https://arxiv.org/abs/2306.14289). arXiv:2306.14289.
README_DEV.md
ADDED
@@ -0,0 +1,61 @@
# Usage of Inpaint Anything Library

## Introduction

The `inpalib` from the `inpaint-anything` package lets you segment images and create masks using sketches from other applications.

## Code Breakdown

### Imports and Module Initialization

```python
import importlib

import numpy as np
from PIL import Image, ImageDraw

inpalib = importlib.import_module("inpaint-anything.inpalib")
```

### Fetch Model IDs

```python
available_sam_ids = inpalib.get_available_sam_ids()

use_sam_id = "sam_hq_vit_l.pth"
# assert use_sam_id in available_sam_ids, f"Invalid SAM ID: {use_sam_id}"
```

Note: Only the models downloaded via Inpaint Anything are available.

### Generate Segments Image

```python
input_image = np.array(Image.open("/path/to/image.png"))

sam_masks = inpalib.generate_sam_masks(input_image, use_sam_id, anime_style_chk=False)
sam_masks = inpalib.sort_masks_by_area(sam_masks)

seg_color_image = inpalib.create_seg_color_image(input_image, sam_masks)

Image.fromarray(seg_color_image).save("/path/to/seg_color_image.png")
```

<img src="images/sample_input_image.png" alt="drawing" width="256"/> <img src="images/sample_seg_color_image.png" alt="drawing" width="256"/>

### Create Mask from Sketch

```python
sketch_image = Image.fromarray(np.zeros_like(input_image))

draw = ImageDraw.Draw(sketch_image)
draw.point((input_image.shape[1] // 2, input_image.shape[0] // 2), fill=(255, 255, 255))

mask_image = inpalib.create_mask_image(np.array(sketch_image), sam_masks, ignore_black_chk=True)

Image.fromarray(mask_image).save("/path/to/mask_image.png")
```

<img src="images/sample_mask_image.png" alt="drawing" width="256"/>

Note: Ensure you adjust the file paths before executing the code.
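
A small follow-on sketch (not part of the original README_DEV.md): putting the mask created above into the alpha channel of the input image, similar to the `Get mask as alpha of image` button described in README.md. It assumes `input_image` and `mask_image` are the (H, W, 3) uint8 arrays produced by the snippets above.

```python
import numpy as np
from PIL import Image

# Assumption: both arrays are uint8, RGB, and the same size as in the snippets above.
rgba_image = np.dstack([input_image[:, :, :3], mask_image[:, :, 0]])
Image.fromarray(rgba_image, mode="RGBA").save("/path/to/alpha_image.png")
```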
fast_sam/__init__.py
ADDED
@@ -0,0 +1,9 @@
from .fast_sam_wrapper import FastSAM
from .fast_sam_wrapper import FastSamAutomaticMaskGenerator

fast_sam_model_registry = {
    "FastSAM-x": FastSAM,
    "FastSAM-s": FastSAM,
}

__all__ = ["FastSAM", "FastSamAutomaticMaskGenerator", "fast_sam_model_registry"]
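
For reference, a minimal sketch of how this registry can be consumed (it follows the same lookup pattern as segment-anything's `sam_model_registry`); the checkpoint path is an assumption, not part of the commit:

```python
from fast_sam import fast_sam_model_registry

# Both entries map to the same wrapper class; the checkpoint file decides the actual variant.
fast_sam = fast_sam_model_registry["FastSAM-x"](checkpoint="models/FastSAM-x.pt")
```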
fast_sam/fast_sam_wrapper.py
ADDED
@@ -0,0 +1,90 @@
import inspect
import math
from typing import Any, Dict, List

import cv2
import numpy as np
import torch
import ultralytics

if hasattr(ultralytics, "FastSAM"):
    from ultralytics import FastSAM as YOLO
else:
    from ultralytics import YOLO


class FastSAM:
    def __init__(
        self,
        checkpoint: str,
    ) -> None:
        self.model_path = checkpoint
        self.model = YOLO(self.model_path)

        if not hasattr(torch.nn.Upsample, "recompute_scale_factor"):
            torch.nn.Upsample.recompute_scale_factor = None

    def to(self, device) -> None:
        self.model.to(device)

    @property
    def device(self) -> Any:
        return self.model.device

    def __call__(self, source=None, stream=False, **kwargs) -> Any:
        return self.model(source=source, stream=stream, **kwargs)


class FastSamAutomaticMaskGenerator:
    def __init__(
        self,
        model: FastSAM,
        points_per_batch: int = None,
        pred_iou_thresh: float = None,
        stability_score_thresh: float = None,
    ) -> None:
        self.model = model
        self.points_per_batch = points_per_batch
        self.pred_iou_thresh = pred_iou_thresh
        self.stability_score_thresh = stability_score_thresh
        self.conf = 0.25 if stability_score_thresh >= 0.95 else 0.15

    def generate(self, image: np.ndarray) -> List[Dict[str, Any]]:
        height, width = image.shape[:2]
        new_height = math.ceil(height / 32) * 32
        new_width = math.ceil(width / 32) * 32
        resize_image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_CUBIC)

        backup_nn_dict = {}
        for key, _ in torch.nn.__dict__.copy().items():
            if not inspect.isclass(torch.nn.__dict__.get(key)) and "Norm" in key:
                backup_nn_dict[key] = torch.nn.__dict__.pop(key)

        results = self.model(
            source=resize_image,
            stream=False,
            imgsz=max(new_height, new_width),
            device=self.model.device,
            retina_masks=True,
            iou=0.7,
            conf=self.conf,
            max_det=256)

        for key, value in backup_nn_dict.items():
            setattr(torch.nn, key, value)
            # assert backup_nn_dict[key] == torch.nn.__dict__[key]

        annotations = results[0].masks.data

        if isinstance(annotations[0], torch.Tensor):
            annotations = np.array(annotations.cpu())

        annotations_list = []
        for mask in annotations:
            mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8))
            mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((7, 7), np.uint8))
            mask = cv2.resize(mask, (width, height), interpolation=cv2.INTER_AREA)

            annotations_list.append(dict(segmentation=mask.astype(bool)))

        return annotations_list
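
A minimal usage sketch of the wrapper above, assuming a FastSAM checkpoint has already been downloaded (the path `models/FastSAM-x.pt` and the sample image name are assumptions):

```python
import numpy as np
from PIL import Image

from fast_sam import FastSAM, FastSamAutomaticMaskGenerator

fast_sam = FastSAM(checkpoint="models/FastSAM-x.pt")  # assumed checkpoint location
fast_sam.to("cuda")  # or "cpu"

mask_generator = FastSamAutomaticMaskGenerator(
    fast_sam,
    pred_iou_thresh=0.88,
    stability_score_thresh=0.95,  # must be a float; the wrapper compares it against 0.95
)

image = np.array(Image.open("sample.png").convert("RGB"))
masks = mask_generator.generate(image)  # list of {"segmentation": (H, W) bool ndarray}
print(len(masks), masks[0]["segmentation"].shape)
```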
ia_check_versions.py
ADDED
@@ -0,0 +1,74 @@
from functools import cached_property
from importlib.metadata import version
from importlib.util import find_spec

import torch
from packaging.version import parse


def get_module_version(module_name):
    try:
        module_version = version(module_name)
    except Exception:
        module_version = None
    return module_version


def compare_version(version1, version2):
    if not isinstance(version1, str) or not isinstance(version2, str):
        return None

    if parse(version1) > parse(version2):
        return 1
    elif parse(version1) < parse(version2):
        return -1
    else:
        return 0


def compare_module_version(module_name, version_string):
    module_version = get_module_version(module_name)

    result = compare_version(module_version, version_string)
    return result if result is not None else -2


class IACheckVersions:
    @cached_property
    def diffusers_enable_cpu_offload(self):
        if (find_spec("diffusers") is not None and compare_module_version("diffusers", "0.15.0") >= 0 and
                find_spec("accelerate") is not None and compare_module_version("accelerate", "0.17.0") >= 0 and
                torch.cuda.is_available()):
            return True
        else:
            return False

    @cached_property
    def torch_mps_is_available(self):
        if compare_module_version("torch", "2.0.1") < 0:
            if not getattr(torch, "has_mps", False):
                return False
            try:
                torch.zeros(1).to(torch.device("mps"))
                return True
            except Exception:
                return False
        else:
            return torch.backends.mps.is_available() and torch.backends.mps.is_built()

    @cached_property
    def torch_on_amd_rocm(self):
        if find_spec("torch") is not None and "rocm" in version("torch"):
            return True
        else:
            return False

    @cached_property
    def gradio_version_is_old(self):
        if find_spec("gradio") is not None and compare_module_version("gradio", "3.34.0") <= 0:
            return True
        else:
            return False


ia_check_versions = IACheckVersions()
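
A short sketch of how these helpers are typically consulted (run from the repository root so the module is importable):

```python
from ia_check_versions import compare_module_version, ia_check_versions

# Returns 1/0/-1 for newer/equal/older than the given version, or -2 if the module is missing.
print(compare_module_version("torch", "2.0.1"))

if ia_check_versions.diffusers_enable_cpu_offload:
    print("diffusers >= 0.15.0, accelerate >= 0.17.0 and CUDA available: CPU offload can be enabled")
```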
ia_config.py
ADDED
@@ -0,0 +1,115 @@
import configparser
# import json
import os
from types import SimpleNamespace

from ia_ui_items import get_inp_model_ids, get_sam_model_ids


class IAConfig:
    SECTIONS = SimpleNamespace(
        DEFAULT=configparser.DEFAULTSECT,
        USER="USER",
    )

    KEYS = SimpleNamespace(
        SAM_MODEL_ID="sam_model_id",
        INP_MODEL_ID="inp_model_id",
    )

    PATHS = SimpleNamespace(
        INI=os.path.join(os.path.dirname(os.path.realpath(__file__)), "ia_config.ini"),
    )

    global_args = {}

    def __init__(self):
        self.ids_dict = {}
        self.ids_dict[IAConfig.KEYS.SAM_MODEL_ID] = {
            "list": get_sam_model_ids(),
            "index": 1,
        }
        self.ids_dict[IAConfig.KEYS.INP_MODEL_ID] = {
            "list": get_inp_model_ids(),
            "index": 0,
        }


ia_config = IAConfig()


def setup_ia_config_ini():
    ia_config_ini = configparser.ConfigParser(defaults={})
    if os.path.isfile(IAConfig.PATHS.INI):
        ia_config_ini.read(IAConfig.PATHS.INI, encoding="utf-8")

    changed = False
    for key, ids_info in ia_config.ids_dict.items():
        if not ia_config_ini.has_option(IAConfig.SECTIONS.DEFAULT, key):
            if len(ids_info["list"]) > ids_info["index"]:
                ia_config_ini[IAConfig.SECTIONS.DEFAULT][key] = ids_info["list"][ids_info["index"]]
                changed = True
        else:
            if len(ids_info["list"]) > ids_info["index"] and ia_config_ini[IAConfig.SECTIONS.DEFAULT][key] != ids_info["list"][ids_info["index"]]:
                ia_config_ini[IAConfig.SECTIONS.DEFAULT][key] = ids_info["list"][ids_info["index"]]
                changed = True

    if changed:
        with open(IAConfig.PATHS.INI, "w", encoding="utf-8") as f:
            ia_config_ini.write(f)


def get_ia_config(key, section=IAConfig.SECTIONS.DEFAULT):
    setup_ia_config_ini()

    ia_config_ini = configparser.ConfigParser(defaults={})
    ia_config_ini.read(IAConfig.PATHS.INI, encoding="utf-8")

    if ia_config_ini.has_option(section, key):
        return ia_config_ini[section][key]

    section = IAConfig.SECTIONS.DEFAULT
    if ia_config_ini.has_option(section, key):
        return ia_config_ini[section][key]

    return None


def get_ia_config_index(key, section=IAConfig.SECTIONS.DEFAULT):
    value = get_ia_config(key, section)

    ids_dict = ia_config.ids_dict
    if value is None:
        if key in ids_dict.keys():
            ids_info = ids_dict[key]
            return ids_info["index"]
        else:
            return 0
    else:
        if key in ids_dict.keys():
            ids_info = ids_dict[key]
            return ids_info["list"].index(value) if value in ids_info["list"] else ids_info["index"]
        else:
            return 0


def set_ia_config(key, value, section=IAConfig.SECTIONS.DEFAULT):
    setup_ia_config_ini()

    ia_config_ini = configparser.ConfigParser(defaults={})
    ia_config_ini.read(IAConfig.PATHS.INI, encoding="utf-8")

    if ia_config_ini.has_option(section, key) and ia_config_ini[section][key] == value:
        return

    if section != IAConfig.SECTIONS.DEFAULT and not ia_config_ini.has_section(section):
        ia_config_ini[section] = {}

    try:
        ia_config_ini[section][key] = value
    except Exception:
        ia_config_ini[section] = {}
        ia_config_ini[section][key] = value

    with open(IAConfig.PATHS.INI, "w", encoding="utf-8") as f:
        ia_config_ini.write(f)
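
A sketch of how the config helpers above persist UI selections to `ia_config.ini` (run from the repository root so `ia_ui_items` is importable; the checkpoint name is a hypothetical example value):

```python
from ia_config import IAConfig, get_ia_config, get_ia_config_index, set_ia_config

# Store the user's SAM choice under the USER section; falls back to DEFAULT when reading.
set_ia_config(IAConfig.KEYS.SAM_MODEL_ID, "sam_vit_l_0b3195.pth", IAConfig.SECTIONS.USER)  # hypothetical value
print(get_ia_config(IAConfig.KEYS.SAM_MODEL_ID, IAConfig.SECTIONS.USER))
print(get_ia_config_index(IAConfig.KEYS.SAM_MODEL_ID, IAConfig.SECTIONS.USER))  # index into the dropdown list
```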
ia_devices.py
ADDED
@@ -0,0 +1,10 @@
import torch


class TorchDevices:
    def __init__(self):
        self.cpu = torch.device("cpu")
        self.device = torch.device("cuda") if torch.cuda.is_available() else self.cpu


devices = TorchDevices()
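
A one-line usage sketch of the shared device object defined above:

```python
import torch
from ia_devices import devices

x = torch.zeros(1).to(devices.device)  # CUDA if available, otherwise CPU
```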
ia_file_manager.py
ADDED
@@ -0,0 +1,71 @@
import os
from datetime import datetime
from huggingface_hub import snapshot_download
from ia_logging import ia_logging


class IAFileManager:
    DOWNLOAD_COMPLETE = "Download complete"

    def __init__(self) -> None:
        self._ia_outputs_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                            "outputs",
                                            datetime.now().strftime("%Y-%m-%d"))

        self._ia_models_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "models")

    @property
    def outputs_dir(self) -> str:
        """Get inpaint-anything outputs directory.

        Returns:
            str: inpaint-anything outputs directory
        """
        if not os.path.isdir(self._ia_outputs_dir):
            os.makedirs(self._ia_outputs_dir, exist_ok=True)
        return self._ia_outputs_dir

    @property
    def models_dir(self) -> str:
        """Get inpaint-anything models directory.

        Returns:
            str: inpaint-anything models directory
        """
        if not os.path.isdir(self._ia_models_dir):
            os.makedirs(self._ia_models_dir, exist_ok=True)
        return self._ia_models_dir

    @property
    def savename_prefix(self) -> str:
        """Get inpaint-anything savename prefix.

        Returns:
            str: inpaint-anything savename prefix
        """
        return datetime.now().strftime("%Y%m%d-%H%M%S")


ia_file_manager = IAFileManager()


def download_model_from_hf(hf_model_id, local_files_only=False):
    """Download model from HuggingFace Hub.

    Args:
        hf_model_id (str): HuggingFace model id
        local_files_only (bool, optional): If True, use only local files. Defaults to False.

    Returns:
        str: download status
    """
    if not local_files_only:
        ia_logging.info(f"Downloading {hf_model_id}")
    try:
        snapshot_download(repo_id=hf_model_id, local_files_only=local_files_only)
    except FileNotFoundError:
        return f"{hf_model_id} not found, please download"
    except Exception as e:
        return str(e)

    return IAFileManager.DOWNLOAD_COMPLETE
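
A short sketch of the file-manager helpers above; the repo_id reuses the inpainting model mentioned in README.md:

```python
from ia_file_manager import download_model_from_hf, ia_file_manager

status = download_model_from_hf("Uminosachi/dreamshaper_5-inpainting")
print(status)  # "Download complete" on success, otherwise an error message

# Directories are created on first access; the prefix is a timestamp for saved images.
print(ia_file_manager.outputs_dir, ia_file_manager.models_dir, ia_file_manager.savename_prefix)
```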
ia_get_dataset_colormap.py
ADDED
@@ -0,0 +1,416 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Lint as: python2, python3
|
2 |
+
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
# ==============================================================================
|
16 |
+
"""Visualizes the segmentation results via specified color map.
|
17 |
+
|
18 |
+
Visualizes the semantic segmentation results by the color map
|
19 |
+
defined by the different datasets. Supported colormaps are:
|
20 |
+
|
21 |
+
* ADE20K (http://groups.csail.mit.edu/vision/datasets/ADE20K/).
|
22 |
+
|
23 |
+
* Cityscapes dataset (https://www.cityscapes-dataset.com).
|
24 |
+
|
25 |
+
* Mapillary Vistas (https://research.mapillary.com).
|
26 |
+
|
27 |
+
* PASCAL VOC 2012 (http://host.robots.ox.ac.uk/pascal/VOC/).
|
28 |
+
"""
|
29 |
+
|
30 |
+
from __future__ import absolute_import, division, print_function
|
31 |
+
|
32 |
+
import numpy as np
|
33 |
+
|
34 |
+
# from six.moves import range
|
35 |
+
|
36 |
+
# Dataset names.
|
37 |
+
_ADE20K = 'ade20k'
|
38 |
+
_CITYSCAPES = 'cityscapes'
|
39 |
+
_MAPILLARY_VISTAS = 'mapillary_vistas'
|
40 |
+
_PASCAL = 'pascal'
|
41 |
+
|
42 |
+
# Max number of entries in the colormap for each dataset.
|
43 |
+
_DATASET_MAX_ENTRIES = {
|
44 |
+
_ADE20K: 151,
|
45 |
+
_CITYSCAPES: 256,
|
46 |
+
_MAPILLARY_VISTAS: 66,
|
47 |
+
_PASCAL: 512,
|
48 |
+
}
|
49 |
+
|
50 |
+
|
51 |
+
def create_ade20k_label_colormap():
|
52 |
+
"""Creates a label colormap used in ADE20K segmentation benchmark.
|
53 |
+
|
54 |
+
Returns:
|
55 |
+
A colormap for visualizing segmentation results.
|
56 |
+
"""
|
57 |
+
return np.asarray([
|
58 |
+
[0, 0, 0],
|
59 |
+
[120, 120, 120],
|
60 |
+
[180, 120, 120],
|
61 |
+
[6, 230, 230],
|
62 |
+
[80, 50, 50],
|
63 |
+
[4, 200, 3],
|
64 |
+
[120, 120, 80],
|
65 |
+
[140, 140, 140],
|
66 |
+
[204, 5, 255],
|
67 |
+
[230, 230, 230],
|
68 |
+
[4, 250, 7],
|
69 |
+
[224, 5, 255],
|
70 |
+
[235, 255, 7],
|
71 |
+
[150, 5, 61],
|
72 |
+
[120, 120, 70],
|
73 |
+
[8, 255, 51],
|
74 |
+
[255, 6, 82],
|
75 |
+
[143, 255, 140],
|
76 |
+
[204, 255, 4],
|
77 |
+
[255, 51, 7],
|
78 |
+
[204, 70, 3],
|
79 |
+
[0, 102, 200],
|
80 |
+
[61, 230, 250],
|
81 |
+
[255, 6, 51],
|
82 |
+
[11, 102, 255],
|
83 |
+
[255, 7, 71],
|
84 |
+
[255, 9, 224],
|
85 |
+
[9, 7, 230],
|
86 |
+
[220, 220, 220],
|
87 |
+
[255, 9, 92],
|
88 |
+
[112, 9, 255],
|
89 |
+
[8, 255, 214],
|
90 |
+
[7, 255, 224],
|
91 |
+
[255, 184, 6],
|
92 |
+
[10, 255, 71],
|
93 |
+
[255, 41, 10],
|
94 |
+
[7, 255, 255],
|
95 |
+
[224, 255, 8],
|
96 |
+
[102, 8, 255],
|
97 |
+
[255, 61, 6],
|
98 |
+
[255, 194, 7],
|
99 |
+
[255, 122, 8],
|
100 |
+
[0, 255, 20],
|
101 |
+
[255, 8, 41],
|
102 |
+
[255, 5, 153],
|
103 |
+
[6, 51, 255],
|
104 |
+
[235, 12, 255],
|
105 |
+
[160, 150, 20],
|
106 |
+
[0, 163, 255],
|
107 |
+
[140, 140, 140],
|
108 |
+
[250, 10, 15],
|
109 |
+
[20, 255, 0],
|
110 |
+
[31, 255, 0],
|
111 |
+
[255, 31, 0],
|
112 |
+
[255, 224, 0],
|
113 |
+
[153, 255, 0],
|
114 |
+
[0, 0, 255],
|
115 |
+
[255, 71, 0],
|
116 |
+
[0, 235, 255],
|
117 |
+
[0, 173, 255],
|
118 |
+
[31, 0, 255],
|
119 |
+
[11, 200, 200],
|
120 |
+
[255, 82, 0],
|
121 |
+
[0, 255, 245],
|
122 |
+
[0, 61, 255],
|
123 |
+
[0, 255, 112],
|
124 |
+
[0, 255, 133],
|
125 |
+
[255, 0, 0],
|
126 |
+
[255, 163, 0],
|
127 |
+
[255, 102, 0],
|
128 |
+
[194, 255, 0],
|
129 |
+
[0, 143, 255],
|
130 |
+
[51, 255, 0],
|
131 |
+
[0, 82, 255],
|
132 |
+
[0, 255, 41],
|
133 |
+
[0, 255, 173],
|
134 |
+
[10, 0, 255],
|
135 |
+
[173, 255, 0],
|
136 |
+
[0, 255, 153],
|
137 |
+
[255, 92, 0],
|
138 |
+
[255, 0, 255],
|
139 |
+
[255, 0, 245],
|
140 |
+
[255, 0, 102],
|
141 |
+
[255, 173, 0],
|
142 |
+
[255, 0, 20],
|
143 |
+
[255, 184, 184],
|
144 |
+
[0, 31, 255],
|
145 |
+
[0, 255, 61],
|
146 |
+
[0, 71, 255],
|
147 |
+
[255, 0, 204],
|
148 |
+
[0, 255, 194],
|
149 |
+
[0, 255, 82],
|
150 |
+
[0, 10, 255],
|
151 |
+
[0, 112, 255],
|
152 |
+
[51, 0, 255],
|
153 |
+
[0, 194, 255],
|
154 |
+
[0, 122, 255],
|
155 |
+
[0, 255, 163],
|
156 |
+
[255, 153, 0],
|
157 |
+
[0, 255, 10],
|
158 |
+
[255, 112, 0],
|
159 |
+
[143, 255, 0],
|
160 |
+
[82, 0, 255],
|
161 |
+
[163, 255, 0],
|
162 |
+
[255, 235, 0],
|
163 |
+
[8, 184, 170],
|
164 |
+
[133, 0, 255],
|
165 |
+
[0, 255, 92],
|
166 |
+
[184, 0, 255],
|
167 |
+
[255, 0, 31],
|
168 |
+
[0, 184, 255],
|
169 |
+
[0, 214, 255],
|
170 |
+
[255, 0, 112],
|
171 |
+
[92, 255, 0],
|
172 |
+
[0, 224, 255],
|
173 |
+
[112, 224, 255],
|
174 |
+
[70, 184, 160],
|
175 |
+
[163, 0, 255],
|
176 |
+
[153, 0, 255],
|
177 |
+
[71, 255, 0],
|
178 |
+
[255, 0, 163],
|
179 |
+
[255, 204, 0],
|
180 |
+
[255, 0, 143],
|
181 |
+
[0, 255, 235],
|
182 |
+
[133, 255, 0],
|
183 |
+
[255, 0, 235],
|
184 |
+
[245, 0, 255],
|
185 |
+
[255, 0, 122],
|
186 |
+
[255, 245, 0],
|
187 |
+
[10, 190, 212],
|
188 |
+
[214, 255, 0],
|
189 |
+
[0, 204, 255],
|
190 |
+
[20, 0, 255],
|
191 |
+
[255, 255, 0],
|
192 |
+
[0, 153, 255],
|
193 |
+
[0, 41, 255],
|
194 |
+
[0, 255, 204],
|
195 |
+
[41, 0, 255],
|
196 |
+
[41, 255, 0],
|
197 |
+
[173, 0, 255],
|
198 |
+
[0, 245, 255],
|
199 |
+
[71, 0, 255],
|
200 |
+
[122, 0, 255],
|
201 |
+
[0, 255, 184],
|
202 |
+
[0, 92, 255],
|
203 |
+
[184, 255, 0],
|
204 |
+
[0, 133, 255],
|
205 |
+
[255, 214, 0],
|
206 |
+
[25, 194, 194],
|
207 |
+
[102, 255, 0],
|
208 |
+
[92, 0, 255],
|
209 |
+
])
|
210 |
+
|
211 |
+
|
212 |
+
def create_cityscapes_label_colormap():
|
213 |
+
"""Creates a label colormap used in CITYSCAPES segmentation benchmark.
|
214 |
+
|
215 |
+
Returns:
|
216 |
+
A colormap for visualizing segmentation results.
|
217 |
+
"""
|
218 |
+
colormap = np.zeros((256, 3), dtype=np.uint8)
|
219 |
+
colormap[0] = [128, 64, 128]
|
220 |
+
colormap[1] = [244, 35, 232]
|
221 |
+
colormap[2] = [70, 70, 70]
|
222 |
+
colormap[3] = [102, 102, 156]
|
223 |
+
colormap[4] = [190, 153, 153]
|
224 |
+
colormap[5] = [153, 153, 153]
|
225 |
+
colormap[6] = [250, 170, 30]
|
226 |
  colormap[7] = [220, 220, 0]
  colormap[8] = [107, 142, 35]
  colormap[9] = [152, 251, 152]
  colormap[10] = [70, 130, 180]
  colormap[11] = [220, 20, 60]
  colormap[12] = [255, 0, 0]
  colormap[13] = [0, 0, 142]
  colormap[14] = [0, 0, 70]
  colormap[15] = [0, 60, 100]
  colormap[16] = [0, 80, 100]
  colormap[17] = [0, 0, 230]
  colormap[18] = [119, 11, 32]
  return colormap


def create_mapillary_vistas_label_colormap():
  """Creates a label colormap used in Mapillary Vistas segmentation benchmark.

  Returns:
    A colormap for visualizing segmentation results.
  """
  return np.asarray([
      [165, 42, 42],
      [0, 192, 0],
      [196, 196, 196],
      [190, 153, 153],
      [180, 165, 180],
      [102, 102, 156],
      [102, 102, 156],
      [128, 64, 255],
      [140, 140, 200],
      [170, 170, 170],
      [250, 170, 160],
      [96, 96, 96],
      [230, 150, 140],
      [128, 64, 128],
      [110, 110, 110],
      [244, 35, 232],
      [150, 100, 100],
      [70, 70, 70],
      [150, 120, 90],
      [220, 20, 60],
      [255, 0, 0],
      [255, 0, 0],
      [255, 0, 0],
      [200, 128, 128],
      [255, 255, 255],
      [64, 170, 64],
      [128, 64, 64],
      [70, 130, 180],
      [255, 255, 255],
      [152, 251, 152],
      [107, 142, 35],
      [0, 170, 30],
      [255, 255, 128],
      [250, 0, 30],
      [0, 0, 0],
      [220, 220, 220],
      [170, 170, 170],
      [222, 40, 40],
      [100, 170, 30],
      [40, 40, 40],
      [33, 33, 33],
      [170, 170, 170],
      [0, 0, 142],
      [170, 170, 170],
      [210, 170, 100],
      [153, 153, 153],
      [128, 128, 128],
      [0, 0, 142],
      [250, 170, 30],
      [192, 192, 192],
      [220, 220, 0],
      [180, 165, 180],
      [119, 11, 32],
      [0, 0, 142],
      [0, 60, 100],
      [0, 0, 142],
      [0, 0, 90],
      [0, 0, 230],
      [0, 80, 100],
      [128, 64, 64],
      [0, 0, 110],
      [0, 0, 70],
      [0, 0, 192],
      [32, 32, 32],
      [0, 0, 0],
      [0, 0, 0],
  ])


def create_pascal_label_colormap():
  """Creates a label colormap used in PASCAL VOC segmentation benchmark.

  Returns:
    A colormap for visualizing segmentation results.
  """
  colormap = np.zeros((_DATASET_MAX_ENTRIES[_PASCAL], 3), dtype=int)
  ind = np.arange(_DATASET_MAX_ENTRIES[_PASCAL], dtype=int)

  for shift in reversed(list(range(8))):
    for channel in range(3):
      colormap[:, channel] |= bit_get(ind, channel) << shift
    ind >>= 3

  return colormap


def get_ade20k_name():
  return _ADE20K


def get_cityscapes_name():
  return _CITYSCAPES


def get_mapillary_vistas_name():
  return _MAPILLARY_VISTAS


def get_pascal_name():
  return _PASCAL


def bit_get(val, idx):
  """Gets the bit value.

  Args:
    val: Input value, int or numpy int array.
    idx: Which bit of the input val.

  Returns:
    The "idx"-th bit of input val.
  """
  return (val >> idx) & 1


def create_label_colormap(dataset=_PASCAL):
  """Creates a label colormap for the specified dataset.

  Args:
    dataset: The colormap used in the dataset.

  Returns:
    A numpy array of the dataset colormap.

  Raises:
    ValueError: If the dataset is not supported.
  """
  if dataset == _ADE20K:
    return create_ade20k_label_colormap()
  elif dataset == _CITYSCAPES:
    return create_cityscapes_label_colormap()
  elif dataset == _MAPILLARY_VISTAS:
    return create_mapillary_vistas_label_colormap()
  elif dataset == _PASCAL:
    return create_pascal_label_colormap()
  else:
    raise ValueError('Unsupported dataset.')


def label_to_color_image(label, dataset=_PASCAL):
  """Adds color defined by the dataset colormap to the label.

  Args:
    label: A 2D array with integer type, storing the segmentation label.
    dataset: The colormap used in the dataset.

  Returns:
    result: A 2D array with floating type. The element of the array
      is the color indexed by the corresponding element in the input label
      to the dataset color map.

  Raises:
    ValueError: If label is not of rank 2 or its value is larger than color
      map maximum entry.
  """
  if label.ndim != 2:
    raise ValueError('Expect 2-D input label. Got {}'.format(label.shape))

  if np.max(label) >= _DATASET_MAX_ENTRIES[dataset]:
    raise ValueError(
        'label value too large: {} >= {}.'.format(
            np.max(label), _DATASET_MAX_ENTRIES[dataset]))

  colormap = create_label_colormap(dataset)
  return colormap[label]


def get_dataset_colormap_max_entries(dataset):
  return _DATASET_MAX_ENTRIES[dataset]
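Note: a minimal usage sketch of the colormap helpers above; the tiny `label` array is an invented example, and importing the file as a top-level module assumes it sits on the Python path as in this repository layout.

import numpy as np
import ia_get_dataset_colormap as dataset_colormap

label = np.array([[0, 1], [2, 3]], dtype=np.int32)  # made-up 2x2 label map with PASCAL class indices
color_image = dataset_colormap.label_to_color_image(label, dataset=dataset_colormap.get_pascal_name())
print(color_image.shape)  # (2, 2, 3): one RGB triple per label entry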
ia_logging.py
ADDED
@@ -0,0 +1,14 @@
import logging
import warnings

warnings.filterwarnings(action="ignore", category=FutureWarning, module="transformers")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="huggingface_hub")

ia_logging = logging.getLogger("Inpaint Anything")
ia_logging.setLevel(logging.INFO)
ia_logging.propagate = False

ia_logging_sh = logging.StreamHandler()
ia_logging_sh.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
ia_logging_sh.setLevel(logging.INFO)
ia_logging.addHandler(ia_logging_sh)
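Note: the other modules in this repository consume this pre-configured logger as in this minimal sketch.

from ia_logging import ia_logging

ia_logging.info("message goes to the configured StreamHandler")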
ia_sam_manager.py
ADDED
@@ -0,0 +1,182 @@
import os
import platform
from functools import partial

import torch

from fast_sam import FastSamAutomaticMaskGenerator, fast_sam_model_registry
from ia_check_versions import ia_check_versions
from ia_config import IAConfig
from ia_devices import devices
from ia_logging import ia_logging
from mobile_sam import SamAutomaticMaskGenerator as SamAutomaticMaskGeneratorMobile
from mobile_sam import SamPredictor as SamPredictorMobile
from mobile_sam import sam_model_registry as sam_model_registry_mobile
from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
from sam2.build_sam import build_sam2
from segment_anything_fb import SamAutomaticMaskGenerator, SamPredictor, sam_model_registry
from segment_anything_hq import SamAutomaticMaskGenerator as SamAutomaticMaskGeneratorHQ
from segment_anything_hq import SamPredictor as SamPredictorHQ
from segment_anything_hq import sam_model_registry as sam_model_registry_hq


def check_bfloat16_support() -> bool:
    if torch.cuda.is_available():
        compute_capability = torch.cuda.get_device_capability(torch.cuda.current_device())
        if compute_capability[0] >= 8:
            ia_logging.debug("The CUDA device supports bfloat16")
            return True
        else:
            ia_logging.debug("The CUDA device does not support bfloat16")
            return False
    else:
        ia_logging.debug("CUDA is not available")
        return False


def partial_from_end(func, /, *fixed_args, **fixed_kwargs):
    def wrapper(*args, **kwargs):
        updated_kwargs = {**fixed_kwargs, **kwargs}
        return func(*args, *fixed_args, **updated_kwargs)
    return wrapper


def rename_args(func, arg_map):
    def wrapper(*args, **kwargs):
        new_kwargs = {arg_map.get(k, k): v for k, v in kwargs.items()}
        return func(*args, **new_kwargs)
    return wrapper


arg_map = {"checkpoint": "ckpt_path"}
rename_build_sam2 = rename_args(build_sam2, arg_map)
end_kwargs = dict(device="cpu", mode="eval", hydra_overrides_extra=[], apply_postprocessing=False)
sam2_model_registry = {
    "sam2_hiera_large": partial(partial_from_end(rename_build_sam2, **end_kwargs), "sam2_hiera_l.yaml"),
    "sam2_hiera_base_plus": partial(partial_from_end(rename_build_sam2, **end_kwargs), "sam2_hiera_b+.yaml"),
    "sam2_hiera_small": partial(partial_from_end(rename_build_sam2, **end_kwargs), "sam2_hiera_s.yaml"),
    "sam2_hiera_tiny": partial(partial_from_end(rename_build_sam2, **end_kwargs), "sam2_hiera_t.yaml"),
}


def get_sam_mask_generator(sam_checkpoint, anime_style_chk=False):
    """Get SAM mask generator.

    Args:
        sam_checkpoint (str): SAM checkpoint path

    Returns:
        SamAutomaticMaskGenerator or None: SAM mask generator
    """
    points_per_batch = 64
    if "_hq_" in os.path.basename(sam_checkpoint):
        model_type = os.path.basename(sam_checkpoint)[7:12]
        sam_model_registry_local = sam_model_registry_hq
        SamAutomaticMaskGeneratorLocal = SamAutomaticMaskGeneratorHQ
        points_per_batch = 32
    elif "FastSAM" in os.path.basename(sam_checkpoint):
        model_type = os.path.splitext(os.path.basename(sam_checkpoint))[0]
        sam_model_registry_local = fast_sam_model_registry
        SamAutomaticMaskGeneratorLocal = FastSamAutomaticMaskGenerator
        points_per_batch = None
    elif "mobile_sam" in os.path.basename(sam_checkpoint):
        model_type = "vit_t"
        sam_model_registry_local = sam_model_registry_mobile
        SamAutomaticMaskGeneratorLocal = SamAutomaticMaskGeneratorMobile
        points_per_batch = 64
    elif "sam2_" in os.path.basename(sam_checkpoint):
        model_type = os.path.splitext(os.path.basename(sam_checkpoint))[0]
        sam_model_registry_local = sam2_model_registry
        SamAutomaticMaskGeneratorLocal = SAM2AutomaticMaskGenerator
        points_per_batch = 128
    else:
        model_type = os.path.basename(sam_checkpoint)[4:9]
        sam_model_registry_local = sam_model_registry
        SamAutomaticMaskGeneratorLocal = SamAutomaticMaskGenerator
        points_per_batch = 64

    pred_iou_thresh = 0.88 if not anime_style_chk else 0.83
    stability_score_thresh = 0.95 if not anime_style_chk else 0.9

    if "sam2_" in model_type:
        pred_iou_thresh = round(pred_iou_thresh - 0.18, 2)
        stability_score_thresh = round(stability_score_thresh - 0.03, 2)
        sam2_gen_kwargs = dict(
            points_per_side=64,
            points_per_batch=points_per_batch,
            pred_iou_thresh=pred_iou_thresh,
            stability_score_thresh=stability_score_thresh,
            stability_score_offset=0.7,
            crop_n_layers=1,
            box_nms_thresh=0.7,
            crop_n_points_downscale_factor=2)
        if platform.system() == "Darwin":
            sam2_gen_kwargs.update(dict(points_per_side=32, points_per_batch=64, crop_n_points_downscale_factor=1))

    if os.path.isfile(sam_checkpoint):
        sam = sam_model_registry_local[model_type](checkpoint=sam_checkpoint)
        if platform.system() == "Darwin":
            if "FastSAM" in os.path.basename(sam_checkpoint) or not ia_check_versions.torch_mps_is_available:
                sam.to(device=torch.device("cpu"))
            else:
                sam.to(device=torch.device("mps"))
        else:
            if IAConfig.global_args.get("sam_cpu", False):
                ia_logging.info("SAM is running on CPU... (the option has been selected)")
                sam.to(device=devices.cpu)
            else:
                sam.to(device=devices.device)
        sam_gen_kwargs = dict(
            model=sam, points_per_batch=points_per_batch, pred_iou_thresh=pred_iou_thresh, stability_score_thresh=stability_score_thresh)
        if "sam2_" in model_type:
            sam_gen_kwargs.update(sam2_gen_kwargs)
        sam_mask_generator = SamAutomaticMaskGeneratorLocal(**sam_gen_kwargs)
    else:
        sam_mask_generator = None

    return sam_mask_generator


def get_sam_predictor(sam_checkpoint):
    """Get SAM predictor.

    Args:
        sam_checkpoint (str): SAM checkpoint path

    Returns:
        SamPredictor or None: SAM predictor
    """
    # model_type = "vit_h"
    if "_hq_" in os.path.basename(sam_checkpoint):
        model_type = os.path.basename(sam_checkpoint)[7:12]
        sam_model_registry_local = sam_model_registry_hq
        SamPredictorLocal = SamPredictorHQ
    elif "FastSAM" in os.path.basename(sam_checkpoint):
        raise NotImplementedError("FastSAM predictor is not implemented yet.")
    elif "mobile_sam" in os.path.basename(sam_checkpoint):
        model_type = "vit_t"
        sam_model_registry_local = sam_model_registry_mobile
        SamPredictorLocal = SamPredictorMobile
    else:
        model_type = os.path.basename(sam_checkpoint)[4:9]
        sam_model_registry_local = sam_model_registry
        SamPredictorLocal = SamPredictor

    if os.path.isfile(sam_checkpoint):
        sam = sam_model_registry_local[model_type](checkpoint=sam_checkpoint)
        if platform.system() == "Darwin":
            if "FastSAM" in os.path.basename(sam_checkpoint) or not ia_check_versions.torch_mps_is_available:
                sam.to(device=torch.device("cpu"))
            else:
                sam.to(device=torch.device("mps"))
        else:
            if IAConfig.global_args.get("sam_cpu", False):
                ia_logging.info("SAM is running on CPU... (the option has been selected)")
                sam.to(device=devices.cpu)
            else:
                sam.to(device=devices.device)
        sam_predictor = SamPredictorLocal(sam)
    else:
        sam_predictor = None

    return sam_predictor
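Note: a minimal usage sketch for the factory above, assuming the checkpoint has already been downloaded into the models directory and `image` is an RGB uint8 numpy array.

from ia_sam_manager import get_sam_mask_generator

mask_generator = get_sam_mask_generator("models/sam_vit_b_01ec64.pth")
if mask_generator is not None:
    sam_masks = mask_generator.generate(image)  # list of dicts with "segmentation" and "area" entries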
ia_threading.py
ADDED
@@ -0,0 +1,55 @@
import gc
import inspect
import threading
from functools import wraps

import torch

from ia_check_versions import ia_check_versions

model_access_sem = threading.Semaphore(1)


def torch_gc():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    if ia_check_versions.torch_mps_is_available:
        if hasattr(torch, "mps") and hasattr(torch.mps, "empty_cache"):
            torch.mps.empty_cache()


def clear_cache():
    gc.collect()
    torch_gc()


def post_clear_cache(sem):
    with sem:
        gc.collect()
        torch_gc()


def async_post_clear_cache():
    thread = threading.Thread(target=post_clear_cache, args=(model_access_sem,))
    thread.start()


def clear_cache_decorator(func):
    @wraps(func)
    def yield_wrapper(*args, **kwargs):
        clear_cache()
        yield from func(*args, **kwargs)
        clear_cache()

    @wraps(func)
    def wrapper(*args, **kwargs):
        clear_cache()
        res = func(*args, **kwargs)
        clear_cache()
        return res

    if inspect.isgeneratorfunction(func):
        return yield_wrapper
    else:
        return wrapper
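Note: clear_cache_decorator is applied to the UI callbacks in iasam_app.py; a minimal sketch of the pattern (the function name is illustrative only).

from ia_threading import clear_cache_decorator

@clear_cache_decorator
def run_heavy_step(image):
    # caches are cleared before this body runs and again after it returns
    return image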
ia_ui_gradio.py
ADDED
@@ -0,0 +1,30 @@
import os

import gradio as gr

GradioTemplateResponseOriginal = gr.routes.templates.TemplateResponse


def webpath(fn):
    web_path = os.path.realpath(fn)

    return f'file={web_path}?{os.path.getmtime(fn)}'


def javascript_html():
    script_path = os.path.join(os.path.dirname(__file__), "javascript", "inpaint-anything.js")
    head = f'<script type="text/javascript" src="{webpath(script_path)}"></script>\n'

    return head


def reload_javascript():
    js = javascript_html()

    def template_response(*args, **kwargs):
        res = GradioTemplateResponseOriginal(*args, **kwargs)
        res.body = res.body.replace(b'</head>', f'{js}</head>'.encode("utf8"))
        res.init_headers()
        return res

    gr.routes.templates.TemplateResponse = template_response
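Note: reload_javascript() is meant to be called once before the Gradio Blocks are built (as iasam_app.py does), so every rendered page picks up the patched TemplateResponse; a minimal sketch:

from ia_ui_gradio import reload_javascript

reload_javascript()  # injects javascript/inpaint-anything.js into the <head> of served pages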
ia_ui_items.py
ADDED
@@ -0,0 +1,110 @@
from huggingface_hub import scan_cache_dir


def get_sampler_names():
    """Get sampler name list.

    Returns:
        list: sampler name list
    """
    sampler_names = [
        "DDIM",
        "Euler",
        "Euler a",
        "DPM2 Karras",
        "DPM2 a Karras",
    ]
    return sampler_names


def get_sam_model_ids():
    """Get SAM model ids list.

    Returns:
        list: SAM model ids list
    """
    sam_model_ids = [
        "sam2_hiera_large.pt",
        "sam2_hiera_base_plus.pt",
        "sam2_hiera_small.pt",
        "sam2_hiera_tiny.pt",
        "sam_vit_h_4b8939.pth",
        "sam_vit_l_0b3195.pth",
        "sam_vit_b_01ec64.pth",
        "sam_hq_vit_h.pth",
        "sam_hq_vit_l.pth",
        "sam_hq_vit_b.pth",
        "FastSAM-x.pt",
        "FastSAM-s.pt",
        "mobile_sam.pt",
    ]
    return sam_model_ids


inp_list_from_cache = None


def get_inp_model_ids():
    """Get inpainting model ids list.

    Returns:
        list: model ids list
    """
    global inp_list_from_cache
    model_ids = [
        "stabilityai/stable-diffusion-2-inpainting",
        "Uminosachi/dreamshaper_8Inpainting",
        "Uminosachi/deliberate_v3-inpainting",
        "Uminosachi/realisticVisionV51_v51VAE-inpainting",
        "Uminosachi/revAnimated_v121Inp-inpainting",
        "runwayml/stable-diffusion-inpainting",
    ]
    if inp_list_from_cache is not None and isinstance(inp_list_from_cache, list):
        model_ids.extend(inp_list_from_cache)
        return model_ids
    try:
        hf_cache_info = scan_cache_dir()
        inpaint_repos = []
        for repo in hf_cache_info.repos:
            if repo.repo_type == "model" and "inpaint" in repo.repo_id.lower() and repo.repo_id not in model_ids:
                inpaint_repos.append(repo.repo_id)
        inp_list_from_cache = sorted(inpaint_repos, reverse=True, key=lambda x: x.split("/")[-1])
        model_ids.extend(inp_list_from_cache)
        return model_ids
    except Exception:
        return model_ids


def get_cleaner_model_ids():
    """Get cleaner model ids list.

    Returns:
        list: model ids list
    """
    model_ids = [
        "lama",
        "ldm",
        "zits",
        "mat",
        "fcf",
        "manga",
    ]
    return model_ids


def get_padding_mode_names():
    """Get padding mode name list.

    Returns:
        list: padding mode name list
    """
    padding_mode_names = [
        "constant",
        "edge",
        "reflect",
        "mean",
        "median",
        "maximum",
        "minimum",
    ]
    return padding_mode_names
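Note: a quick sketch of how these choice lists feed the dropdowns in the UI code below.

from ia_ui_items import get_sam_model_ids, get_sampler_names

print(get_sam_model_ids()[0])   # "sam2_hiera_large.pt"
print(get_sampler_names()[0])   # "DDIM"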
iasam_app.py
ADDED
@@ -0,0 +1,809 @@
import argparse
# import math
import gc
import os
import platform

if platform.system() == "Darwin":
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

if platform.system() == "Windows":
    os.environ["XFORMERS_FORCE_DISABLE_TRITON"] = "1"

import random
import traceback
from importlib.util import find_spec

import cv2
import gradio as gr
import numpy as np
import torch
from diffusers import (DDIMScheduler, EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
                       KDPM2AncestralDiscreteScheduler, KDPM2DiscreteScheduler,
                       StableDiffusionInpaintPipeline)
from PIL import Image, ImageFilter
from PIL.PngImagePlugin import PngInfo
from torch.hub import download_url_to_file
from torchvision import transforms

import inpalib
from ia_check_versions import ia_check_versions
from ia_config import IAConfig, get_ia_config_index, set_ia_config, setup_ia_config_ini
from ia_devices import devices
from ia_file_manager import IAFileManager, download_model_from_hf, ia_file_manager
from ia_logging import ia_logging
from ia_threading import clear_cache_decorator
from ia_ui_gradio import reload_javascript
from ia_ui_items import (get_cleaner_model_ids, get_inp_model_ids, get_padding_mode_names,
                         get_sam_model_ids, get_sampler_names)
from lama_cleaner.model_manager import ModelManager
from lama_cleaner.schema import Config, HDStrategy, LDMSampler, SDSampler

print("platform:", platform.system())

reload_javascript()

if find_spec("xformers") is not None:
    xformers_available = True
else:
    xformers_available = False

parser = argparse.ArgumentParser(description="Inpaint Anything")
parser.add_argument("--save-seg", action="store_true", help="Save the segmentation image generated by SAM.")
parser.add_argument("--offline", action="store_true", help="Execute inpainting using an offline network.")
parser.add_argument("--sam-cpu", action="store_true", help="Perform the Segment Anything operation on CPU.")
args = parser.parse_args()
IAConfig.global_args.update(args.__dict__)


@clear_cache_decorator
def download_model(sam_model_id):
    """Download SAM model.

    Args:
        sam_model_id (str): SAM model id

    Returns:
        str: download status
    """
    if "_hq_" in sam_model_id:
        url_sam = "https://huggingface.co/Uminosachi/sam-hq/resolve/main/" + sam_model_id
    elif "FastSAM" in sam_model_id:
        url_sam = "https://huggingface.co/Uminosachi/FastSAM/resolve/main/" + sam_model_id
    elif "mobile_sam" in sam_model_id:
        url_sam = "https://huggingface.co/Uminosachi/MobileSAM/resolve/main/" + sam_model_id
    elif "sam2_" in sam_model_id:
        url_sam = "https://dl.fbaipublicfiles.com/segment_anything_2/072824/" + sam_model_id
    else:
        url_sam = "https://dl.fbaipublicfiles.com/segment_anything/" + sam_model_id

    sam_checkpoint = os.path.join(ia_file_manager.models_dir, sam_model_id)
    if not os.path.isfile(sam_checkpoint):
        try:
            download_url_to_file(url_sam, sam_checkpoint)
        except Exception as e:
            ia_logging.error(str(e))
            return str(e)

        return IAFileManager.DOWNLOAD_COMPLETE
    else:
        return "Model already exists"


sam_dict = dict(sam_masks=None, mask_image=None, cnet=None, orig_image=None, pad_mask=None)


def save_mask_image(mask_image, save_mask_chk=False):
    """Save mask image.

    Args:
        mask_image (np.ndarray): mask image
        save_mask_chk (bool, optional): If True, save mask image. Defaults to False.

    Returns:
        None
    """
    if save_mask_chk:
        save_name = "_".join([ia_file_manager.savename_prefix, "created_mask"]) + ".png"
        save_name = os.path.join(ia_file_manager.outputs_dir, save_name)
        Image.fromarray(mask_image).save(save_name)


@clear_cache_decorator
def input_image_upload(input_image, sam_image, sel_mask):
    global sam_dict
    sam_dict["orig_image"] = input_image
    sam_dict["pad_mask"] = None

    if (sam_dict["mask_image"] is None or not isinstance(sam_dict["mask_image"], np.ndarray) or
            sam_dict["mask_image"].shape != input_image.shape):
        sam_dict["mask_image"] = np.zeros_like(input_image, dtype=np.uint8)

    ret_sel_image = cv2.addWeighted(input_image, 0.5, sam_dict["mask_image"], 0.5, 0)

    if sam_image is None or not isinstance(sam_image, dict) or "image" not in sam_image:
        sam_dict["sam_masks"] = None
        ret_sam_image = np.zeros_like(input_image, dtype=np.uint8)
    elif sam_image["image"].shape == input_image.shape:
        ret_sam_image = gr.update()
    else:
        sam_dict["sam_masks"] = None
        ret_sam_image = gr.update(value=np.zeros_like(input_image, dtype=np.uint8))

    if sel_mask is None or not isinstance(sel_mask, dict) or "image" not in sel_mask:
        ret_sel_mask = ret_sel_image
    elif sel_mask["image"].shape == ret_sel_image.shape and np.all(sel_mask["image"] == ret_sel_image):
        ret_sel_mask = gr.update()
    else:
        ret_sel_mask = gr.update(value=ret_sel_image)

    return ret_sam_image, ret_sel_mask, gr.update(interactive=True)


@clear_cache_decorator
def run_padding(input_image, pad_scale_width, pad_scale_height, pad_lr_barance, pad_tb_barance, padding_mode="edge"):
    global sam_dict
    if input_image is None or sam_dict["orig_image"] is None:
        sam_dict["orig_image"] = None
        sam_dict["pad_mask"] = None
        return None, "Input image not found"

    orig_image = sam_dict["orig_image"]

    height, width = orig_image.shape[:2]
    pad_width, pad_height = (int(width * pad_scale_width), int(height * pad_scale_height))
    ia_logging.info(f"resize by padding: ({height}, {width}) -> ({pad_height}, {pad_width})")

    pad_size_w, pad_size_h = (pad_width - width, pad_height - height)
    pad_size_l = int(pad_size_w * pad_lr_barance)
    pad_size_r = pad_size_w - pad_size_l
    pad_size_t = int(pad_size_h * pad_tb_barance)
    pad_size_b = pad_size_h - pad_size_t

    pad_width = [(pad_size_t, pad_size_b), (pad_size_l, pad_size_r), (0, 0)]
    if padding_mode == "constant":
        fill_value = 127
        pad_image = np.pad(orig_image, pad_width=pad_width, mode=padding_mode, constant_values=fill_value)
    else:
        pad_image = np.pad(orig_image, pad_width=pad_width, mode=padding_mode)

    mask_pad_width = [(pad_size_t, pad_size_b), (pad_size_l, pad_size_r)]
    pad_mask = np.zeros((height, width), dtype=np.uint8)
    pad_mask = np.pad(pad_mask, pad_width=mask_pad_width, mode="constant", constant_values=255)
    sam_dict["pad_mask"] = dict(segmentation=pad_mask.astype(bool))

    return pad_image, "Padding done"


@clear_cache_decorator
def run_sam(input_image, sam_model_id, sam_image, anime_style_chk=False):
    global sam_dict
    if not inpalib.sam_file_exists(sam_model_id):
        ret_sam_image = None if sam_image is None else gr.update()
        return ret_sam_image, f"{sam_model_id} not found, please download"

    if input_image is None:
        ret_sam_image = None if sam_image is None else gr.update()
        return ret_sam_image, "Input image not found"

    set_ia_config(IAConfig.KEYS.SAM_MODEL_ID, sam_model_id, IAConfig.SECTIONS.USER)

    if sam_dict["sam_masks"] is not None:
        sam_dict["sam_masks"] = None
        gc.collect()

    ia_logging.info(f"input_image: {input_image.shape} {input_image.dtype}")

    try:
        sam_masks = inpalib.generate_sam_masks(input_image, sam_model_id, anime_style_chk)
        sam_masks = inpalib.sort_masks_by_area(sam_masks)
        sam_masks = inpalib.insert_mask_to_sam_masks(sam_masks, sam_dict["pad_mask"])

        seg_image = inpalib.create_seg_color_image(input_image, sam_masks)

        sam_dict["sam_masks"] = sam_masks

    except Exception as e:
        print(traceback.format_exc())
        ia_logging.error(str(e))
        ret_sam_image = None if sam_image is None else gr.update()
        return ret_sam_image, "Segment Anything failed"

    if IAConfig.global_args.get("save_seg", False):
        save_name = "_".join([ia_file_manager.savename_prefix, os.path.splitext(sam_model_id)[0]]) + ".png"
        save_name = os.path.join(ia_file_manager.outputs_dir, save_name)
        Image.fromarray(seg_image).save(save_name)

    if sam_image is None:
        return seg_image, "Segment Anything complete"
    else:
        if sam_image["image"].shape == seg_image.shape and np.all(sam_image["image"] == seg_image):
            return gr.update(), "Segment Anything complete"
        else:
            return gr.update(value=seg_image), "Segment Anything complete"


@clear_cache_decorator
def select_mask(input_image, sam_image, invert_chk, ignore_black_chk, sel_mask):
    global sam_dict
    if sam_dict["sam_masks"] is None or sam_image is None:
        ret_sel_mask = None if sel_mask is None else gr.update()
        return ret_sel_mask
    sam_masks = sam_dict["sam_masks"]

    # image = sam_image["image"]
    mask = sam_image["mask"][:, :, 0:1]

    try:
        seg_image = inpalib.create_mask_image(mask, sam_masks, ignore_black_chk)
        if invert_chk:
            seg_image = inpalib.invert_mask(seg_image)

        sam_dict["mask_image"] = seg_image

    except Exception as e:
        print(traceback.format_exc())
        ia_logging.error(str(e))
        ret_sel_mask = None if sel_mask is None else gr.update()
        return ret_sel_mask

    if input_image is not None and input_image.shape == seg_image.shape:
        ret_image = cv2.addWeighted(input_image, 0.5, seg_image, 0.5, 0)
    else:
        ret_image = seg_image

    if sel_mask is None:
        return ret_image
    else:
        if sel_mask["image"].shape == ret_image.shape and np.all(sel_mask["image"] == ret_image):
            return gr.update()
        else:
            return gr.update(value=ret_image)


@clear_cache_decorator
def expand_mask(input_image, sel_mask, expand_iteration=1):
    global sam_dict
    if sam_dict["mask_image"] is None or sel_mask is None:
        return None

    new_sel_mask = sam_dict["mask_image"]

    expand_iteration = int(np.clip(expand_iteration, 1, 100))

    new_sel_mask = cv2.dilate(new_sel_mask, np.ones((3, 3), dtype=np.uint8), iterations=expand_iteration)

    sam_dict["mask_image"] = new_sel_mask

    if input_image is not None and input_image.shape == new_sel_mask.shape:
        ret_image = cv2.addWeighted(input_image, 0.5, new_sel_mask, 0.5, 0)
    else:
        ret_image = new_sel_mask

    if sel_mask["image"].shape == ret_image.shape and np.all(sel_mask["image"] == ret_image):
        return gr.update()
    else:
        return gr.update(value=ret_image)


@clear_cache_decorator
def apply_mask(input_image, sel_mask):
    global sam_dict
    if sam_dict["mask_image"] is None or sel_mask is None:
        return None

    sel_mask_image = sam_dict["mask_image"]
    sel_mask_mask = np.logical_not(sel_mask["mask"][:, :, 0:3].astype(bool)).astype(np.uint8)
    new_sel_mask = sel_mask_image * sel_mask_mask

    sam_dict["mask_image"] = new_sel_mask

    if input_image is not None and input_image.shape == new_sel_mask.shape:
        ret_image = cv2.addWeighted(input_image, 0.5, new_sel_mask, 0.5, 0)
    else:
        ret_image = new_sel_mask

    if sel_mask["image"].shape == ret_image.shape and np.all(sel_mask["image"] == ret_image):
        return gr.update()
    else:
        return gr.update(value=ret_image)


@clear_cache_decorator
def add_mask(input_image, sel_mask):
    global sam_dict
    if sam_dict["mask_image"] is None or sel_mask is None:
        return None

    sel_mask_image = sam_dict["mask_image"]
    sel_mask_mask = sel_mask["mask"][:, :, 0:3].astype(bool).astype(np.uint8)
    new_sel_mask = sel_mask_image + (sel_mask_mask * np.invert(sel_mask_image, dtype=np.uint8))

    sam_dict["mask_image"] = new_sel_mask

    if input_image is not None and input_image.shape == new_sel_mask.shape:
        ret_image = cv2.addWeighted(input_image, 0.5, new_sel_mask, 0.5, 0)
    else:
        ret_image = new_sel_mask

    if sel_mask["image"].shape == ret_image.shape and np.all(sel_mask["image"] == ret_image):
        return gr.update()
    else:
        return gr.update(value=ret_image)


def auto_resize_to_pil(input_image, mask_image):
    init_image = Image.fromarray(input_image).convert("RGB")
    mask_image = Image.fromarray(mask_image).convert("RGB")
    assert init_image.size == mask_image.size, "The sizes of the image and mask do not match"
    width, height = init_image.size

    new_height = (height // 8) * 8
    new_width = (width // 8) * 8
    if new_width < width or new_height < height:
        if (new_width / width) < (new_height / height):
            scale = new_height / height
        else:
            scale = new_width / width
        resize_height = int(height*scale+0.5)
        resize_width = int(width*scale+0.5)
        if height != resize_height or width != resize_width:
            ia_logging.info(f"resize: ({height}, {width}) -> ({resize_height}, {resize_width})")
            init_image = transforms.functional.resize(init_image, (resize_height, resize_width), transforms.InterpolationMode.LANCZOS)
            mask_image = transforms.functional.resize(mask_image, (resize_height, resize_width), transforms.InterpolationMode.LANCZOS)
        if resize_height != new_height or resize_width != new_width:
            ia_logging.info(f"center_crop: ({resize_height}, {resize_width}) -> ({new_height}, {new_width})")
            init_image = transforms.functional.center_crop(init_image, (new_height, new_width))
            mask_image = transforms.functional.center_crop(mask_image, (new_height, new_width))

    return init_image, mask_image


@clear_cache_decorator
def run_inpaint(input_image, sel_mask, prompt, n_prompt, ddim_steps, cfg_scale, seed, inp_model_id, save_mask_chk, composite_chk,
                sampler_name="DDIM", iteration_count=1):
    global sam_dict
    if input_image is None or sam_dict["mask_image"] is None or sel_mask is None:
        ia_logging.error("The image or mask does not exist")
        return

    mask_image = sam_dict["mask_image"]
    if input_image.shape != mask_image.shape:
        ia_logging.error("The sizes of the image and mask do not match")
        return

    set_ia_config(IAConfig.KEYS.INP_MODEL_ID, inp_model_id, IAConfig.SECTIONS.USER)

    save_mask_image(mask_image, save_mask_chk)

    ia_logging.info(f"Loading model {inp_model_id}")
    config_offline_inpainting = IAConfig.global_args.get("offline", False)
    if config_offline_inpainting:
        ia_logging.info("Run Inpainting on offline network: {}".format(str(config_offline_inpainting)))
    local_files_only = False
    local_file_status = download_model_from_hf(inp_model_id, local_files_only=True)
    if local_file_status != IAFileManager.DOWNLOAD_COMPLETE:
        if config_offline_inpainting:
            ia_logging.warning(local_file_status)
            return
    else:
        local_files_only = True
    ia_logging.info("local_files_only: {}".format(str(local_files_only)))

    if platform.system() == "Darwin" or devices.device == devices.cpu or ia_check_versions.torch_on_amd_rocm:
        torch_dtype = torch.float32
    else:
        torch_dtype = torch.float16

    try:
        pipe = StableDiffusionInpaintPipeline.from_pretrained(
            inp_model_id, torch_dtype=torch_dtype, local_files_only=local_files_only, use_safetensors=True)
    except Exception as e:
        ia_logging.error(str(e))
        if not config_offline_inpainting:
            try:
                pipe = StableDiffusionInpaintPipeline.from_pretrained(
                    inp_model_id, torch_dtype=torch_dtype, use_safetensors=True)
            except Exception as e:
                ia_logging.error(str(e))
                try:
                    pipe = StableDiffusionInpaintPipeline.from_pretrained(
                        inp_model_id, torch_dtype=torch_dtype, force_download=True, use_safetensors=True)
                except Exception as e:
                    ia_logging.error(str(e))
                    return
        else:
            return
    pipe.safety_checker = None

    ia_logging.info(f"Using sampler {sampler_name}")
    if sampler_name == "DDIM":
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
    elif sampler_name == "Euler":
        pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
    elif sampler_name == "Euler a":
        pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
    elif sampler_name == "DPM2 Karras":
        pipe.scheduler = KDPM2DiscreteScheduler.from_config(pipe.scheduler.config)
    elif sampler_name == "DPM2 a Karras":
        pipe.scheduler = KDPM2AncestralDiscreteScheduler.from_config(pipe.scheduler.config)
    else:
        ia_logging.info("Sampler fallback to DDIM")
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)

    if platform.system() == "Darwin":
        pipe = pipe.to("mps" if ia_check_versions.torch_mps_is_available else "cpu")
        pipe.enable_attention_slicing()
        torch_generator = torch.Generator(devices.cpu)
    else:
        if ia_check_versions.diffusers_enable_cpu_offload and devices.device != devices.cpu:
            ia_logging.info("Enable model cpu offload")
            pipe.enable_model_cpu_offload()
        else:
            pipe = pipe.to(devices.device)
            if xformers_available:
                ia_logging.info("Enable xformers memory efficient attention")
                pipe.enable_xformers_memory_efficient_attention()
            else:
                ia_logging.info("Enable attention slicing")
                pipe.enable_attention_slicing()
        if "privateuseone" in str(getattr(devices.device, "type", "")):
            torch_generator = torch.Generator(devices.cpu)
        else:
            torch_generator = torch.Generator(devices.device)

    init_image, mask_image = auto_resize_to_pil(input_image, mask_image)
    width, height = init_image.size

    output_list = []
    iteration_count = iteration_count if iteration_count is not None else 1
    for count in range(int(iteration_count)):
        gc.collect()
        if seed < 0 or count > 0:
            seed = random.randint(0, 2147483647)

        generator = torch_generator.manual_seed(seed)

        pipe_args_dict = {
            "prompt": prompt,
            "image": init_image,
            "width": width,
            "height": height,
            "mask_image": mask_image,
            "num_inference_steps": ddim_steps,
            "guidance_scale": cfg_scale,
            "negative_prompt": n_prompt,
            "generator": generator,
        }

        output_image = pipe(**pipe_args_dict).images[0]

        if composite_chk:
            dilate_mask_image = Image.fromarray(cv2.dilate(np.array(mask_image), np.ones((3, 3), dtype=np.uint8), iterations=4))
            output_image = Image.composite(output_image, init_image, dilate_mask_image.convert("L").filter(ImageFilter.GaussianBlur(3)))

        generation_params = {
            "Steps": ddim_steps,
            "Sampler": sampler_name,
            "CFG scale": cfg_scale,
            "Seed": seed,
            "Size": f"{width}x{height}",
            "Model": inp_model_id,
        }

        generation_params_text = ", ".join([k if k == v else f"{k}: {v}" for k, v in generation_params.items() if v is not None])
        prompt_text = prompt if prompt else ""
        negative_prompt_text = "\nNegative prompt: " + n_prompt if n_prompt else ""
        infotext = f"{prompt_text}{negative_prompt_text}\n{generation_params_text}".strip()

        metadata = PngInfo()
        metadata.add_text("parameters", infotext)

        save_name = "_".join([ia_file_manager.savename_prefix, os.path.basename(inp_model_id), str(seed)]) + ".png"
        save_name = os.path.join(ia_file_manager.outputs_dir, save_name)
        output_image.save(save_name, pnginfo=metadata)

        output_list.append(output_image)

        yield output_list, max([1, iteration_count - (count + 1)])


@clear_cache_decorator
def run_cleaner(input_image, sel_mask, cleaner_model_id, cleaner_save_mask_chk):
    global sam_dict
    if input_image is None or sam_dict["mask_image"] is None or sel_mask is None:
        ia_logging.error("The image or mask does not exist")
        return None

    mask_image = sam_dict["mask_image"]
    if input_image.shape != mask_image.shape:
        ia_logging.error("The sizes of the image and mask do not match")
        return None

    save_mask_image(mask_image, cleaner_save_mask_chk)

    ia_logging.info(f"Loading model {cleaner_model_id}")
    if platform.system() == "Darwin":
        model = ModelManager(name=cleaner_model_id, device=devices.cpu)
    else:
        model = ModelManager(name=cleaner_model_id, device=devices.device)

    init_image, mask_image = auto_resize_to_pil(input_image, mask_image)
    width, height = init_image.size

    init_image = np.array(init_image)
    mask_image = np.array(mask_image.convert("L"))

    config = Config(
        ldm_steps=20,
        ldm_sampler=LDMSampler.ddim,
        hd_strategy=HDStrategy.ORIGINAL,
        hd_strategy_crop_margin=32,
        hd_strategy_crop_trigger_size=512,
        hd_strategy_resize_limit=512,
        prompt="",
        sd_steps=20,
        sd_sampler=SDSampler.ddim
    )

    output_image = model(image=init_image, mask=mask_image, config=config)
    output_image = cv2.cvtColor(output_image.astype(np.uint8), cv2.COLOR_BGR2RGB)
    output_image = Image.fromarray(output_image)

    save_name = "_".join([ia_file_manager.savename_prefix, os.path.basename(cleaner_model_id)]) + ".png"
    save_name = os.path.join(ia_file_manager.outputs_dir, save_name)
    output_image.save(save_name)

    del model
    return [output_image]


@clear_cache_decorator
def run_get_alpha_image(input_image, sel_mask):
    global sam_dict
    if input_image is None or sam_dict["mask_image"] is None or sel_mask is None:
        ia_logging.error("The image or mask does not exist")
        return None, ""

    mask_image = sam_dict["mask_image"]
    if input_image.shape != mask_image.shape:
        ia_logging.error("The sizes of the image and mask do not match")
        return None, ""

    alpha_image = Image.fromarray(input_image).convert("RGBA")
    mask_image = Image.fromarray(mask_image).convert("L")

    alpha_image.putalpha(mask_image)

    save_name = "_".join([ia_file_manager.savename_prefix, "rgba_image"]) + ".png"
    save_name = os.path.join(ia_file_manager.outputs_dir, save_name)
    alpha_image.save(save_name)

    return alpha_image, f"saved: {save_name}"


@clear_cache_decorator
def run_get_mask(sel_mask):
    global sam_dict
    if sam_dict["mask_image"] is None or sel_mask is None:
        return None

    mask_image = sam_dict["mask_image"]

    save_name = "_".join([ia_file_manager.savename_prefix, "created_mask"]) + ".png"
    save_name = os.path.join(ia_file_manager.outputs_dir, save_name)
    Image.fromarray(mask_image).save(save_name)

    return mask_image


def on_ui_tabs():
    setup_ia_config_ini()
    sampler_names = get_sampler_names()
    sam_model_ids = get_sam_model_ids()
    sam_model_index = get_ia_config_index(IAConfig.KEYS.SAM_MODEL_ID, IAConfig.SECTIONS.USER)
    inp_model_ids = get_inp_model_ids()
    inp_model_index = get_ia_config_index(IAConfig.KEYS.INP_MODEL_ID, IAConfig.SECTIONS.USER)
    cleaner_model_ids = get_cleaner_model_ids()
    padding_mode_names = get_padding_mode_names()

    out_gallery_kwargs = dict(columns=2, height=520, object_fit="contain", preview=True)

    block = gr.Blocks(analytics_enabled=False).queue()
    block.title = "Inpaint Anything"
    with block as inpaint_anything_interface:
        with gr.Row():
            gr.Markdown("## Inpainting with Segment Anything")
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    with gr.Column():
                        sam_model_id = gr.Dropdown(label="Segment Anything Model ID", elem_id="sam_model_id", choices=sam_model_ids,
                                                   value=sam_model_ids[sam_model_index], show_label=True)
                    with gr.Column():
                        with gr.Row():
                            load_model_btn = gr.Button("Download model", elem_id="load_model_btn")
                        with gr.Row():
                            status_text = gr.Textbox(label="", elem_id="status_text", max_lines=1, show_label=False, interactive=False)
                with gr.Row():
                    input_image = gr.Image(label="Input image", elem_id="ia_input_image", source="upload", type="numpy", interactive=True)

                with gr.Row():
                    with gr.Accordion("Padding options", elem_id="padding_options", open=False):
                        with gr.Row():
                            with gr.Column():
                                pad_scale_width = gr.Slider(label="Scale Width", elem_id="pad_scale_width", minimum=1.0, maximum=1.5, value=1.0, step=0.01)
                            with gr.Column():
                                pad_lr_barance = gr.Slider(label="Left/Right Balance", elem_id="pad_lr_barance", minimum=0.0, maximum=1.0, value=0.5, step=0.01)
                        with gr.Row():
                            with gr.Column():
                                pad_scale_height = gr.Slider(label="Scale Height", elem_id="pad_scale_height", minimum=1.0, maximum=1.5, value=1.0, step=0.01)
                            with gr.Column():
                                pad_tb_barance = gr.Slider(label="Top/Bottom Balance", elem_id="pad_tb_barance", minimum=0.0, maximum=1.0, value=0.5, step=0.01)
                        with gr.Row():
                            with gr.Column():
                                padding_mode = gr.Dropdown(label="Padding Mode", elem_id="padding_mode", choices=padding_mode_names, value="edge")
                            with gr.Column():
                                padding_btn = gr.Button("Run Padding", elem_id="padding_btn")

                with gr.Row():
                    with gr.Column():
                        anime_style_chk = gr.Checkbox(label="Anime Style (Up Detection, Down mask Quality)", elem_id="anime_style_chk",
                                                      show_label=True, interactive=True)
                    with gr.Column():
                        sam_btn = gr.Button("Run Segment Anything", elem_id="sam_btn", variant="primary", interactive=False)

                with gr.Tab("Inpainting", elem_id="inpainting_tab"):
                    prompt = gr.Textbox(label="Inpainting Prompt", elem_id="sd_prompt")
                    n_prompt = gr.Textbox(label="Negative Prompt", elem_id="sd_n_prompt")
                    with gr.Accordion("Advanced options", elem_id="inp_advanced_options", open=False):
                        composite_chk = gr.Checkbox(label="Mask area Only", elem_id="composite_chk", value=True, show_label=True, interactive=True)
                        with gr.Row():
                            with gr.Column():
                                sampler_name = gr.Dropdown(label="Sampler", elem_id="sampler_name", choices=sampler_names,
                                                           value=sampler_names[0], show_label=True)
                            with gr.Column():
                                ddim_steps = gr.Slider(label="Sampling Steps", elem_id="ddim_steps", minimum=1, maximum=100, value=20, step=1)
                        cfg_scale = gr.Slider(label="Guidance Scale", elem_id="cfg_scale", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
                        seed = gr.Slider(
                            label="Seed",
                            elem_id="sd_seed",
                            minimum=-1,
                            maximum=2147483647,
                            step=1,
                            value=-1,
                        )
                    with gr.Row():
                        with gr.Column():
                            inp_model_id = gr.Dropdown(label="Inpainting Model ID", elem_id="inp_model_id",
                                                       choices=inp_model_ids, value=inp_model_ids[inp_model_index], show_label=True)
                        with gr.Column():
                            with gr.Row():
                                inpaint_btn = gr.Button("Run Inpainting", elem_id="inpaint_btn", variant="primary")
                            with gr.Row():
                                save_mask_chk = gr.Checkbox(label="Save mask", elem_id="save_mask_chk",
                                                            value=False, show_label=False, interactive=False, visible=False)
                                iteration_count = gr.Slider(label="Iterations", elem_id="iteration_count", minimum=1, maximum=10, value=1, step=1)

                    with gr.Row():
                        if ia_check_versions.gradio_version_is_old:
                            out_image = gr.Gallery(label="Inpainted image", elem_id="ia_out_image", show_label=False
                                                   ).style(**out_gallery_kwargs)
                        else:
                            out_image = gr.Gallery(label="Inpainted image", elem_id="ia_out_image", show_label=False,
                                                   **out_gallery_kwargs)

                with gr.Tab("Cleaner", elem_id="cleaner_tab"):
                    with gr.Row():
                        with gr.Column():
                            cleaner_model_id = gr.Dropdown(label="Cleaner Model ID", elem_id="cleaner_model_id",
                                                           choices=cleaner_model_ids, value=cleaner_model_ids[0], show_label=True)
                        with gr.Column():
                            with gr.Row():
                                cleaner_btn = gr.Button("Run Cleaner", elem_id="cleaner_btn", variant="primary")
                            with gr.Row():
                                cleaner_save_mask_chk = gr.Checkbox(label="Save mask", elem_id="cleaner_save_mask_chk",
                                                                    value=False, show_label=False, interactive=False, visible=False)

                    with gr.Row():
                        if ia_check_versions.gradio_version_is_old:
                            cleaner_out_image = gr.Gallery(label="Cleaned image", elem_id="ia_cleaner_out_image", show_label=False
                                                           ).style(**out_gallery_kwargs)
                        else:
                            cleaner_out_image = gr.Gallery(label="Cleaned image", elem_id="ia_cleaner_out_image", show_label=False,
                                                           **out_gallery_kwargs)

                with gr.Tab("Mask only", elem_id="mask_only_tab"):
                    with gr.Row():
                        with gr.Column():
                            get_alpha_image_btn = gr.Button("Get mask as alpha of image", elem_id="get_alpha_image_btn")
                        with gr.Column():
                            get_mask_btn = gr.Button("Get mask", elem_id="get_mask_btn")

                    with gr.Row():
                        with gr.Column():
                            alpha_out_image = gr.Image(label="Alpha channel image", elem_id="alpha_out_image", type="pil", image_mode="RGBA", interactive=False)
                        with gr.Column():
                            mask_out_image = gr.Image(label="Mask image", elem_id="mask_out_image", type="numpy", interactive=False)

                    with gr.Row():
                        with gr.Column():
                            get_alpha_status_text = gr.Textbox(label="", elem_id="get_alpha_status_text", max_lines=1, show_label=False, interactive=False)
                        with gr.Column():
                            gr.Markdown("")

            with gr.Column():
                with gr.Row():
                    gr.Markdown("Mouse over image: Press `S` key for Fullscreen mode, `R` key to Reset zoom")
                with gr.Row():
                    if ia_check_versions.gradio_version_is_old:
                        sam_image = gr.Image(label="Segment Anything image", elem_id="ia_sam_image", type="numpy", tool="sketch", brush_radius=8,
                                             show_label=False, interactive=True).style(height=480)
                    else:
                        sam_image = gr.Image(label="Segment Anything image", elem_id="ia_sam_image", type="numpy", tool="sketch", brush_radius=8,
                                             show_label=False, interactive=True, height=480)

                with gr.Row():
                    with gr.Column():
                        select_btn = gr.Button("Create Mask", elem_id="select_btn", variant="primary")
                    with gr.Column():
                        with gr.Row():
                            invert_chk = gr.Checkbox(label="Invert mask", elem_id="invert_chk", show_label=True, interactive=True)
                            ignore_black_chk = gr.Checkbox(label="Ignore black area", elem_id="ignore_black_chk", value=True, show_label=True, interactive=True)

                with gr.Row():
                    if ia_check_versions.gradio_version_is_old:
                        sel_mask = gr.Image(label="Selected mask image", elem_id="ia_sel_mask", type="numpy", tool="sketch", brush_radius=12,
                                            show_label=False, interactive=True).style(height=480)
                    else:
                        sel_mask = gr.Image(label="Selected mask image", elem_id="ia_sel_mask", type="numpy", tool="sketch", brush_radius=12,
                                            show_label=False, interactive=True, height=480)

                with gr.Row():
                    with gr.Column():
                        expand_mask_btn = gr.Button("Expand mask region", elem_id="expand_mask_btn")
                        expand_mask_iteration_count = gr.Slider(label="Expand Mask Iterations",
                                                                elem_id="expand_mask_iteration_count", minimum=1, maximum=100, value=1, step=1)
                    with gr.Column():
                        apply_mask_btn = gr.Button("Trim mask by sketch", elem_id="apply_mask_btn")
                        add_mask_btn = gr.Button("Add mask by sketch", elem_id="add_mask_btn")

        load_model_btn.click(download_model, inputs=[sam_model_id], outputs=[status_text])
        input_image.upload(input_image_upload, inputs=[input_image, sam_image, sel_mask], outputs=[sam_image, sel_mask, sam_btn]).then(
            fn=None, inputs=None, outputs=None, _js="inpaintAnything_initSamSelMask")
        padding_btn.click(run_padding, inputs=[input_image, pad_scale_width, pad_scale_height, pad_lr_barance, pad_tb_barance, padding_mode],
                          outputs=[input_image, status_text])
        sam_btn.click(run_sam, inputs=[input_image, sam_model_id, sam_image, anime_style_chk], outputs=[sam_image, status_text]).then(
            fn=None, inputs=None, outputs=None, _js="inpaintAnything_clearSamMask")
        select_btn.click(select_mask, inputs=[input_image, sam_image, invert_chk, ignore_black_chk, sel_mask], outputs=[sel_mask]).then(
            fn=None, inputs=None, outputs=None, _js="inpaintAnything_clearSelMask")
        expand_mask_btn.click(expand_mask, inputs=[input_image, sel_mask, expand_mask_iteration_count], outputs=[sel_mask]).then(
            fn=None, inputs=None, outputs=None, _js="inpaintAnything_clearSelMask")
        apply_mask_btn.click(apply_mask, inputs=[input_image, sel_mask], outputs=[sel_mask]).then(
            fn=None, inputs=None, outputs=None, _js="inpaintAnything_clearSelMask")
        add_mask_btn.click(add_mask, inputs=[input_image, sel_mask], outputs=[sel_mask]).then(
            fn=None, inputs=None, outputs=None, _js="inpaintAnything_clearSelMask")

        inpaint_btn.click(
            run_inpaint,
            inputs=[input_image, sel_mask, prompt, n_prompt, ddim_steps, cfg_scale, seed, inp_model_id, save_mask_chk, composite_chk,
                    sampler_name, iteration_count],
            outputs=[out_image, iteration_count])
        cleaner_btn.click(
            run_cleaner,
            inputs=[input_image, sel_mask, cleaner_model_id, cleaner_save_mask_chk],
            outputs=[cleaner_out_image])
        get_alpha_image_btn.click(
            run_get_alpha_image,
            inputs=[input_image, sel_mask],
            outputs=[alpha_out_image, get_alpha_status_text])
        get_mask_btn.click(
            run_get_mask,
            inputs=[sel_mask],
            outputs=[mask_out_image])

    return [(inpaint_anything_interface, "Inpaint Anything", "inpaint_anything")]


block, _, _ = on_ui_tabs()[0]
block.launch(share=True)
images/inpaint_anything_explanation_image_1.png
ADDED
images/inpaint_anything_ui_image_1.png
ADDED
images/sample_input_image.png
ADDED
images/sample_mask_image.png
ADDED
images/sample_seg_color_image.png
ADDED
inpalib/__init__.py
ADDED
@@ -0,0 +1,18 @@
from .masklib import create_mask_image, invert_mask
from .samlib import (create_seg_color_image, generate_sam_masks, get_all_sam_ids,
                     get_available_sam_ids, get_seg_colormap, insert_mask_to_sam_masks,
                     sam_file_exists, sam_file_path, sort_masks_by_area)

__all__ = [
    "create_mask_image",
    "invert_mask",
    "create_seg_color_image",
    "generate_sam_masks",
    "get_all_sam_ids",
    "get_available_sam_ids",
    "get_seg_colormap",
    "insert_mask_to_sam_masks",
    "sam_file_exists",
    "sam_file_path",
    "sort_masks_by_area",
]
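The exports above are meant to be used as a pipeline: generate SAM masks for an image, sort them, then derive a segment-color image or a binary mask from a user sketch. The following is a minimal usage sketch, not part of this commit; it assumes a SAM checkpoint has already been downloaded (so get_available_sam_ids() is non-empty) and that an RGB image file named input.png exists.

# Illustrative sketch only; the input file name and downloaded checkpoint are assumptions.
import numpy as np
from PIL import Image

import inpalib

sam_id = inpalib.get_available_sam_ids()[0]                # first checkpoint found on disk
image = np.array(Image.open("input.png").convert("RGB"))   # assumed input image

masks = inpalib.generate_sam_masks(image, sam_id, anime_style_chk=False)
masks = inpalib.sort_masks_by_area(masks)
seg_image = inpalib.create_seg_color_image(image, masks)   # color-coded segment map
Image.fromarray(seg_image).save("segments.png")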
inpalib/masklib.py
ADDED
@@ -0,0 +1,106 @@
from typing import Any, Dict, List, Union

import numpy as np
from PIL import Image


def invert_mask(mask: np.ndarray) -> np.ndarray:
    """Invert mask.

    Args:
        mask (np.ndarray): mask

    Returns:
        np.ndarray: inverted mask
    """
    if mask is None or not isinstance(mask, np.ndarray):
        raise ValueError("Invalid mask")

    # return np.logical_not(mask.astype(bool)).astype(np.uint8) * 255
    return np.invert(mask.astype(np.uint8))


def check_inputs_create_mask_image(
        mask: Union[np.ndarray, Image.Image],
        sam_masks: List[Dict[str, Any]],
        ignore_black_chk: bool = True,
) -> None:
    """Check create mask image inputs.

    Args:
        mask (Union[np.ndarray, Image.Image]): mask
        sam_masks (List[Dict[str, Any]]): SAM masks
        ignore_black_chk (bool): ignore black check

    Returns:
        None
    """
    if mask is None or not isinstance(mask, (np.ndarray, Image.Image)):
        raise ValueError("Invalid mask")

    if sam_masks is None or not isinstance(sam_masks, list):
        raise ValueError("Invalid SAM masks")

    if ignore_black_chk is None or not isinstance(ignore_black_chk, bool):
        raise ValueError("Invalid ignore black check")


def convert_mask(mask: Union[np.ndarray, Image.Image]) -> np.ndarray:
    """Convert mask.

    Args:
        mask (Union[np.ndarray, Image.Image]): mask

    Returns:
        np.ndarray: converted mask
    """
    if isinstance(mask, Image.Image):
        mask = np.array(mask)

    if mask.ndim == 2:
        mask = mask[:, :, np.newaxis]

    if mask.shape[2] != 1:
        mask = mask[:, :, 0:1]

    return mask


def create_mask_image(
        mask: Union[np.ndarray, Image.Image],
        sam_masks: List[Dict[str, Any]],
        ignore_black_chk: bool = True,
) -> np.ndarray:
    """Create mask image.

    Args:
        mask (Union[np.ndarray, Image.Image]): mask
        sam_masks (List[Dict[str, Any]]): SAM masks
        ignore_black_chk (bool): ignore black check

    Returns:
        np.ndarray: mask image
    """
    check_inputs_create_mask_image(mask, sam_masks, ignore_black_chk)
    mask = convert_mask(mask)

    canvas_image = np.zeros(mask.shape, dtype=np.uint8)
    mask_region = np.zeros(mask.shape, dtype=np.uint8)
    for seg_dict in sam_masks:
        seg_mask = np.expand_dims(seg_dict["segmentation"].astype(np.uint8), axis=-1)
        canvas_mask = np.logical_not(canvas_image.astype(bool)).astype(np.uint8)
        if (seg_mask * canvas_mask * mask).astype(bool).any():
            mask_region = mask_region + (seg_mask * canvas_mask)
        seg_color = seg_mask * canvas_mask
        canvas_image = canvas_image + seg_color

    if not ignore_black_chk:
        canvas_mask = np.logical_not(canvas_image.astype(bool)).astype(np.uint8)
        if (canvas_mask * mask).astype(bool).any():
            mask_region = mask_region + (canvas_mask)

    mask_region = np.tile(mask_region * 255, (1, 1, 3))

    seg_image = mask_region.astype(np.uint8)

    return seg_image
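As a quick illustration of the two helpers above (a sketch, not part of this commit): invert_mask simply flips a uint8 mask with np.invert, and create_mask_image whitens every SAM segment that the sketch mask touches.

# Toy example with a 2x2 sketch and one fake SAM segment; shapes are kept tiny on purpose.
import numpy as np
from inpalib import create_mask_image, invert_mask

sketch = np.array([[0, 255], [255, 0]], dtype=np.uint8)
print(invert_mask(sketch))        # 0 and 255 are swapped element-wise

sam_masks = [{"segmentation": np.array([[True, True], [False, False]])}]
mask_image = create_mask_image(sketch, sam_masks, ignore_black_chk=True)
print(mask_image.shape)           # (2, 2, 3); the touched top-row segment becomes white (255)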
inpalib/samlib.py
ADDED
@@ -0,0 +1,256 @@
import copy
import os
import sys
from typing import Any, Dict, List, Union

import cv2
import numpy as np
import torch
from PIL import Image
from tqdm import tqdm

inpa_basedir = os.path.normpath(os.path.join(os.path.dirname(__file__), ".."))
if inpa_basedir not in sys.path:
    sys.path.append(inpa_basedir)

from ia_file_manager import ia_file_manager  # noqa: E402
from ia_get_dataset_colormap import create_pascal_label_colormap  # noqa: E402
from ia_logging import ia_logging  # noqa: E402
from ia_sam_manager import check_bfloat16_support, get_sam_mask_generator  # noqa: E402
from ia_ui_items import get_sam_model_ids  # noqa: E402


def get_all_sam_ids() -> List[str]:
    """Get all SAM IDs.

    Returns:
        List[str]: SAM IDs
    """
    return get_sam_model_ids()


def sam_file_path(sam_id: str) -> str:
    """Get SAM file path.

    Args:
        sam_id (str): SAM ID

    Returns:
        str: SAM file path
    """
    return os.path.join(ia_file_manager.models_dir, sam_id)


def sam_file_exists(sam_id: str) -> bool:
    """Check if SAM file exists.

    Args:
        sam_id (str): SAM ID

    Returns:
        bool: True if SAM file exists else False
    """
    sam_checkpoint = sam_file_path(sam_id)

    return os.path.isfile(sam_checkpoint)


def get_available_sam_ids() -> List[str]:
    """Get available SAM IDs.

    Returns:
        List[str]: available SAM IDs
    """
    all_sam_ids = get_all_sam_ids()
    for sam_id in all_sam_ids.copy():
        if not sam_file_exists(sam_id):
            all_sam_ids.remove(sam_id)

    return all_sam_ids


def check_inputs_generate_sam_masks(
        input_image: Union[np.ndarray, Image.Image],
        sam_id: str,
        anime_style_chk: bool = False,
) -> None:
    """Check generate SAM masks inputs.

    Args:
        input_image (Union[np.ndarray, Image.Image]): input image
        sam_id (str): SAM ID
        anime_style_chk (bool): anime style check

    Returns:
        None
    """
    if input_image is None or not isinstance(input_image, (np.ndarray, Image.Image)):
        raise ValueError("Invalid input image")

    if sam_id is None or not isinstance(sam_id, str):
        raise ValueError("Invalid SAM ID")

    if anime_style_chk is None or not isinstance(anime_style_chk, bool):
        raise ValueError("Invalid anime style check")


def convert_input_image(input_image: Union[np.ndarray, Image.Image]) -> np.ndarray:
    """Convert input image.

    Args:
        input_image (Union[np.ndarray, Image.Image]): input image

    Returns:
        np.ndarray: converted input image
    """
    if isinstance(input_image, Image.Image):
        input_image = np.array(input_image)

    if input_image.ndim == 2:
        input_image = input_image[:, :, np.newaxis]

    if input_image.shape[2] == 1:
        input_image = np.concatenate([input_image] * 3, axis=-1)

    return input_image


def generate_sam_masks(
        input_image: Union[np.ndarray, Image.Image],
        sam_id: str,
        anime_style_chk: bool = False,
) -> List[Dict[str, Any]]:
    """Generate SAM masks.

    Args:
        input_image (Union[np.ndarray, Image.Image]): input image
        sam_id (str): SAM ID
        anime_style_chk (bool): anime style check

    Returns:
        List[Dict[str, Any]]: SAM masks
    """
    check_inputs_generate_sam_masks(input_image, sam_id, anime_style_chk)
    input_image = convert_input_image(input_image)

    sam_checkpoint = sam_file_path(sam_id)
    sam_mask_generator = get_sam_mask_generator(sam_checkpoint, anime_style_chk)
    ia_logging.info(f"{sam_mask_generator.__class__.__name__} {sam_id}")

    if "sam2_" in sam_id:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.bfloat16 if check_bfloat16_support() else torch.float16
        with torch.inference_mode(), torch.autocast(device, dtype=torch_dtype):
            sam_masks = sam_mask_generator.generate(input_image)
    else:
        sam_masks = sam_mask_generator.generate(input_image)

    if anime_style_chk:
        for sam_mask in sam_masks:
            sam_mask_seg = sam_mask["segmentation"]
            sam_mask_seg = cv2.morphologyEx(sam_mask_seg.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((5, 5), np.uint8))
            sam_mask_seg = cv2.morphologyEx(sam_mask_seg.astype(np.uint8), cv2.MORPH_OPEN, np.ones((5, 5), np.uint8))
            sam_mask["segmentation"] = sam_mask_seg.astype(bool)

    ia_logging.info("sam_masks: {}".format(len(sam_masks)))

    sam_masks = copy.deepcopy(sam_masks)
    return sam_masks


def sort_masks_by_area(
        sam_masks: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Sort mask by area.

    Args:
        sam_masks (List[Dict[str, Any]]): SAM masks

    Returns:
        List[Dict[str, Any]]: sorted SAM masks
    """
    return sorted(sam_masks, key=lambda x: np.sum(x.get("segmentation").astype(np.uint32)))


def get_seg_colormap() -> np.ndarray:
    """Get segmentation colormap.

    Returns:
        np.ndarray: segmentation colormap
    """
    cm_pascal = create_pascal_label_colormap()
    seg_colormap = cm_pascal
    seg_colormap = np.array([c for c in seg_colormap if max(c) >= 64], dtype=np.uint8)

    return seg_colormap


def insert_mask_to_sam_masks(
        sam_masks: List[Dict[str, Any]],
        insert_mask: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """Insert mask to SAM masks.

    Args:
        sam_masks (List[Dict[str, Any]]): SAM masks
        insert_mask (Dict[str, Any]): insert mask

    Returns:
        List[Dict[str, Any]]: SAM masks
    """
    if insert_mask is not None and isinstance(insert_mask, dict) and "segmentation" in insert_mask:
        if (len(sam_masks) > 0 and
                sam_masks[0]["segmentation"].shape == insert_mask["segmentation"].shape and
                np.any(insert_mask["segmentation"])):
            sam_masks.insert(0, insert_mask)
            ia_logging.info("insert mask to sam_masks")

    return sam_masks


def create_seg_color_image(
        input_image: Union[np.ndarray, Image.Image],
        sam_masks: List[Dict[str, Any]],
) -> np.ndarray:
    """Create segmentation color image.

    Args:
        input_image (Union[np.ndarray, Image.Image]): input image
        sam_masks (List[Dict[str, Any]]): SAM masks

    Returns:
        np.ndarray: segmentation color image
    """
    input_image = convert_input_image(input_image)

    seg_colormap = get_seg_colormap()
    sam_masks = sam_masks[:len(seg_colormap)]

    with tqdm(total=len(sam_masks), desc="Processing segments") as progress_bar:
        canvas_image = np.zeros((*input_image.shape[:2], 1), dtype=np.uint8)
        for idx, seg_dict in enumerate(sam_masks[0:min(255, len(sam_masks))]):
            seg_mask = np.expand_dims(seg_dict["segmentation"].astype(np.uint8), axis=-1)
            canvas_mask = np.logical_not(canvas_image.astype(bool)).astype(np.uint8)
            seg_color = np.array([idx+1], dtype=np.uint8) * seg_mask * canvas_mask
            canvas_image = canvas_image + seg_color
            progress_bar.update(1)
        seg_colormap = np.insert(seg_colormap, 0, [0, 0, 0], axis=0)
        temp_canvas_image = np.apply_along_axis(lambda x: seg_colormap[x[0]], axis=-1, arr=canvas_image)
        if len(sam_masks) > 255:
            canvas_image = canvas_image.astype(bool).astype(np.uint8)
            for idx, seg_dict in enumerate(sam_masks[255:min(509, len(sam_masks))]):
                seg_mask = np.expand_dims(seg_dict["segmentation"].astype(np.uint8), axis=-1)
                canvas_mask = np.logical_not(canvas_image.astype(bool)).astype(np.uint8)
                seg_color = np.array([idx+2], dtype=np.uint8) * seg_mask * canvas_mask
                canvas_image = canvas_image + seg_color
                progress_bar.update(1)
            seg_colormap = seg_colormap[256:]
            seg_colormap = np.insert(seg_colormap, 0, [0, 0, 0], axis=0)
            seg_colormap = np.insert(seg_colormap, 0, [0, 0, 0], axis=0)
            canvas_image = np.apply_along_axis(lambda x: seg_colormap[x[0]], axis=-1, arr=canvas_image)
            canvas_image = temp_canvas_image + canvas_image
        else:
            canvas_image = temp_canvas_image
    ret_seg_image = canvas_image.astype(np.uint8)

    return ret_seg_image
javascript/inpaint-anything.js
ADDED
@@ -0,0 +1,458 @@
const inpaintAnything_waitForElement = async (parent, selector, exist) => {
  return new Promise((resolve) => {
    const observer = new MutationObserver(() => {
      if (!!parent.querySelector(selector) != exist) {
        return;
      }
      observer.disconnect();
      resolve(undefined);
    });

    observer.observe(parent, {
      childList: true,
      subtree: true,
    });

    if (!!parent.querySelector(selector) == exist) {
      resolve(undefined);
    }
  });
};

const inpaintAnything_waitForStyle = async (parent, selector, style) => {
  return new Promise((resolve) => {
    const observer = new MutationObserver(() => {
      if (!parent.querySelector(selector) || !parent.querySelector(selector).style[style]) {
        return;
      }
      observer.disconnect();
      resolve(undefined);
    });

    observer.observe(parent, {
      childList: true,
      subtree: true,
      attributes: true,
      attributeFilter: ["style"],
    });

    if (!!parent.querySelector(selector) && !!parent.querySelector(selector).style[style]) {
      resolve(undefined);
    }
  });
};

const inpaintAnything_timeout = (ms) => {
  return new Promise(function (resolve, reject) {
    setTimeout(() => reject("Timeout"), ms);
  });
};

async function inpaintAnything_clearSamMask() {
  const waitForElementToBeInDocument = (parent, selector) =>
    Promise.race([inpaintAnything_waitForElement(parent, selector, true), inpaintAnything_timeout(1000)]);

  const elemId = "#ia_sam_image";

  const targetElement = document.querySelector(elemId);
  if (!targetElement) {
    return;
  }
  await waitForElementToBeInDocument(targetElement, "button[aria-label='Clear']");

  targetElement.style.transform = null;
  targetElement.style.zIndex = null;
  targetElement.style.overflow = "auto";

  const samMaskClear = targetElement.querySelector("button[aria-label='Clear']");
  if (!samMaskClear) {
    return;
  }
  const removeImageButton = targetElement.querySelector("button[aria-label='Remove Image']");
  if (!removeImageButton) {
    return;
  }
  samMaskClear?.click();

  if (typeof inpaintAnything_clearSamMask.clickRemoveImage === "undefined") {
    inpaintAnything_clearSamMask.clickRemoveImage = () => {
      targetElement.style.transform = null;
      targetElement.style.zIndex = null;
    };
  } else {
    removeImageButton.removeEventListener("click", inpaintAnything_clearSamMask.clickRemoveImage);
  }
  removeImageButton.addEventListener("click", inpaintAnything_clearSamMask.clickRemoveImage);
}

async function inpaintAnything_clearSelMask() {
  const waitForElementToBeInDocument = (parent, selector) =>
    Promise.race([inpaintAnything_waitForElement(parent, selector, true), inpaintAnything_timeout(1000)]);

  const elemId = "#ia_sel_mask";

  const targetElement = document.querySelector(elemId);
  if (!targetElement) {
    return;
  }
  await waitForElementToBeInDocument(targetElement, "button[aria-label='Clear']");

  targetElement.style.transform = null;
  targetElement.style.zIndex = null;
  targetElement.style.overflow = "auto";

  const selMaskClear = targetElement.querySelector("button[aria-label='Clear']");
  if (!selMaskClear) {
    return;
  }
  const removeImageButton = targetElement.querySelector("button[aria-label='Remove Image']");
  if (!removeImageButton) {
    return;
  }
  selMaskClear?.click();

  if (typeof inpaintAnything_clearSelMask.clickRemoveImage === "undefined") {
    inpaintAnything_clearSelMask.clickRemoveImage = () => {
      targetElement.style.transform = null;
      targetElement.style.zIndex = null;
    };
  } else {
    removeImageButton.removeEventListener("click", inpaintAnything_clearSelMask.clickRemoveImage);
  }
  removeImageButton.addEventListener("click", inpaintAnything_clearSelMask.clickRemoveImage);
}

async function inpaintAnything_initSamSelMask() {
  inpaintAnything_clearSamMask();
  inpaintAnything_clearSelMask();
}

var uiLoadedCallbacks = [];

function gradioApp() {
  const elems = document.getElementsByTagName("gradio-app");
  const elem = elems.length == 0 ? document : elems[0];

  if (elem !== document) {
    elem.getElementById = function (id) {
      return document.getElementById(id);
    };
  }
  return elem.shadowRoot ? elem.shadowRoot : elem;
}

function onUiLoaded(callback) {
  uiLoadedCallbacks.push(callback);
}

function executeCallbacks(queue) {
  for (const callback of queue) {
    try {
      callback();
    } catch (e) {
      console.error("error running callback", callback, ":", e);
    }
  }
}

onUiLoaded(async () => {
  const elementIDs = {
    ia_sam_image: "#ia_sam_image",
    ia_sel_mask: "#ia_sel_mask",
    ia_out_image: "#ia_out_image",
    ia_cleaner_out_image: "#ia_cleaner_out_image",
  };

  function setStyleHeight(elemId, height) {
    const elem = gradioApp().querySelector(elemId);
    if (elem) {
      if (!elem.style.height) {
        elem.style.height = height;
        const observer = new MutationObserver(() => {
          const divPreview = elem.querySelector(".preview");
          if (divPreview) {
            divPreview.classList.remove("fixed-height");
          }
        });
        observer.observe(elem, {
          childList: true,
          attributes: true,
          attributeFilter: ["class"],
        });
      }
    }
  }

  setStyleHeight(elementIDs.ia_out_image, "520px");
  setStyleHeight(elementIDs.ia_cleaner_out_image, "520px");

  // Default config
  const defaultHotkeysConfig = {
    canvas_hotkey_reset: "KeyR",
    canvas_hotkey_fullscreen: "KeyS",
  };

  const elemData = {};
  let activeElement;

  function applyZoomAndPan(elemId) {
    const targetElement = gradioApp().querySelector(elemId);

    if (!targetElement) {
      console.log("Element not found");
      return;
    }

    targetElement.style.transformOrigin = "0 0";

    elemData[elemId] = {
      zoomLevel: 1,
      panX: 0,
      panY: 0,
    };
    let fullScreenMode = false;

    // Toggle the zIndex of the target element between two values, allowing it to overlap or be overlapped by other elements
    function toggleOverlap(forced = "") {
      // const zIndex1 = "0";
      const zIndex1 = null;
      const zIndex2 = "998";

      targetElement.style.zIndex = targetElement.style.zIndex !== zIndex2 ? zIndex2 : zIndex1;

      if (forced === "off") {
        targetElement.style.zIndex = zIndex1;
      } else if (forced === "on") {
        targetElement.style.zIndex = zIndex2;
      }
    }

    /**
     * This function fits the target element to the screen by calculating
     * the required scale and offsets. It also updates the global variables
     * zoomLevel, panX, and panY to reflect the new state.
     */

    function fitToElement() {
      //Reset Zoom
      targetElement.style.transform = `translate(${0}px, ${0}px) scale(${1})`;

      // Get element and screen dimensions
      const elementWidth = targetElement.offsetWidth;
      const elementHeight = targetElement.offsetHeight;
      const parentElement = targetElement.parentElement;
      const screenWidth = parentElement.clientWidth;
      const screenHeight = parentElement.clientHeight;

      // Get element's coordinates relative to the parent element
      const elementRect = targetElement.getBoundingClientRect();
      const parentRect = parentElement.getBoundingClientRect();
      const elementX = elementRect.x - parentRect.x;

      // Calculate scale and offsets
      const scaleX = screenWidth / elementWidth;
      const scaleY = screenHeight / elementHeight;
      const scale = Math.min(scaleX, scaleY);

      const transformOrigin = window.getComputedStyle(targetElement).transformOrigin;
      const [originX, originY] = transformOrigin.split(" ");
      const originXValue = parseFloat(originX);
      const originYValue = parseFloat(originY);

      const offsetX = (screenWidth - elementWidth * scale) / 2 - originXValue * (1 - scale);
      const offsetY = (screenHeight - elementHeight * scale) / 2.5 - originYValue * (1 - scale);

      // Apply scale and offsets to the element
      targetElement.style.transform = `translate(${offsetX}px, ${offsetY}px) scale(${scale})`;

      // Update global variables
      elemData[elemId].zoomLevel = scale;
      elemData[elemId].panX = offsetX;
      elemData[elemId].panY = offsetY;

      fullScreenMode = false;
      toggleOverlap("off");
    }

    // Reset the zoom level and pan position of the target element to their initial values
    function resetZoom() {
      elemData[elemId] = {
        zoomLevel: 1,
        panX: 0,
        panY: 0,
      };

      // fixCanvas();
      targetElement.style.transform = `scale(${elemData[elemId].zoomLevel}) translate(${elemData[elemId].panX}px, ${elemData[elemId].panY}px)`;

      // const canvas = gradioApp().querySelector(`${elemId} canvas[key="interface"]`);

      toggleOverlap("off");
      fullScreenMode = false;

      // if (
      //   canvas &&
      //   parseFloat(canvas.style.width) > 865 &&
      //   parseFloat(targetElement.style.width) > 865
      // ) {
      //   fitToElement();
      //   return;
      // }

      // targetElement.style.width = "";
      // if (canvas) {
      //   targetElement.style.height = canvas.style.height;
      // }
      targetElement.style.width = null;
      targetElement.style.height = 480;
    }

    /**
     * This function fits the target element to the screen by calculating
     * the required scale and offsets. It also updates the global variables
     * zoomLevel, panX, and panY to reflect the new state.
     */

    // Fullscreen mode
    function fitToScreen() {
      const canvas = gradioApp().querySelector(`${elemId} canvas[key="interface"]`);
      const img = gradioApp().querySelector(`${elemId} img`);

      if (!canvas && !img) return;

      // if (canvas.offsetWidth > 862) {
      //   targetElement.style.width = canvas.offsetWidth + "px";
      // }

      if (fullScreenMode) {
        resetZoom();
        fullScreenMode = false;
        return;
      }

      //Reset Zoom
      targetElement.style.transform = `translate(${0}px, ${0}px) scale(${1})`;

      // Get scrollbar width to right-align the image
      const scrollbarWidth = window.innerWidth - document.documentElement.clientWidth;

      // Get element and screen dimensions
      const elementWidth = targetElement.offsetWidth;
      const elementHeight = targetElement.offsetHeight;
      const screenWidth = window.innerWidth - scrollbarWidth;
      const screenHeight = window.innerHeight;

      // Get element's coordinates relative to the page
      const elementRect = targetElement.getBoundingClientRect();
      const elementY = elementRect.y;
      const elementX = elementRect.x;

      // Calculate scale and offsets
      const scaleX = screenWidth / elementWidth;
      const scaleY = screenHeight / elementHeight;
      const scale = Math.min(scaleX, scaleY);

      // Get the current transformOrigin
      const computedStyle = window.getComputedStyle(targetElement);
      const transformOrigin = computedStyle.transformOrigin;
      const [originX, originY] = transformOrigin.split(" ");
      const originXValue = parseFloat(originX);
      const originYValue = parseFloat(originY);

      // Calculate offsets with respect to the transformOrigin
      const offsetX = (screenWidth - elementWidth * scale) / 2 - elementX - originXValue * (1 - scale);
      const offsetY = (screenHeight - elementHeight * scale) / 2 - elementY - originYValue * (1 - scale);

      // Apply scale and offsets to the element
      targetElement.style.transform = `translate(${offsetX}px, ${offsetY}px) scale(${scale})`;

      // Update global variables
      elemData[elemId].zoomLevel = scale;
      elemData[elemId].panX = offsetX;
      elemData[elemId].panY = offsetY;

      fullScreenMode = true;
      toggleOverlap("on");
    }

    // Reset zoom when uploading a new image
    const fileInput = gradioApp().querySelector(`${elemId} input[type="file"][accept="image/*"].svelte-116rqfv`);
    if (fileInput) {
      fileInput.addEventListener("click", resetZoom);
    }

    // Handle keydown events
    function handleKeyDown(event) {
      // Disable key locks to make pasting from the buffer work correctly
      if (
        (event.ctrlKey && event.code === "KeyV") ||
        (event.ctrlKey && event.code === "KeyC") ||
        event.code === "F5"
      ) {
        return;
      }

      // before activating shortcut, ensure user is not actively typing in an input field
      if (event.target.nodeName === "TEXTAREA" || event.target.nodeName === "INPUT") {
        return;
      }

      const hotkeyActions = {
        [defaultHotkeysConfig.canvas_hotkey_reset]: resetZoom,
        [defaultHotkeysConfig.canvas_hotkey_fullscreen]: fitToScreen,
      };

      const action = hotkeyActions[event.code];
      if (action) {
        event.preventDefault();
        action(event);
      }
    }

    // Handle events only inside the targetElement
    let isKeyDownHandlerAttached = false;

    function handleMouseMove() {
      if (!isKeyDownHandlerAttached) {
        document.addEventListener("keydown", handleKeyDown);
        isKeyDownHandlerAttached = true;

        activeElement = elemId;
      }
    }

    function handleMouseLeave() {
      if (isKeyDownHandlerAttached) {
        document.removeEventListener("keydown", handleKeyDown);
        isKeyDownHandlerAttached = false;

        activeElement = null;
      }
    }

    // Add mouse event handlers
    targetElement.addEventListener("mousemove", handleMouseMove);
    targetElement.addEventListener("mouseleave", handleMouseLeave);
  }

  applyZoomAndPan(elementIDs.ia_sam_image);
  applyZoomAndPan(elementIDs.ia_sel_mask);
  // applyZoomAndPan(elementIDs.ia_out_image);
  // applyZoomAndPan(elementIDs.ia_cleaner_out_image);
});

var executedOnLoaded = false;

document.addEventListener("DOMContentLoaded", function () {
  var mutationObserver = new MutationObserver(function () {
    if (
      !executedOnLoaded &&
      gradioApp().querySelector("#ia_sam_image") &&
      gradioApp().querySelector("#ia_sel_mask")
    ) {
      executedOnLoaded = true;
      executeCallbacks(uiLoadedCallbacks);
    }
  });
  mutationObserver.observe(gradioApp(), { childList: true, subtree: true });
});
lama_cleaner/__init__.py
ADDED
@@ -0,0 +1,19 @@
import os

os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

import warnings  # noqa: E402

warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
warnings.filterwarnings("ignore", category=UserWarning, module="lama_cleaner")

from lama_cleaner.parse_args import parse_args  # noqa: E402


def entry_point():
    args = parse_args()
    # To make os.environ["XDG_CACHE_HOME"] = args.model_cache_dir works for diffusers
    # https://github.com/huggingface/diffusers/blob/be99201a567c1ccd841dc16fb24e88f7f239c187/src/diffusers/utils/constants.py#L18
    from lama_cleaner.server import main

    main(args)
lama_cleaner/benchmark.py
ADDED
@@ -0,0 +1,109 @@
#!/usr/bin/env python3

import argparse
import os
import time

import numpy as np
import nvidia_smi
import psutil
import torch

from lama_cleaner.model_manager import ModelManager
from lama_cleaner.schema import Config, HDStrategy, SDSampler

try:
    torch._C._jit_override_can_fuse_on_cpu(False)
    torch._C._jit_override_can_fuse_on_gpu(False)
    torch._C._jit_set_texpr_fuser_enabled(False)
    torch._C._jit_set_nvfuser_enabled(False)
except:
    pass

NUM_THREADS = str(4)

os.environ["OMP_NUM_THREADS"] = NUM_THREADS
os.environ["OPENBLAS_NUM_THREADS"] = NUM_THREADS
os.environ["MKL_NUM_THREADS"] = NUM_THREADS
os.environ["VECLIB_MAXIMUM_THREADS"] = NUM_THREADS
os.environ["NUMEXPR_NUM_THREADS"] = NUM_THREADS
if os.environ.get("CACHE_DIR"):
    os.environ["TORCH_HOME"] = os.environ["CACHE_DIR"]


def run_model(model, size):
    # RGB
    image = np.random.randint(0, 256, (size[0], size[1], 3)).astype(np.uint8)
    mask = np.random.randint(0, 255, size).astype(np.uint8)

    config = Config(
        ldm_steps=2,
        hd_strategy=HDStrategy.ORIGINAL,
        hd_strategy_crop_margin=128,
        hd_strategy_crop_trigger_size=128,
        hd_strategy_resize_limit=128,
        prompt="a fox is sitting on a bench",
        sd_steps=5,
        sd_sampler=SDSampler.ddim
    )
    model(image, mask, config)


def benchmark(model, times: int, empty_cache: bool):
    sizes = [(512, 512)]

    nvidia_smi.nvmlInit()
    device_id = 0
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(device_id)

    def format(metrics):
        return f"{np.mean(metrics):.2f} ± {np.std(metrics):.2f}"

    process = psutil.Process(os.getpid())
    # report GPU memory and RAM usage metrics for each size
    for size in sizes:
        torch.cuda.empty_cache()
        time_metrics = []
        cpu_metrics = []
        memory_metrics = []
        gpu_memory_metrics = []
        for _ in range(times):
            start = time.time()
            run_model(model, size)
            torch.cuda.synchronize()

            # cpu_metrics.append(process.cpu_percent())
            time_metrics.append((time.time() - start) * 1000)
            memory_metrics.append(process.memory_info().rss / 1024 / 1024)
            gpu_memory_metrics.append(nvidia_smi.nvmlDeviceGetMemoryInfo(handle).used / 1024 / 1024)

        print(f"size: {size}".center(80, "-"))
        # print(f"cpu: {format(cpu_metrics)}")
        print(f"latency: {format(time_metrics)}ms")
        print(f"memory: {format(memory_metrics)} MB")
        print(f"gpu memory: {format(gpu_memory_metrics)} MB")

    nvidia_smi.nvmlShutdown()


def get_args_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--name")
    parser.add_argument("--device", default="cuda", type=str)
    parser.add_argument("--times", default=10, type=int)
    parser.add_argument("--empty-cache", action="store_true")
    return parser.parse_args()


if __name__ == "__main__":
    args = get_args_parser()
    device = torch.device(args.device)
    model = ModelManager(
        name=args.name,
        device=device,
        sd_run_local=True,
        disable_nsfw=True,
        sd_cpu_textencoder=True,
        hf_access_token="123"
    )
    benchmark(model, args.times, args.empty_cache)
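Given the argument parser above, a typical invocation would be "python lama_cleaner/benchmark.py --name lama --device cuda --times 10" (this assumes the nvidia_smi and psutil packages and a CUDA device are available); the script then prints mean ± std latency, process memory, and GPU memory for each benchmarked size.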
lama_cleaner/const.py
ADDED
@@ -0,0 +1,173 @@
import json
import os
from enum import Enum
from pydantic import BaseModel


MPS_SUPPORT_MODELS = [
    "instruct_pix2pix",
    "sd1.5",
    "anything4",
    "realisticVision1.4",
    "sd2",
    "paint_by_example",
    "controlnet",
]

DEFAULT_MODEL = "lama"
AVAILABLE_MODELS = [
    "lama",
    "ldm",
    "zits",
    "mat",
    "fcf",
    "sd1.5",
    "anything4",
    "realisticVision1.4",
    "cv2",
    "manga",
    "sd2",
    "paint_by_example",
    "instruct_pix2pix",
]
SD15_MODELS = ["sd1.5", "anything4", "realisticVision1.4"]

AVAILABLE_DEVICES = ["cuda", "cpu", "mps"]
DEFAULT_DEVICE = "cuda"

NO_HALF_HELP = """
Using full precision model.
If your generate result is always black or green, use this argument. (sd/paint_by_exmaple)
"""

CPU_OFFLOAD_HELP = """
Offloads all models to CPU, significantly reducing vRAM usage. (sd/paint_by_example)
"""

DISABLE_NSFW_HELP = """
Disable NSFW checker. (sd/paint_by_example)
"""

SD_CPU_TEXTENCODER_HELP = """
Run Stable Diffusion text encoder model on CPU to save GPU memory.
"""

SD_CONTROLNET_HELP = """
Run Stable Diffusion inpainting model with ControlNet. You can switch control method in webui.
"""
DEFAULT_CONTROLNET_METHOD = "control_v11p_sd15_canny"
SD_CONTROLNET_CHOICES = [
    "control_v11p_sd15_canny",
    "control_v11p_sd15_openpose",
    "control_v11p_sd15_inpaint",
    "control_v11f1p_sd15_depth"
]

SD_LOCAL_MODEL_HELP = """
Load Stable Diffusion 1.5 model(ckpt/safetensors) from local path.
"""

LOCAL_FILES_ONLY_HELP = """
Use local files only, not connect to Hugging Face server. (sd/paint_by_example)
"""

ENABLE_XFORMERS_HELP = """
Enable xFormers optimizations. Requires xformers package has been installed. See: https://github.com/facebookresearch/xformers (sd/paint_by_example)
"""

DEFAULT_MODEL_DIR = os.getenv(
    "XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache")
)
MODEL_DIR_HELP = """
Model download directory (by setting XDG_CACHE_HOME environment variable), by default model downloaded to ~/.cache
"""

OUTPUT_DIR_HELP = """
Result images will be saved to output directory automatically without confirmation.
"""

INPUT_HELP = """
If input is image, it will be loaded by default.
If input is directory, you can browse and select image in file manager.
"""

GUI_HELP = """
Launch Lama Cleaner as desktop app
"""

NO_GUI_AUTO_CLOSE_HELP = """
Prevent backend auto close after the GUI window closed.
"""

QUALITY_HELP = """
Quality of image encoding, 0-100. Default is 95, higher quality will generate larger file size.
"""


class RealESRGANModelName(str, Enum):
    realesr_general_x4v3 = "realesr-general-x4v3"
    RealESRGAN_x4plus = "RealESRGAN_x4plus"
    RealESRGAN_x4plus_anime_6B = "RealESRGAN_x4plus_anime_6B"


RealESRGANModelNameList = [e.value for e in RealESRGANModelName]

INTERACTIVE_SEG_HELP = "Enable interactive segmentation using Segment Anything."
INTERACTIVE_SEG_MODEL_HELP = "Model size: vit_b < vit_l < vit_h. Bigger model size means better segmentation but slower speed."
AVAILABLE_INTERACTIVE_SEG_MODELS = ["vit_b", "vit_l", "vit_h"]
AVAILABLE_INTERACTIVE_SEG_DEVICES = ["cuda", "cpu", "mps"]
REMOVE_BG_HELP = "Enable remove background. Always run on CPU"
ANIMESEG_HELP = "Enable anime segmentation. Always run on CPU"
REALESRGAN_HELP = "Enable realesrgan super resolution"
REALESRGAN_AVAILABLE_DEVICES = ["cpu", "cuda", "mps"]
GFPGAN_HELP = (
    "Enable GFPGAN face restore. To enhance background, use with --enable-realesrgan"
)
GFPGAN_AVAILABLE_DEVICES = ["cpu", "cuda", "mps"]
RESTOREFORMER_HELP = "Enable RestoreFormer face restore. To enhance background, use with --enable-realesrgan"
RESTOREFORMER_AVAILABLE_DEVICES = ["cpu", "cuda", "mps"]
GIF_HELP = "Enable GIF plugin. Make GIF to compare original and cleaned image"


class Config(BaseModel):
    host: str = "127.0.0.1"
    port: int = 8080
    model: str = DEFAULT_MODEL
    sd_local_model_path: str = None
    sd_controlnet: bool = False
    sd_controlnet_method: str = DEFAULT_CONTROLNET_METHOD
    device: str = DEFAULT_DEVICE
    gui: bool = False
    no_gui_auto_close: bool = False
    no_half: bool = False
    cpu_offload: bool = False
    disable_nsfw: bool = False
    sd_cpu_textencoder: bool = False
    enable_xformers: bool = False
    local_files_only: bool = False
    model_dir: str = DEFAULT_MODEL_DIR
    input: str = None
    output_dir: str = None
    # plugins
    enable_interactive_seg: bool = False
    interactive_seg_model: str = "vit_l"
    interactive_seg_device: str = "cpu"
    enable_remove_bg: bool = False
    enable_anime_seg: bool = False
    enable_realesrgan: bool = False
    realesrgan_device: str = "cpu"
    realesrgan_model: str = RealESRGANModelName.realesr_general_x4v3.value
    realesrgan_no_half: bool = False
    enable_gfpgan: bool = False
    gfpgan_device: str = "cpu"
    enable_restoreformer: bool = False
    restoreformer_device: str = "cpu"
    enable_gif: bool = False


def load_config(installer_config: str):
    if os.path.exists(installer_config):
        with open(installer_config, "r", encoding="utf-8") as f:
            return Config(**json.load(f))
    else:
        return Config()
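A small sketch of how load_config above is meant to be driven (not part of this commit; the JSON file name and its contents are made up for illustration):

# Hypothetical installer config; any field of the Config model above may appear as a JSON key.
import json
from lama_cleaner.const import load_config

with open("installer_config.json", "w", encoding="utf-8") as f:
    json.dump({"model": "lama", "device": "cpu", "port": 8080}, f)

config = load_config("installer_config.json")
print(config.model, config.device, config.port)   # lama cpu 8080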
lama_cleaner/file_manager/__init__.py
ADDED
@@ -0,0 +1 @@
from .file_manager import FileManager
lama_cleaner/file_manager/file_manager.py
ADDED
@@ -0,0 +1,265 @@
1 |
+
# Copy from https://github.com/silentsokolov/flask-thumbnails/blob/master/flask_thumbnails/thumbnail.py
|
2 |
+
import os
|
3 |
+
from datetime import datetime
|
4 |
+
|
5 |
+
import cv2
|
6 |
+
import time
|
7 |
+
from io import BytesIO
|
8 |
+
from pathlib import Path
|
9 |
+
import numpy as np
|
10 |
+
# from watchdog.events import FileSystemEventHandler
|
11 |
+
# from watchdog.observers import Observer
|
12 |
+
|
13 |
+
from PIL import Image, ImageOps, PngImagePlugin
|
14 |
+
from loguru import logger
|
15 |
+
|
16 |
+
LARGE_ENOUGH_NUMBER = 100
|
17 |
+
PngImagePlugin.MAX_TEXT_CHUNK = LARGE_ENOUGH_NUMBER * (1024**2)
|
18 |
+
from .storage_backends import FilesystemStorageBackend
|
19 |
+
from .utils import aspect_to_string, generate_filename, glob_img
|
20 |
+
|
21 |
+
|
22 |
+
class FileManager:
|
23 |
+
def __init__(self, app=None):
|
24 |
+
self.app = app
|
25 |
+
self._default_root_directory = "media"
|
26 |
+
self._default_thumbnail_directory = "media"
|
27 |
+
self._default_root_url = "/"
|
28 |
+
self._default_thumbnail_root_url = "/"
|
29 |
+
self._default_format = "JPEG"
|
30 |
+
self.output_dir: Path = None
|
31 |
+
|
32 |
+
if app is not None:
|
33 |
+
self.init_app(app)
|
34 |
+
|
35 |
+
self.image_dir_filenames = []
|
36 |
+
self.output_dir_filenames = []
|
37 |
+
|
38 |
+
self.image_dir_observer = None
|
39 |
+
self.output_dir_observer = None
|
40 |
+
|
41 |
+
self.modified_time = {
|
42 |
+
"image": datetime.utcnow(),
|
43 |
+
"output": datetime.utcnow(),
|
44 |
+
}
|
45 |
+
|
46 |
+
# def start(self):
|
47 |
+
# self.image_dir_filenames = self._media_names(self.root_directory)
|
48 |
+
# self.output_dir_filenames = self._media_names(self.output_dir)
|
49 |
+
#
|
50 |
+
# logger.info(f"Start watching image directory: {self.root_directory}")
|
51 |
+
# self.image_dir_observer = Observer()
|
52 |
+
# self.image_dir_observer.schedule(self, self.root_directory, recursive=False)
|
53 |
+
# self.image_dir_observer.start()
|
54 |
+
#
|
55 |
+
# logger.info(f"Start watching output directory: {self.output_dir}")
|
56 |
+
# self.output_dir_observer = Observer()
|
57 |
+
# self.output_dir_observer.schedule(self, self.output_dir, recursive=False)
|
58 |
+
# self.output_dir_observer.start()
|
59 |
+
|
60 |
+
def on_modified(self, event):
|
61 |
+
if not os.path.isdir(event.src_path):
|
62 |
+
return
|
63 |
+
if event.src_path == str(self.root_directory):
|
64 |
+
logger.info(f"Image directory {event.src_path} modified")
|
65 |
+
self.image_dir_filenames = self._media_names(self.root_directory)
|
66 |
+
self.modified_time["image"] = datetime.utcnow()
|
67 |
+
elif event.src_path == str(self.output_dir):
|
68 |
+
logger.info(f"Output directory {event.src_path} modified")
|
69 |
+
self.output_dir_filenames = self._media_names(self.output_dir)
|
70 |
+
        self.modified_time["output"] = datetime.utcnow()

    def init_app(self, app):
        if self.app is None:
            self.app = app
            app.thumbnail_instance = self

        if not hasattr(app, "extensions"):
            app.extensions = {}

        if "thumbnail" in app.extensions:
            raise RuntimeError("Flask-thumbnail extension already initialized")

        app.extensions["thumbnail"] = self

        app.config.setdefault("THUMBNAIL_MEDIA_ROOT", self._default_root_directory)
        app.config.setdefault(
            "THUMBNAIL_MEDIA_THUMBNAIL_ROOT", self._default_thumbnail_directory
        )
        app.config.setdefault("THUMBNAIL_MEDIA_URL", self._default_root_url)
        app.config.setdefault(
            "THUMBNAIL_MEDIA_THUMBNAIL_URL", self._default_thumbnail_root_url
        )
        app.config.setdefault("THUMBNAIL_DEFAULT_FORMAT", self._default_format)

    @property
    def root_directory(self):
        path = self.app.config["THUMBNAIL_MEDIA_ROOT"]

        if os.path.isabs(path):
            return path
        else:
            return os.path.join(self.app.root_path, path)

    @property
    def thumbnail_directory(self):
        path = self.app.config["THUMBNAIL_MEDIA_THUMBNAIL_ROOT"]

        if os.path.isabs(path):
            return path
        else:
            return os.path.join(self.app.root_path, path)

    @property
    def root_url(self):
        return self.app.config["THUMBNAIL_MEDIA_URL"]

    @property
    def media_names(self):
        # return self.image_dir_filenames
        return self._media_names(self.root_directory)

    @property
    def output_media_names(self):
        return self._media_names(self.output_dir)
        # return self.output_dir_filenames

    @staticmethod
    def _media_names(directory: Path):
        names = sorted([it.name for it in glob_img(directory)])
        res = []
        for name in names:
            path = os.path.join(directory, name)
            img = Image.open(path)
            res.append(
                {
                    "name": name,
                    "height": img.height,
                    "width": img.width,
                    "ctime": os.path.getctime(path),
                    "mtime": os.path.getmtime(path),
                }
            )
        return res

    @property
    def thumbnail_url(self):
        return self.app.config["THUMBNAIL_MEDIA_THUMBNAIL_URL"]

    def get_thumbnail(
        self, directory: Path, original_filename: str, width, height, **options
    ):
        storage = FilesystemStorageBackend(self.app)
        crop = options.get("crop", "fit")
        background = options.get("background")
        quality = options.get("quality", 90)

        original_path, original_filename = os.path.split(original_filename)
        original_filepath = os.path.join(directory, original_path, original_filename)
        image = Image.open(BytesIO(storage.read(original_filepath)))

        # keep ratio resize
        if width is not None:
            height = int(image.height * width / image.width)
        else:
            width = int(image.width * height / image.height)

        thumbnail_size = (width, height)

        thumbnail_filename = generate_filename(
            original_filename,
            aspect_to_string(thumbnail_size),
            crop,
            background,
            quality,
        )

        thumbnail_filepath = os.path.join(
            self.thumbnail_directory, original_path, thumbnail_filename
        )
        thumbnail_url = os.path.join(
            self.thumbnail_url, original_path, thumbnail_filename
        )

        if storage.exists(thumbnail_filepath):
            return thumbnail_url, (width, height)

        try:
            image.load()
        except (IOError, OSError):
            self.app.logger.warning("Thumbnail could not load image: %s", original_filepath)
            return thumbnail_url, (width, height)

        # get original image format
        options["format"] = options.get("format", image.format)

        image = self._create_thumbnail(
            image, thumbnail_size, crop, background=background
        )

        raw_data = self.get_raw_data(image, **options)
        storage.save(thumbnail_filepath, raw_data)

        return thumbnail_url, (width, height)

    def get_raw_data(self, image, **options):
        data = {
            "format": self._get_format(image, **options),
            "quality": options.get("quality", 90),
        }

        _file = BytesIO()
        image.save(_file, **data)
        return _file.getvalue()

    @staticmethod
    def colormode(image, colormode="RGB"):
        if colormode == "RGB" or colormode == "RGBA":
            if image.mode == "RGBA":
                return image
            if image.mode == "LA":
                return image.convert("RGBA")
            return image.convert(colormode)

        if colormode == "GRAY":
            return image.convert("L")

        return image.convert(colormode)

    @staticmethod
    def background(original_image, color=0xFF):
        size = (max(original_image.size),) * 2
        image = Image.new("L", size, color)
        image.paste(
            original_image,
            # integer offsets, since PIL.Image.paste expects an int box
            tuple(map(lambda x: (x[0] - x[1]) // 2, zip(size, original_image.size))),
        )

        return image

    def _get_format(self, image, **options):
        if options.get("format"):
            return options.get("format")
        if image.format:
            return image.format

        return self.app.config["THUMBNAIL_DEFAULT_FORMAT"]

    def _create_thumbnail(self, image, size, crop="fit", background=None):
        try:
            resample = Image.Resampling.LANCZOS
        except AttributeError:  # pylint: disable=raise-missing-from
            resample = Image.ANTIALIAS

        if crop == "fit":
            image = ImageOps.fit(image, size, resample)
        else:
            image = image.copy()
            image.thumbnail(size, resample=resample)

        if background is not None:
            image = self.background(image)

        image = self.colormode(image)

        return image
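A standalone sketch of the keep-ratio branch in get_thumbnail above: whichever of width/height is supplied, the other is derived from the source aspect ratio. The helper name keep_ratio_size is illustrative only and not part of this module.

def keep_ratio_size(orig_w, orig_h, width=None, height=None):
    # Mirrors the resize logic in get_thumbnail: derive the missing dimension
    # from the original aspect ratio.
    if width is not None:
        height = int(orig_h * width / orig_w)
    else:
        width = int(orig_w * height / orig_h)
    return width, height

# A 1920x1080 source constrained to width=640 yields (640, 360).
print(keep_ratio_size(1920, 1080, width=640))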
lama_cleaner/file_manager/storage_backends.py
ADDED
@@ -0,0 +1,46 @@
# Copy from https://github.com/silentsokolov/flask-thumbnails/blob/master/flask_thumbnails/storage_backends.py
import errno
import os
from abc import ABC, abstractmethod


class BaseStorageBackend(ABC):
    def __init__(self, app=None):
        self.app = app

    @abstractmethod
    def read(self, filepath, mode="rb", **kwargs):
        raise NotImplementedError

    @abstractmethod
    def exists(self, filepath):
        raise NotImplementedError

    @abstractmethod
    def save(self, filepath, data):
        raise NotImplementedError


class FilesystemStorageBackend(BaseStorageBackend):
    def read(self, filepath, mode="rb", **kwargs):
        with open(filepath, mode) as f:  # pylint: disable=unspecified-encoding
            return f.read()

    def exists(self, filepath):
        return os.path.exists(filepath)

    def save(self, filepath, data):
        directory = os.path.dirname(filepath)

        if not os.path.exists(directory):
            try:
                os.makedirs(directory)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

        if not os.path.isdir(directory):
            raise IOError("{} is not a directory".format(directory))

        with open(filepath, "wb") as f:
            f.write(data)
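A minimal round-trip sketch for the filesystem backend above, assuming the package is importable as lama_cleaner.file_manager.storage_backends in the current environment; save() creates missing parent directories before writing.

import os
import tempfile

from lama_cleaner.file_manager.storage_backends import FilesystemStorageBackend

backend = FilesystemStorageBackend(app=None)
path = os.path.join(tempfile.mkdtemp(), "nested", "demo.bin")
backend.save(path, b"hello")            # creates the missing "nested" directory
assert backend.exists(path)
assert backend.read(path) == b"hello"   # default read mode is "rb"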
lama_cleaner/file_manager/utils.py
ADDED
@@ -0,0 +1,67 @@
# Copy from: https://github.com/silentsokolov/flask-thumbnails/blob/master/flask_thumbnails/utils.py
import importlib
import os
from pathlib import Path

from typing import Union


def generate_filename(original_filename, *options):
    name, ext = os.path.splitext(original_filename)
    for v in options:
        if v:
            name += "_%s" % v
    name += ext

    return name


def parse_size(size):
    if isinstance(size, int):
        # If the size parameter is a single number, assume square aspect.
        return [size, size]

    if isinstance(size, (tuple, list)):
        if len(size) == 1:
            # If a single-value tuple/list is provided, expand it to two elements.
            return size + type(size)(size)
        return size

    try:
        thumbnail_size = [int(x) for x in size.lower().split("x", 1)]
    except ValueError:
        raise ValueError(  # pylint: disable=raise-missing-from
            "Bad thumbnail size format. Valid format is INTxINT."
        )

    if len(thumbnail_size) == 1:
        # If the size parameter only contains a single integer, assume square aspect.
        thumbnail_size.append(thumbnail_size[0])

    return thumbnail_size


def aspect_to_string(size):
    if isinstance(size, str):
        return size

    return "x".join(map(str, size))


IMG_SUFFIX = {'.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG'}


def glob_img(p: Union[Path, str], recursive: bool = False):
    p = Path(p)
    if p.is_file() and p.suffix in IMG_SUFFIX:
        yield p
    else:
        if recursive:
            files = Path(p).glob("**/*.*")
        else:
            files = Path(p).glob("*.*")

        for it in files:
            if it.suffix not in IMG_SUFFIX:
                continue
            yield it
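Expected behaviour of the helpers above, assuming the module is importable as lama_cleaner.file_manager.utils; the printed values follow directly from the code.

from lama_cleaner.file_manager.utils import aspect_to_string, generate_filename, parse_size

print(generate_filename("cat.png", "100x100", "fit", None, 90))
# -> cat_100x100_fit_90.png (falsy options such as None are skipped)
print(parse_size(128))               # -> [128, 128]
print(parse_size("200x100"))         # -> [200, 100]
print(aspect_to_string((200, 100)))  # -> 200x100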
lama_cleaner/helper.py
ADDED
@@ -0,0 +1,292 @@
1 |
+
import io
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
from typing import List, Optional
|
5 |
+
|
6 |
+
from urllib.parse import urlparse
|
7 |
+
import cv2
|
8 |
+
from PIL import Image, ImageOps, PngImagePlugin
|
9 |
+
import numpy as np
|
10 |
+
import torch
|
11 |
+
from lama_cleaner.const import MPS_SUPPORT_MODELS
|
12 |
+
from loguru import logger
|
13 |
+
from torch.hub import download_url_to_file, get_dir
|
14 |
+
import hashlib
|
15 |
+
|
16 |
+
|
17 |
+
def md5sum(filename):
|
18 |
+
md5 = hashlib.md5()
|
19 |
+
with open(filename, "rb") as f:
|
20 |
+
for chunk in iter(lambda: f.read(128 * md5.block_size), b""):
|
21 |
+
md5.update(chunk)
|
22 |
+
return md5.hexdigest()
|
23 |
+
|
24 |
+
|
25 |
+
def switch_mps_device(model_name, device):
|
26 |
+
if model_name not in MPS_SUPPORT_MODELS and str(device) == "mps":
|
27 |
+
logger.info(f"{model_name} not support mps, switch to cpu")
|
28 |
+
return torch.device("cpu")
|
29 |
+
return device
|
30 |
+
|
31 |
+
|
32 |
+
def get_cache_path_by_url(url):
|
33 |
+
parts = urlparse(url)
|
34 |
+
hub_dir = get_dir()
|
35 |
+
model_dir = os.path.join(hub_dir, "checkpoints")
|
36 |
+
if not os.path.isdir(model_dir):
|
37 |
+
os.makedirs(model_dir)
|
38 |
+
filename = os.path.basename(parts.path)
|
39 |
+
cached_file = os.path.join(model_dir, filename)
|
40 |
+
return cached_file
|
41 |
+
|
42 |
+
|
43 |
+
def download_model(url, model_md5: str = None):
|
44 |
+
cached_file = get_cache_path_by_url(url)
|
45 |
+
if not os.path.exists(cached_file):
|
46 |
+
sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file))
|
47 |
+
hash_prefix = None
|
48 |
+
download_url_to_file(url, cached_file, hash_prefix, progress=True)
|
49 |
+
if model_md5:
|
50 |
+
_md5 = md5sum(cached_file)
|
51 |
+
if model_md5 == _md5:
|
52 |
+
logger.info(f"Download model success, md5: {_md5}")
|
53 |
+
else:
|
54 |
+
try:
|
55 |
+
os.remove(cached_file)
|
56 |
+
logger.error(
|
57 |
+
f"Model md5: {_md5}, expected md5: {model_md5}, wrong model deleted. Please restart lama-cleaner."
|
58 |
+
f"If you still have errors, please try download model manually first https://lama-cleaner-docs.vercel.app/install/download_model_manually.\n"
|
59 |
+
)
|
60 |
+
except:
|
61 |
+
logger.error(
|
62 |
+
f"Model md5: {_md5}, expected md5: {model_md5}, please delete {cached_file} and restart lama-cleaner."
|
63 |
+
)
|
64 |
+
exit(-1)
|
65 |
+
|
66 |
+
return cached_file
|
67 |
+
|
68 |
+
|
69 |
+
def ceil_modulo(x, mod):
|
70 |
+
if x % mod == 0:
|
71 |
+
return x
|
72 |
+
return (x // mod + 1) * mod
|
73 |
+
|
74 |
+
|
75 |
+
def handle_error(model_path, model_md5, e):
|
76 |
+
_md5 = md5sum(model_path)
|
77 |
+
if _md5 != model_md5:
|
78 |
+
try:
|
79 |
+
os.remove(model_path)
|
80 |
+
logger.error(
|
81 |
+
f"Model md5: {_md5}, expected md5: {model_md5}, wrong model deleted. Please restart lama-cleaner."
|
82 |
+
f"If you still have errors, please try download model manually first https://lama-cleaner-docs.vercel.app/install/download_model_manually.\n"
|
83 |
+
)
|
84 |
+
except:
|
85 |
+
logger.error(
|
86 |
+
f"Model md5: {_md5}, expected md5: {model_md5}, please delete {model_path} and restart lama-cleaner."
|
87 |
+
)
|
88 |
+
else:
|
89 |
+
logger.error(
|
90 |
+
f"Failed to load model {model_path},"
|
91 |
+
f"please submit an issue at https://github.com/Sanster/lama-cleaner/issues and include a screenshot of the error:\n{e}"
|
92 |
+
)
|
93 |
+
exit(-1)
|
94 |
+
|
95 |
+
|
96 |
+
def load_jit_model(url_or_path, device, model_md5: str):
|
97 |
+
if os.path.exists(url_or_path):
|
98 |
+
model_path = url_or_path
|
99 |
+
else:
|
100 |
+
model_path = download_model(url_or_path, model_md5)
|
101 |
+
|
102 |
+
logger.info(f"Loading model from: {model_path}")
|
103 |
+
try:
|
104 |
+
model = torch.jit.load(model_path, map_location="cpu").to(device)
|
105 |
+
except Exception as e:
|
106 |
+
handle_error(model_path, model_md5, e)
|
107 |
+
model.eval()
|
108 |
+
return model
|
109 |
+
|
110 |
+
|
111 |
+
def load_model(model: torch.nn.Module, url_or_path, device, model_md5):
|
112 |
+
if os.path.exists(url_or_path):
|
113 |
+
model_path = url_or_path
|
114 |
+
else:
|
115 |
+
model_path = download_model(url_or_path, model_md5)
|
116 |
+
|
117 |
+
try:
|
118 |
+
logger.info(f"Loading model from: {model_path}")
|
119 |
+
state_dict = torch.load(model_path, map_location="cpu")
|
120 |
+
model.load_state_dict(state_dict, strict=True)
|
121 |
+
model.to(device)
|
122 |
+
except Exception as e:
|
123 |
+
handle_error(model_path, model_md5, e)
|
124 |
+
model.eval()
|
125 |
+
return model
|
126 |
+
|
127 |
+
|
128 |
+
def numpy_to_bytes(image_numpy: np.ndarray, ext: str) -> bytes:
|
129 |
+
data = cv2.imencode(
|
130 |
+
f".{ext}",
|
131 |
+
image_numpy,
|
132 |
+
[int(cv2.IMWRITE_JPEG_QUALITY), 100, int(cv2.IMWRITE_PNG_COMPRESSION), 0],
|
133 |
+
)[1]
|
134 |
+
image_bytes = data.tobytes()
|
135 |
+
return image_bytes
|
136 |
+
|
137 |
+
|
138 |
+
def pil_to_bytes(pil_img, ext: str, quality: int = 95, exif_infos={}) -> bytes:
|
139 |
+
with io.BytesIO() as output:
|
140 |
+
kwargs = {k: v for k, v in exif_infos.items() if v is not None}
|
141 |
+
if ext == "png" and "parameters" in kwargs:
|
142 |
+
pnginfo_data = PngImagePlugin.PngInfo()
|
143 |
+
pnginfo_data.add_text("parameters", kwargs["parameters"])
|
144 |
+
kwargs["pnginfo"] = pnginfo_data
|
145 |
+
|
146 |
+
pil_img.save(
|
147 |
+
output,
|
148 |
+
format=ext,
|
149 |
+
quality=quality,
|
150 |
+
**kwargs,
|
151 |
+
)
|
152 |
+
image_bytes = output.getvalue()
|
153 |
+
return image_bytes
|
154 |
+
|
155 |
+
|
156 |
+
def load_img(img_bytes, gray: bool = False, return_exif: bool = False):
|
157 |
+
alpha_channel = None
|
158 |
+
image = Image.open(io.BytesIO(img_bytes))
|
159 |
+
|
160 |
+
if return_exif:
|
161 |
+
info = image.info or {}
|
162 |
+
exif_infos = {"exif": image.getexif(), "parameters": info.get("parameters")}
|
163 |
+
|
164 |
+
try:
|
165 |
+
image = ImageOps.exif_transpose(image)
|
166 |
+
except:
|
167 |
+
pass
|
168 |
+
|
169 |
+
if gray:
|
170 |
+
image = image.convert("L")
|
171 |
+
np_img = np.array(image)
|
172 |
+
else:
|
173 |
+
if image.mode == "RGBA":
|
174 |
+
np_img = np.array(image)
|
175 |
+
alpha_channel = np_img[:, :, -1]
|
176 |
+
np_img = cv2.cvtColor(np_img, cv2.COLOR_RGBA2RGB)
|
177 |
+
else:
|
178 |
+
image = image.convert("RGB")
|
179 |
+
np_img = np.array(image)
|
180 |
+
|
181 |
+
if return_exif:
|
182 |
+
return np_img, alpha_channel, exif_infos
|
183 |
+
return np_img, alpha_channel
|
184 |
+
|
185 |
+
|
186 |
+
def norm_img(np_img):
|
187 |
+
if len(np_img.shape) == 2:
|
188 |
+
np_img = np_img[:, :, np.newaxis]
|
189 |
+
np_img = np.transpose(np_img, (2, 0, 1))
|
190 |
+
np_img = np_img.astype("float32") / 255
|
191 |
+
return np_img
|
192 |
+
|
193 |
+
|
194 |
+
def resize_max_size(
|
195 |
+
np_img, size_limit: int, interpolation=cv2.INTER_CUBIC
|
196 |
+
) -> np.ndarray:
|
197 |
+
# Resize image's longer size to size_limit if longer size larger than size_limit
|
198 |
+
h, w = np_img.shape[:2]
|
199 |
+
if max(h, w) > size_limit:
|
200 |
+
ratio = size_limit / max(h, w)
|
201 |
+
new_w = int(w * ratio + 0.5)
|
202 |
+
new_h = int(h * ratio + 0.5)
|
203 |
+
return cv2.resize(np_img, dsize=(new_w, new_h), interpolation=interpolation)
|
204 |
+
else:
|
205 |
+
return np_img
|
206 |
+
|
207 |
+
|
208 |
+
def pad_img_to_modulo(
|
209 |
+
img: np.ndarray, mod: int, square: bool = False, min_size: Optional[int] = None
|
210 |
+
):
|
211 |
+
"""
|
212 |
+
|
213 |
+
Args:
|
214 |
+
img: [H, W, C]
|
215 |
+
mod:
|
216 |
+
square: 是否为正方形
|
217 |
+
min_size:
|
218 |
+
|
219 |
+
Returns:
|
220 |
+
|
221 |
+
"""
|
222 |
+
if len(img.shape) == 2:
|
223 |
+
img = img[:, :, np.newaxis]
|
224 |
+
height, width = img.shape[:2]
|
225 |
+
out_height = ceil_modulo(height, mod)
|
226 |
+
out_width = ceil_modulo(width, mod)
|
227 |
+
|
228 |
+
if min_size is not None:
|
229 |
+
assert min_size % mod == 0
|
230 |
+
out_width = max(min_size, out_width)
|
231 |
+
out_height = max(min_size, out_height)
|
232 |
+
|
233 |
+
if square:
|
234 |
+
max_size = max(out_height, out_width)
|
235 |
+
out_height = max_size
|
236 |
+
out_width = max_size
|
237 |
+
|
238 |
+
return np.pad(
|
239 |
+
img,
|
240 |
+
((0, out_height - height), (0, out_width - width), (0, 0)),
|
241 |
+
mode="symmetric",
|
242 |
+
)
|
243 |
+
|
244 |
+
|
245 |
+
def boxes_from_mask(mask: np.ndarray) -> List[np.ndarray]:
|
246 |
+
"""
|
247 |
+
Args:
|
248 |
+
mask: (h, w, 1) 0~255
|
249 |
+
|
250 |
+
Returns:
|
251 |
+
|
252 |
+
"""
|
253 |
+
height, width = mask.shape[:2]
|
254 |
+
_, thresh = cv2.threshold(mask, 127, 255, 0)
|
255 |
+
contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
256 |
+
|
257 |
+
boxes = []
|
258 |
+
for cnt in contours:
|
259 |
+
x, y, w, h = cv2.boundingRect(cnt)
|
260 |
+
box = np.array([x, y, x + w, y + h]).astype(int)
|
261 |
+
|
262 |
+
box[::2] = np.clip(box[::2], 0, width)
|
263 |
+
box[1::2] = np.clip(box[1::2], 0, height)
|
264 |
+
boxes.append(box)
|
265 |
+
|
266 |
+
return boxes
|
267 |
+
|
268 |
+
|
269 |
+
def only_keep_largest_contour(mask: np.ndarray) -> List[np.ndarray]:
|
270 |
+
"""
|
271 |
+
Args:
|
272 |
+
mask: (h, w) 0~255
|
273 |
+
|
274 |
+
Returns:
|
275 |
+
|
276 |
+
"""
|
277 |
+
_, thresh = cv2.threshold(mask, 127, 255, 0)
|
278 |
+
contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
279 |
+
|
280 |
+
max_area = 0
|
281 |
+
max_index = -1
|
282 |
+
for i, cnt in enumerate(contours):
|
283 |
+
area = cv2.contourArea(cnt)
|
284 |
+
if area > max_area:
|
285 |
+
max_area = area
|
286 |
+
max_index = i
|
287 |
+
|
288 |
+
if max_index != -1:
|
289 |
+
new_mask = np.zeros_like(mask)
|
290 |
+
return cv2.drawContours(new_mask, contours, max_index, 255, -1)
|
291 |
+
else:
|
292 |
+
return mask
|
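A standalone sketch of how helper.py prepares images for the models: pad each side up to a multiple of mod, run the network, then crop back to the original size. Only ceil_modulo and pad_img_to_modulo are taken from the module; the rest is illustrative.

import numpy as np

from lama_cleaner.helper import ceil_modulo, pad_img_to_modulo

img = np.zeros((511, 769, 3), dtype=np.uint8)
padded = pad_img_to_modulo(img, mod=8)
print(ceil_modulo(511, 8), ceil_modulo(769, 8))  # 512 776
print(padded.shape)                              # (512, 776, 3)
# After inference the result is cropped back with result[:511, :769, :].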
lama_cleaner/installer.py
ADDED
@@ -0,0 +1,12 @@
import subprocess
import sys


def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])


def install_plugins_package():
    install("rembg")
    install("realesrgan")
    install("gfpgan")
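Usage sketch for the installer above; subprocess.check_call raises CalledProcessError if pip exits with a non-zero status, so failures surface as exceptions.

from lama_cleaner.installer import install

install("rembg")  # equivalent to: python -m pip install rembg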
lama_cleaner/model/__init__.py
ADDED
File without changes
lama_cleaner/model/base.py
ADDED
@@ -0,0 +1,298 @@
1 |
+
import abc
|
2 |
+
from typing import Optional
|
3 |
+
|
4 |
+
import cv2
|
5 |
+
import torch
|
6 |
+
import numpy as np
|
7 |
+
from loguru import logger
|
8 |
+
|
9 |
+
from lama_cleaner.helper import (
|
10 |
+
boxes_from_mask,
|
11 |
+
resize_max_size,
|
12 |
+
pad_img_to_modulo,
|
13 |
+
switch_mps_device,
|
14 |
+
)
|
15 |
+
from lama_cleaner.schema import Config, HDStrategy
|
16 |
+
|
17 |
+
|
18 |
+
class InpaintModel:
|
19 |
+
name = "base"
|
20 |
+
min_size: Optional[int] = None
|
21 |
+
pad_mod = 8
|
22 |
+
pad_to_square = False
|
23 |
+
|
24 |
+
def __init__(self, device, **kwargs):
|
25 |
+
"""
|
26 |
+
|
27 |
+
Args:
|
28 |
+
device:
|
29 |
+
"""
|
30 |
+
device = switch_mps_device(self.name, device)
|
31 |
+
self.device = device
|
32 |
+
self.init_model(device, **kwargs)
|
33 |
+
|
34 |
+
@abc.abstractmethod
|
35 |
+
def init_model(self, device, **kwargs):
|
36 |
+
...
|
37 |
+
|
38 |
+
@staticmethod
|
39 |
+
@abc.abstractmethod
|
40 |
+
def is_downloaded() -> bool:
|
41 |
+
...
|
42 |
+
|
43 |
+
@abc.abstractmethod
|
44 |
+
def forward(self, image, mask, config: Config):
|
45 |
+
"""Input images and output images have same size
|
46 |
+
images: [H, W, C] RGB
|
47 |
+
masks: [H, W, 1] 255 为 masks 区域
|
48 |
+
return: BGR IMAGE
|
49 |
+
"""
|
50 |
+
...
|
51 |
+
|
52 |
+
def _pad_forward(self, image, mask, config: Config):
|
53 |
+
origin_height, origin_width = image.shape[:2]
|
54 |
+
pad_image = pad_img_to_modulo(
|
55 |
+
image, mod=self.pad_mod, square=self.pad_to_square, min_size=self.min_size
|
56 |
+
)
|
57 |
+
pad_mask = pad_img_to_modulo(
|
58 |
+
mask, mod=self.pad_mod, square=self.pad_to_square, min_size=self.min_size
|
59 |
+
)
|
60 |
+
|
61 |
+
logger.info(f"final forward pad size: {pad_image.shape}")
|
62 |
+
|
63 |
+
result = self.forward(pad_image, pad_mask, config)
|
64 |
+
result = result[0:origin_height, 0:origin_width, :]
|
65 |
+
|
66 |
+
result, image, mask = self.forward_post_process(result, image, mask, config)
|
67 |
+
|
68 |
+
mask = mask[:, :, np.newaxis]
|
69 |
+
result = result * (mask / 255) + image[:, :, ::-1] * (1 - (mask / 255))
|
70 |
+
return result
|
71 |
+
|
72 |
+
def forward_post_process(self, result, image, mask, config):
|
73 |
+
return result, image, mask
|
74 |
+
|
75 |
+
@torch.no_grad()
|
76 |
+
def __call__(self, image, mask, config: Config):
|
77 |
+
"""
|
78 |
+
images: [H, W, C] RGB, not normalized
|
79 |
+
masks: [H, W]
|
80 |
+
return: BGR IMAGE
|
81 |
+
"""
|
82 |
+
inpaint_result = None
|
83 |
+
logger.info(f"hd_strategy: {config.hd_strategy}")
|
84 |
+
if config.hd_strategy == HDStrategy.CROP:
|
85 |
+
if max(image.shape) > config.hd_strategy_crop_trigger_size:
|
86 |
+
logger.info(f"Run crop strategy")
|
87 |
+
boxes = boxes_from_mask(mask)
|
88 |
+
crop_result = []
|
89 |
+
for box in boxes:
|
90 |
+
crop_image, crop_box = self._run_box(image, mask, box, config)
|
91 |
+
crop_result.append((crop_image, crop_box))
|
92 |
+
|
93 |
+
inpaint_result = image[:, :, ::-1]
|
94 |
+
for crop_image, crop_box in crop_result:
|
95 |
+
x1, y1, x2, y2 = crop_box
|
96 |
+
inpaint_result[y1:y2, x1:x2, :] = crop_image
|
97 |
+
|
98 |
+
elif config.hd_strategy == HDStrategy.RESIZE:
|
99 |
+
if max(image.shape) > config.hd_strategy_resize_limit:
|
100 |
+
origin_size = image.shape[:2]
|
101 |
+
downsize_image = resize_max_size(
|
102 |
+
image, size_limit=config.hd_strategy_resize_limit
|
103 |
+
)
|
104 |
+
downsize_mask = resize_max_size(
|
105 |
+
mask, size_limit=config.hd_strategy_resize_limit
|
106 |
+
)
|
107 |
+
|
108 |
+
logger.info(
|
109 |
+
f"Run resize strategy, origin size: {image.shape} forward size: {downsize_image.shape}"
|
110 |
+
)
|
111 |
+
inpaint_result = self._pad_forward(
|
112 |
+
downsize_image, downsize_mask, config
|
113 |
+
)
|
114 |
+
|
115 |
+
# only paste masked area result
|
116 |
+
inpaint_result = cv2.resize(
|
117 |
+
inpaint_result,
|
118 |
+
(origin_size[1], origin_size[0]),
|
119 |
+
interpolation=cv2.INTER_CUBIC,
|
120 |
+
)
|
121 |
+
original_pixel_indices = mask < 127
|
122 |
+
inpaint_result[original_pixel_indices] = image[:, :, ::-1][
|
123 |
+
original_pixel_indices
|
124 |
+
]
|
125 |
+
|
126 |
+
if inpaint_result is None:
|
127 |
+
inpaint_result = self._pad_forward(image, mask, config)
|
128 |
+
|
129 |
+
return inpaint_result
|
130 |
+
|
131 |
+
def _crop_box(self, image, mask, box, config: Config):
|
132 |
+
"""
|
133 |
+
|
134 |
+
Args:
|
135 |
+
image: [H, W, C] RGB
|
136 |
+
mask: [H, W, 1]
|
137 |
+
box: [left,top,right,bottom]
|
138 |
+
|
139 |
+
Returns:
|
140 |
+
BGR IMAGE, (l, r, r, b)
|
141 |
+
"""
|
142 |
+
box_h = box[3] - box[1]
|
143 |
+
box_w = box[2] - box[0]
|
144 |
+
cx = (box[0] + box[2]) // 2
|
145 |
+
cy = (box[1] + box[3]) // 2
|
146 |
+
img_h, img_w = image.shape[:2]
|
147 |
+
|
148 |
+
w = box_w + config.hd_strategy_crop_margin * 2
|
149 |
+
h = box_h + config.hd_strategy_crop_margin * 2
|
150 |
+
|
151 |
+
_l = cx - w // 2
|
152 |
+
_r = cx + w // 2
|
153 |
+
_t = cy - h // 2
|
154 |
+
_b = cy + h // 2
|
155 |
+
|
156 |
+
l = max(_l, 0)
|
157 |
+
r = min(_r, img_w)
|
158 |
+
t = max(_t, 0)
|
159 |
+
b = min(_b, img_h)
|
160 |
+
|
161 |
+
# try to get more context when crop around image edge
|
162 |
+
if _l < 0:
|
163 |
+
r += abs(_l)
|
164 |
+
if _r > img_w:
|
165 |
+
l -= _r - img_w
|
166 |
+
if _t < 0:
|
167 |
+
b += abs(_t)
|
168 |
+
if _b > img_h:
|
169 |
+
t -= _b - img_h
|
170 |
+
|
171 |
+
l = max(l, 0)
|
172 |
+
r = min(r, img_w)
|
173 |
+
t = max(t, 0)
|
174 |
+
b = min(b, img_h)
|
175 |
+
|
176 |
+
crop_img = image[t:b, l:r, :]
|
177 |
+
crop_mask = mask[t:b, l:r]
|
178 |
+
|
179 |
+
logger.info(f"box size: ({box_h},{box_w}) crop size: {crop_img.shape}")
|
180 |
+
|
181 |
+
return crop_img, crop_mask, [l, t, r, b]
|
182 |
+
|
183 |
+
def _calculate_cdf(self, histogram):
|
184 |
+
cdf = histogram.cumsum()
|
185 |
+
normalized_cdf = cdf / float(cdf.max())
|
186 |
+
return normalized_cdf
|
187 |
+
|
188 |
+
def _calculate_lookup(self, source_cdf, reference_cdf):
|
189 |
+
lookup_table = np.zeros(256)
|
190 |
+
lookup_val = 0
|
191 |
+
for source_index, source_val in enumerate(source_cdf):
|
192 |
+
for reference_index, reference_val in enumerate(reference_cdf):
|
193 |
+
if reference_val >= source_val:
|
194 |
+
lookup_val = reference_index
|
195 |
+
break
|
196 |
+
lookup_table[source_index] = lookup_val
|
197 |
+
return lookup_table
|
198 |
+
|
199 |
+
def _match_histograms(self, source, reference, mask):
|
200 |
+
transformed_channels = []
|
201 |
+
for channel in range(source.shape[-1]):
|
202 |
+
source_channel = source[:, :, channel]
|
203 |
+
reference_channel = reference[:, :, channel]
|
204 |
+
|
205 |
+
# only calculate histograms for non-masked parts
|
206 |
+
source_histogram, _ = np.histogram(source_channel[mask == 0], 256, [0, 256])
|
207 |
+
reference_histogram, _ = np.histogram(
|
208 |
+
reference_channel[mask == 0], 256, [0, 256]
|
209 |
+
)
|
210 |
+
|
211 |
+
source_cdf = self._calculate_cdf(source_histogram)
|
212 |
+
reference_cdf = self._calculate_cdf(reference_histogram)
|
213 |
+
|
214 |
+
lookup = self._calculate_lookup(source_cdf, reference_cdf)
|
215 |
+
|
216 |
+
transformed_channels.append(cv2.LUT(source_channel, lookup))
|
217 |
+
|
218 |
+
result = cv2.merge(transformed_channels)
|
219 |
+
result = cv2.convertScaleAbs(result)
|
220 |
+
|
221 |
+
return result
|
222 |
+
|
223 |
+
def _apply_cropper(self, image, mask, config: Config):
|
224 |
+
img_h, img_w = image.shape[:2]
|
225 |
+
l, t, w, h = (
|
226 |
+
config.croper_x,
|
227 |
+
config.croper_y,
|
228 |
+
config.croper_width,
|
229 |
+
config.croper_height,
|
230 |
+
)
|
231 |
+
r = l + w
|
232 |
+
b = t + h
|
233 |
+
|
234 |
+
l = max(l, 0)
|
235 |
+
r = min(r, img_w)
|
236 |
+
t = max(t, 0)
|
237 |
+
b = min(b, img_h)
|
238 |
+
|
239 |
+
crop_img = image[t:b, l:r, :]
|
240 |
+
crop_mask = mask[t:b, l:r]
|
241 |
+
return crop_img, crop_mask, (l, t, r, b)
|
242 |
+
|
243 |
+
def _run_box(self, image, mask, box, config: Config):
|
244 |
+
"""
|
245 |
+
|
246 |
+
Args:
|
247 |
+
image: [H, W, C] RGB
|
248 |
+
mask: [H, W, 1]
|
249 |
+
box: [left,top,right,bottom]
|
250 |
+
|
251 |
+
Returns:
|
252 |
+
BGR IMAGE
|
253 |
+
"""
|
254 |
+
crop_img, crop_mask, [l, t, r, b] = self._crop_box(image, mask, box, config)
|
255 |
+
|
256 |
+
return self._pad_forward(crop_img, crop_mask, config), [l, t, r, b]
|
257 |
+
|
258 |
+
|
259 |
+
class DiffusionInpaintModel(InpaintModel):
|
260 |
+
@torch.no_grad()
|
261 |
+
def __call__(self, image, mask, config: Config):
|
262 |
+
"""
|
263 |
+
images: [H, W, C] RGB, not normalized
|
264 |
+
masks: [H, W]
|
265 |
+
return: BGR IMAGE
|
266 |
+
"""
|
267 |
+
# boxes = boxes_from_mask(mask)
|
268 |
+
if config.use_croper:
|
269 |
+
crop_img, crop_mask, (l, t, r, b) = self._apply_cropper(image, mask, config)
|
270 |
+
crop_image = self._scaled_pad_forward(crop_img, crop_mask, config)
|
271 |
+
inpaint_result = image[:, :, ::-1]
|
272 |
+
inpaint_result[t:b, l:r, :] = crop_image
|
273 |
+
else:
|
274 |
+
inpaint_result = self._scaled_pad_forward(image, mask, config)
|
275 |
+
|
276 |
+
return inpaint_result
|
277 |
+
|
278 |
+
def _scaled_pad_forward(self, image, mask, config: Config):
|
279 |
+
longer_side_length = int(config.sd_scale * max(image.shape[:2]))
|
280 |
+
origin_size = image.shape[:2]
|
281 |
+
downsize_image = resize_max_size(image, size_limit=longer_side_length)
|
282 |
+
downsize_mask = resize_max_size(mask, size_limit=longer_side_length)
|
283 |
+
if config.sd_scale != 1:
|
284 |
+
logger.info(
|
285 |
+
f"Resize image to do sd inpainting: {image.shape} -> {downsize_image.shape}"
|
286 |
+
)
|
287 |
+
inpaint_result = self._pad_forward(downsize_image, downsize_mask, config)
|
288 |
+
# only paste masked area result
|
289 |
+
inpaint_result = cv2.resize(
|
290 |
+
inpaint_result,
|
291 |
+
(origin_size[1], origin_size[0]),
|
292 |
+
interpolation=cv2.INTER_CUBIC,
|
293 |
+
)
|
294 |
+
original_pixel_indices = mask < 127
|
295 |
+
inpaint_result[original_pixel_indices] = image[:, :, ::-1][
|
296 |
+
original_pixel_indices
|
297 |
+
]
|
298 |
+
return inpaint_result
|
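A minimal sketch of the InpaintModel contract defined above: a subclass provides init_model, is_downloaded and forward, and the base class supplies padding, cropping and the HD strategies. IdentityModel is illustrative only and is not a model shipped with this repository.

from lama_cleaner.model.base import InpaintModel
from lama_cleaner.schema import Config


class IdentityModel(InpaintModel):
    name = "identity"
    pad_mod = 8

    def init_model(self, device, **kwargs):
        pass

    @staticmethod
    def is_downloaded() -> bool:
        return True

    def forward(self, image, mask, config: Config):
        # image is [H, W, C] RGB; the convention above is to return a BGR result.
        return image[:, :, ::-1]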
lama_cleaner/model/controlnet.py
ADDED
@@ -0,0 +1,289 @@
1 |
+
import gc
|
2 |
+
|
3 |
+
import PIL.Image
|
4 |
+
import cv2
|
5 |
+
import numpy as np
|
6 |
+
import torch
|
7 |
+
from diffusers import ControlNetModel
|
8 |
+
from loguru import logger
|
9 |
+
|
10 |
+
from lama_cleaner.model.base import DiffusionInpaintModel
|
11 |
+
from lama_cleaner.model.utils import torch_gc, get_scheduler
|
12 |
+
from lama_cleaner.schema import Config
|
13 |
+
|
14 |
+
|
15 |
+
class CPUTextEncoderWrapper:
|
16 |
+
def __init__(self, text_encoder, torch_dtype):
|
17 |
+
self.config = text_encoder.config
|
18 |
+
self.text_encoder = text_encoder.to(torch.device("cpu"), non_blocking=True)
|
19 |
+
self.text_encoder = self.text_encoder.to(torch.float32, non_blocking=True)
|
20 |
+
self.torch_dtype = torch_dtype
|
21 |
+
del text_encoder
|
22 |
+
torch_gc()
|
23 |
+
|
24 |
+
def __call__(self, x, **kwargs):
|
25 |
+
input_device = x.device
|
26 |
+
return [
|
27 |
+
self.text_encoder(x.to(self.text_encoder.device), **kwargs)[0]
|
28 |
+
.to(input_device)
|
29 |
+
.to(self.torch_dtype)
|
30 |
+
]
|
31 |
+
|
32 |
+
@property
|
33 |
+
def dtype(self):
|
34 |
+
return self.torch_dtype
|
35 |
+
|
36 |
+
|
37 |
+
NAMES_MAP = {
|
38 |
+
"sd1.5": "runwayml/stable-diffusion-inpainting",
|
39 |
+
"anything4": "Sanster/anything-4.0-inpainting",
|
40 |
+
"realisticVision1.4": "Sanster/Realistic_Vision_V1.4-inpainting",
|
41 |
+
}
|
42 |
+
|
43 |
+
NATIVE_NAMES_MAP = {
|
44 |
+
"sd1.5": "runwayml/stable-diffusion-v1-5",
|
45 |
+
"anything4": "andite/anything-v4.0",
|
46 |
+
"realisticVision1.4": "SG161222/Realistic_Vision_V1.4",
|
47 |
+
}
|
48 |
+
|
49 |
+
|
50 |
+
def make_inpaint_condition(image, image_mask):
|
51 |
+
"""
|
52 |
+
image: [H, W, C] RGB
|
53 |
+
mask: [H, W, 1] 255 means area to repaint
|
54 |
+
"""
|
55 |
+
image = image.astype(np.float32) / 255.0
|
56 |
+
image[image_mask[:, :, -1] > 128] = -1.0 # set as masked pixel
|
57 |
+
image = np.expand_dims(image, 0).transpose(0, 3, 1, 2)
|
58 |
+
image = torch.from_numpy(image)
|
59 |
+
return image
|
60 |
+
|
61 |
+
|
62 |
+
def load_from_local_model(
|
63 |
+
local_model_path, torch_dtype, controlnet, pipe_class, is_native_control_inpaint
|
64 |
+
):
|
65 |
+
from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
|
66 |
+
download_from_original_stable_diffusion_ckpt,
|
67 |
+
)
|
68 |
+
|
69 |
+
logger.info(f"Converting {local_model_path} to diffusers controlnet pipeline")
|
70 |
+
|
71 |
+
try:
|
72 |
+
pipe = download_from_original_stable_diffusion_ckpt(
|
73 |
+
local_model_path,
|
74 |
+
num_in_channels=4 if is_native_control_inpaint else 9,
|
75 |
+
from_safetensors=local_model_path.endswith("safetensors"),
|
76 |
+
device="cpu",
|
77 |
+
load_safety_checker=False,
|
78 |
+
)
|
79 |
+
except Exception as e:
|
80 |
+
err_msg = str(e)
|
81 |
+
logger.exception(e)
|
82 |
+
if is_native_control_inpaint and "[320, 9, 3, 3]" in err_msg:
|
83 |
+
logger.error(
|
84 |
+
"control_v11p_sd15_inpaint method requires normal SD model, not inpainting SD model"
|
85 |
+
)
|
86 |
+
if not is_native_control_inpaint and "[320, 4, 3, 3]" in err_msg:
|
87 |
+
logger.error(
|
88 |
+
f"{controlnet.config['_name_or_path']} method requires inpainting SD model, "
|
89 |
+
f"you can convert any SD model to inpainting model in AUTO1111: \n"
|
90 |
+
f"https://www.reddit.com/r/StableDiffusion/comments/zyi24j/how_to_turn_any_model_into_an_inpainting_model/"
|
91 |
+
)
|
92 |
+
exit(-1)
|
93 |
+
|
94 |
+
inpaint_pipe = pipe_class(
|
95 |
+
vae=pipe.vae,
|
96 |
+
text_encoder=pipe.text_encoder,
|
97 |
+
tokenizer=pipe.tokenizer,
|
98 |
+
unet=pipe.unet,
|
99 |
+
controlnet=controlnet,
|
100 |
+
scheduler=pipe.scheduler,
|
101 |
+
safety_checker=None,
|
102 |
+
feature_extractor=None,
|
103 |
+
requires_safety_checker=False,
|
104 |
+
)
|
105 |
+
|
106 |
+
del pipe
|
107 |
+
gc.collect()
|
108 |
+
return inpaint_pipe.to(torch_dtype=torch_dtype)
|
109 |
+
|
110 |
+
|
111 |
+
class ControlNet(DiffusionInpaintModel):
|
112 |
+
name = "controlnet"
|
113 |
+
pad_mod = 8
|
114 |
+
min_size = 512
|
115 |
+
|
116 |
+
def init_model(self, device: torch.device, **kwargs):
|
117 |
+
fp16 = not kwargs.get("no_half", False)
|
118 |
+
|
119 |
+
model_kwargs = {
|
120 |
+
"local_files_only": kwargs.get("local_files_only", kwargs["sd_run_local"])
|
121 |
+
}
|
122 |
+
if kwargs["disable_nsfw"] or kwargs.get("cpu_offload", False):
|
123 |
+
logger.info("Disable Stable Diffusion Model NSFW checker")
|
124 |
+
model_kwargs.update(
|
125 |
+
dict(
|
126 |
+
safety_checker=None,
|
127 |
+
feature_extractor=None,
|
128 |
+
requires_safety_checker=False,
|
129 |
+
)
|
130 |
+
)
|
131 |
+
|
132 |
+
use_gpu = device == torch.device("cuda") and torch.cuda.is_available()
|
133 |
+
torch_dtype = torch.float16 if use_gpu and fp16 else torch.float32
|
134 |
+
|
135 |
+
sd_controlnet_method = kwargs["sd_controlnet_method"]
|
136 |
+
self.sd_controlnet_method = sd_controlnet_method
|
137 |
+
|
138 |
+
if sd_controlnet_method == "control_v11p_sd15_inpaint":
|
139 |
+
from diffusers import StableDiffusionControlNetPipeline as PipeClass
|
140 |
+
|
141 |
+
self.is_native_control_inpaint = True
|
142 |
+
else:
|
143 |
+
from .pipeline import StableDiffusionControlNetInpaintPipeline as PipeClass
|
144 |
+
|
145 |
+
self.is_native_control_inpaint = False
|
146 |
+
|
147 |
+
if self.is_native_control_inpaint:
|
148 |
+
model_id = NATIVE_NAMES_MAP[kwargs["name"]]
|
149 |
+
else:
|
150 |
+
model_id = NAMES_MAP[kwargs["name"]]
|
151 |
+
|
152 |
+
controlnet = ControlNetModel.from_pretrained(
|
153 |
+
f"lllyasviel/{sd_controlnet_method}", torch_dtype=torch_dtype
|
154 |
+
)
|
155 |
+
self.is_local_sd_model = False
|
156 |
+
if kwargs.get("sd_local_model_path", None):
|
157 |
+
self.is_local_sd_model = True
|
158 |
+
self.model = load_from_local_model(
|
159 |
+
kwargs["sd_local_model_path"],
|
160 |
+
torch_dtype=torch_dtype,
|
161 |
+
controlnet=controlnet,
|
162 |
+
pipe_class=PipeClass,
|
163 |
+
is_native_control_inpaint=self.is_native_control_inpaint,
|
164 |
+
)
|
165 |
+
else:
|
166 |
+
self.model = PipeClass.from_pretrained(
|
167 |
+
model_id,
|
168 |
+
controlnet=controlnet,
|
169 |
+
revision="fp16" if use_gpu and fp16 else "main",
|
170 |
+
torch_dtype=torch_dtype,
|
171 |
+
**model_kwargs,
|
172 |
+
)
|
173 |
+
|
174 |
+
# https://huggingface.co/docs/diffusers/v0.7.0/en/api/pipelines/stable_diffusion#diffusers.StableDiffusionInpaintPipeline.enable_attention_slicing
|
175 |
+
self.model.enable_attention_slicing()
|
176 |
+
# https://huggingface.co/docs/diffusers/v0.7.0/en/optimization/fp16#memory-efficient-attention
|
177 |
+
if kwargs.get("enable_xformers", False):
|
178 |
+
self.model.enable_xformers_memory_efficient_attention()
|
179 |
+
|
180 |
+
if kwargs.get("cpu_offload", False) and use_gpu:
|
181 |
+
logger.info("Enable sequential cpu offload")
|
182 |
+
self.model.enable_sequential_cpu_offload(gpu_id=0)
|
183 |
+
else:
|
184 |
+
self.model = self.model.to(device)
|
185 |
+
if kwargs["sd_cpu_textencoder"]:
|
186 |
+
logger.info("Run Stable Diffusion TextEncoder on CPU")
|
187 |
+
self.model.text_encoder = CPUTextEncoderWrapper(
|
188 |
+
self.model.text_encoder, torch_dtype
|
189 |
+
)
|
190 |
+
|
191 |
+
self.callback = kwargs.pop("callback", None)
|
192 |
+
|
193 |
+
def forward(self, image, mask, config: Config):
|
194 |
+
"""Input image and output image have same size
|
195 |
+
image: [H, W, C] RGB
|
196 |
+
mask: [H, W, 1] 255 means area to repaint
|
197 |
+
return: BGR IMAGE
|
198 |
+
"""
|
199 |
+
scheduler_config = self.model.scheduler.config
|
200 |
+
scheduler = get_scheduler(config.sd_sampler, scheduler_config)
|
201 |
+
self.model.scheduler = scheduler
|
202 |
+
|
203 |
+
if config.sd_mask_blur != 0:
|
204 |
+
k = 2 * config.sd_mask_blur + 1
|
205 |
+
mask = cv2.GaussianBlur(mask, (k, k), 0)[:, :, np.newaxis]
|
206 |
+
|
207 |
+
img_h, img_w = image.shape[:2]
|
208 |
+
|
209 |
+
if self.is_native_control_inpaint:
|
210 |
+
control_image = make_inpaint_condition(image, mask)
|
211 |
+
output = self.model(
|
212 |
+
prompt=config.prompt,
|
213 |
+
image=control_image,
|
214 |
+
height=img_h,
|
215 |
+
width=img_w,
|
216 |
+
num_inference_steps=config.sd_steps,
|
217 |
+
guidance_scale=config.sd_guidance_scale,
|
218 |
+
controlnet_conditioning_scale=config.controlnet_conditioning_scale,
|
219 |
+
negative_prompt=config.negative_prompt,
|
220 |
+
generator=torch.manual_seed(config.sd_seed),
|
221 |
+
output_type="np.array",
|
222 |
+
callback=self.callback,
|
223 |
+
).images[0]
|
224 |
+
else:
|
225 |
+
if "canny" in self.sd_controlnet_method:
|
226 |
+
canny_image = cv2.Canny(image, 100, 200)
|
227 |
+
canny_image = canny_image[:, :, None]
|
228 |
+
canny_image = np.concatenate(
|
229 |
+
[canny_image, canny_image, canny_image], axis=2
|
230 |
+
)
|
231 |
+
canny_image = PIL.Image.fromarray(canny_image)
|
232 |
+
control_image = canny_image
|
233 |
+
elif "openpose" in self.sd_controlnet_method:
|
234 |
+
from controlnet_aux import OpenposeDetector
|
235 |
+
|
236 |
+
processor = OpenposeDetector.from_pretrained("lllyasviel/ControlNet")
|
237 |
+
control_image = processor(image, hand_and_face=True)
|
238 |
+
elif "depth" in self.sd_controlnet_method:
|
239 |
+
from transformers import pipeline
|
240 |
+
|
241 |
+
depth_estimator = pipeline("depth-estimation")
|
242 |
+
depth_image = depth_estimator(PIL.Image.fromarray(image))["depth"]
|
243 |
+
depth_image = np.array(depth_image)
|
244 |
+
depth_image = depth_image[:, :, None]
|
245 |
+
depth_image = np.concatenate(
|
246 |
+
[depth_image, depth_image, depth_image], axis=2
|
247 |
+
)
|
248 |
+
control_image = PIL.Image.fromarray(depth_image)
|
249 |
+
else:
|
250 |
+
raise NotImplementedError(
|
251 |
+
f"{self.sd_controlnet_method} not implemented"
|
252 |
+
)
|
253 |
+
|
254 |
+
mask_image = PIL.Image.fromarray(mask[:, :, -1], mode="L")
|
255 |
+
image = PIL.Image.fromarray(image)
|
256 |
+
|
257 |
+
output = self.model(
|
258 |
+
image=image,
|
259 |
+
control_image=control_image,
|
260 |
+
prompt=config.prompt,
|
261 |
+
negative_prompt=config.negative_prompt,
|
262 |
+
mask_image=mask_image,
|
263 |
+
num_inference_steps=config.sd_steps,
|
264 |
+
guidance_scale=config.sd_guidance_scale,
|
265 |
+
output_type="np.array",
|
266 |
+
callback=self.callback,
|
267 |
+
height=img_h,
|
268 |
+
width=img_w,
|
269 |
+
generator=torch.manual_seed(config.sd_seed),
|
270 |
+
controlnet_conditioning_scale=config.controlnet_conditioning_scale,
|
271 |
+
).images[0]
|
272 |
+
|
273 |
+
output = (output * 255).round().astype("uint8")
|
274 |
+
output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
|
275 |
+
return output
|
276 |
+
|
277 |
+
def forward_post_process(self, result, image, mask, config):
|
278 |
+
if config.sd_match_histograms:
|
279 |
+
result = self._match_histograms(result, image[:, :, ::-1], mask)
|
280 |
+
|
281 |
+
if config.sd_mask_blur != 0:
|
282 |
+
k = 2 * config.sd_mask_blur + 1
|
283 |
+
mask = cv2.GaussianBlur(mask, (k, k), 0)
|
284 |
+
return result, image, mask
|
285 |
+
|
286 |
+
@staticmethod
|
287 |
+
def is_downloaded() -> bool:
|
288 |
+
# model will be downloaded when app start, and can't switch in frontend settings
|
289 |
+
return True
|
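The mask feathering used above keeps the Gaussian kernel size odd (k = 2 * sd_mask_blur + 1), which cv2.GaussianBlur requires. Standalone illustration with toy values:

import cv2
import numpy as np

sd_mask_blur = 4
k = 2 * sd_mask_blur + 1          # always odd, here 9
mask = np.zeros((64, 64), dtype=np.uint8)
mask[16:48, 16:48] = 255
soft = cv2.GaussianBlur(mask, (k, k), 0)[:, :, np.newaxis]
print(soft.shape)                 # (64, 64, 1)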
lama_cleaner/model/ddim_sampler.py
ADDED
@@ -0,0 +1,193 @@
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
from tqdm import tqdm
|
4 |
+
|
5 |
+
from lama_cleaner.model.utils import make_ddim_timesteps, make_ddim_sampling_parameters, noise_like
|
6 |
+
|
7 |
+
from loguru import logger
|
8 |
+
|
9 |
+
|
10 |
+
class DDIMSampler(object):
|
11 |
+
def __init__(self, model, schedule="linear"):
|
12 |
+
super().__init__()
|
13 |
+
self.model = model
|
14 |
+
self.ddpm_num_timesteps = model.num_timesteps
|
15 |
+
self.schedule = schedule
|
16 |
+
|
17 |
+
def register_buffer(self, name, attr):
|
18 |
+
setattr(self, name, attr)
|
19 |
+
|
20 |
+
def make_schedule(
|
21 |
+
self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0.0, verbose=True
|
22 |
+
):
|
23 |
+
self.ddim_timesteps = make_ddim_timesteps(
|
24 |
+
ddim_discr_method=ddim_discretize,
|
25 |
+
num_ddim_timesteps=ddim_num_steps,
|
26 |
+
# array([1])
|
27 |
+
num_ddpm_timesteps=self.ddpm_num_timesteps,
|
28 |
+
verbose=verbose,
|
29 |
+
)
|
30 |
+
alphas_cumprod = self.model.alphas_cumprod # torch.Size([1000])
|
31 |
+
assert (
|
32 |
+
alphas_cumprod.shape[0] == self.ddpm_num_timesteps
|
33 |
+
), "alphas have to be defined for each timestep"
|
34 |
+
to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
|
35 |
+
|
36 |
+
self.register_buffer("betas", to_torch(self.model.betas))
|
37 |
+
self.register_buffer("alphas_cumprod", to_torch(alphas_cumprod))
|
38 |
+
self.register_buffer(
|
39 |
+
"alphas_cumprod_prev", to_torch(self.model.alphas_cumprod_prev)
|
40 |
+
)
|
41 |
+
|
42 |
+
# calculations for diffusion q(x_t | x_{t-1}) and others
|
43 |
+
self.register_buffer(
|
44 |
+
"sqrt_alphas_cumprod", to_torch(np.sqrt(alphas_cumprod.cpu()))
|
45 |
+
)
|
46 |
+
self.register_buffer(
|
47 |
+
"sqrt_one_minus_alphas_cumprod",
|
48 |
+
to_torch(np.sqrt(1.0 - alphas_cumprod.cpu())),
|
49 |
+
)
|
50 |
+
self.register_buffer(
|
51 |
+
"log_one_minus_alphas_cumprod", to_torch(np.log(1.0 - alphas_cumprod.cpu()))
|
52 |
+
)
|
53 |
+
self.register_buffer(
|
54 |
+
"sqrt_recip_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod.cpu()))
|
55 |
+
)
|
56 |
+
self.register_buffer(
|
57 |
+
"sqrt_recipm1_alphas_cumprod",
|
58 |
+
to_torch(np.sqrt(1.0 / alphas_cumprod.cpu() - 1)),
|
59 |
+
)
|
60 |
+
|
61 |
+
# ddim sampling parameters
|
62 |
+
ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(
|
63 |
+
alphacums=alphas_cumprod.cpu(),
|
64 |
+
ddim_timesteps=self.ddim_timesteps,
|
65 |
+
eta=ddim_eta,
|
66 |
+
verbose=verbose,
|
67 |
+
)
|
68 |
+
self.register_buffer("ddim_sigmas", ddim_sigmas)
|
69 |
+
self.register_buffer("ddim_alphas", ddim_alphas)
|
70 |
+
self.register_buffer("ddim_alphas_prev", ddim_alphas_prev)
|
71 |
+
self.register_buffer("ddim_sqrt_one_minus_alphas", np.sqrt(1.0 - ddim_alphas))
|
72 |
+
sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
|
73 |
+
(1 - self.alphas_cumprod_prev)
|
74 |
+
/ (1 - self.alphas_cumprod)
|
75 |
+
* (1 - self.alphas_cumprod / self.alphas_cumprod_prev)
|
76 |
+
)
|
77 |
+
self.register_buffer(
|
78 |
+
"ddim_sigmas_for_original_num_steps", sigmas_for_original_sampling_steps
|
79 |
+
)
|
80 |
+
|
81 |
+
@torch.no_grad()
|
82 |
+
def sample(self, steps, conditioning, batch_size, shape):
|
83 |
+
self.make_schedule(ddim_num_steps=steps, ddim_eta=0, verbose=False)
|
84 |
+
# sampling
|
85 |
+
C, H, W = shape
|
86 |
+
size = (batch_size, C, H, W)
|
87 |
+
|
88 |
+
# samples: 1,3,128,128
|
89 |
+
return self.ddim_sampling(
|
90 |
+
conditioning,
|
91 |
+
size,
|
92 |
+
quantize_denoised=False,
|
93 |
+
ddim_use_original_steps=False,
|
94 |
+
noise_dropout=0,
|
95 |
+
temperature=1.0,
|
96 |
+
)
|
97 |
+
|
98 |
+
@torch.no_grad()
|
99 |
+
def ddim_sampling(
|
100 |
+
self,
|
101 |
+
cond,
|
102 |
+
shape,
|
103 |
+
ddim_use_original_steps=False,
|
104 |
+
quantize_denoised=False,
|
105 |
+
temperature=1.0,
|
106 |
+
noise_dropout=0.0,
|
107 |
+
):
|
108 |
+
device = self.model.betas.device
|
109 |
+
b = shape[0]
|
110 |
+
img = torch.randn(shape, device=device, dtype=cond.dtype)
|
111 |
+
timesteps = (
|
112 |
+
self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
|
113 |
+
)
|
114 |
+
|
115 |
+
time_range = (
|
116 |
+
reversed(range(0, timesteps))
|
117 |
+
if ddim_use_original_steps
|
118 |
+
else np.flip(timesteps)
|
119 |
+
)
|
120 |
+
total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
|
121 |
+
logger.info(f"Running DDIM Sampling with {total_steps} timesteps")
|
122 |
+
|
123 |
+
iterator = tqdm(time_range, desc="DDIM Sampler", total=total_steps)
|
124 |
+
|
125 |
+
for i, step in enumerate(iterator):
|
126 |
+
index = total_steps - i - 1
|
127 |
+
ts = torch.full((b,), step, device=device, dtype=torch.long)
|
128 |
+
|
129 |
+
outs = self.p_sample_ddim(
|
130 |
+
img,
|
131 |
+
cond,
|
132 |
+
ts,
|
133 |
+
index=index,
|
134 |
+
use_original_steps=ddim_use_original_steps,
|
135 |
+
quantize_denoised=quantize_denoised,
|
136 |
+
temperature=temperature,
|
137 |
+
noise_dropout=noise_dropout,
|
138 |
+
)
|
139 |
+
img, _ = outs
|
140 |
+
|
141 |
+
return img
|
142 |
+
|
143 |
+
@torch.no_grad()
|
144 |
+
def p_sample_ddim(
|
145 |
+
self,
|
146 |
+
x,
|
147 |
+
c,
|
148 |
+
t,
|
149 |
+
index,
|
150 |
+
repeat_noise=False,
|
151 |
+
use_original_steps=False,
|
152 |
+
quantize_denoised=False,
|
153 |
+
temperature=1.0,
|
154 |
+
noise_dropout=0.0,
|
155 |
+
):
|
156 |
+
b, *_, device = *x.shape, x.device
|
157 |
+
e_t = self.model.apply_model(x, t, c)
|
158 |
+
|
159 |
+
alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
|
160 |
+
alphas_prev = (
|
161 |
+
self.model.alphas_cumprod_prev
|
162 |
+
if use_original_steps
|
163 |
+
else self.ddim_alphas_prev
|
164 |
+
)
|
165 |
+
sqrt_one_minus_alphas = (
|
166 |
+
self.model.sqrt_one_minus_alphas_cumprod
|
167 |
+
if use_original_steps
|
168 |
+
else self.ddim_sqrt_one_minus_alphas
|
169 |
+
)
|
170 |
+
sigmas = (
|
171 |
+
self.model.ddim_sigmas_for_original_num_steps
|
172 |
+
if use_original_steps
|
173 |
+
else self.ddim_sigmas
|
174 |
+
)
|
175 |
+
# select parameters corresponding to the currently considered timestep
|
176 |
+
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
|
177 |
+
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
|
178 |
+
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
|
179 |
+
sqrt_one_minus_at = torch.full(
|
180 |
+
(b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device
|
181 |
+
)
|
182 |
+
|
183 |
+
# current prediction for x_0
|
184 |
+
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
|
185 |
+
if quantize_denoised: # 没用
|
186 |
+
pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
|
187 |
+
# direction pointing to x_t
|
188 |
+
dir_xt = (1.0 - a_prev - sigma_t ** 2).sqrt() * e_t
|
189 |
+
noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
|
190 |
+
if noise_dropout > 0.0: # 没用
|
191 |
+
noise = torch.nn.functional.dropout(noise, p=noise_dropout)
|
192 |
+
x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
|
193 |
+
return x_prev, pred_x0
|
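The p_sample_ddim step above follows the standard DDIM update; with eta = 0 the sigma term vanishes and the step is deterministic. A standalone sketch of that arithmetic on toy tensors (the alpha values here are made up for illustration):

import torch

x = torch.randn(1, 3, 8, 8)
e_t = torch.randn_like(x)                    # stand-in for the model's noise prediction
a_t, a_prev = torch.tensor(0.9), torch.tensor(0.95)

pred_x0 = (x - (1 - a_t).sqrt() * e_t) / a_t.sqrt()
dir_xt = (1.0 - a_prev).sqrt() * e_t         # sigma_t == 0 when eta == 0
x_prev = a_prev.sqrt() * pred_x0 + dir_xt
print(x_prev.shape)                          # torch.Size([1, 3, 8, 8])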
lama_cleaner/model/fcf.py
ADDED
@@ -0,0 +1,1733 @@
1 |
+
import os
|
2 |
+
import random
|
3 |
+
|
4 |
+
import cv2
|
5 |
+
import torch
|
6 |
+
import numpy as np
|
7 |
+
import torch.fft as fft
|
8 |
+
|
9 |
+
from lama_cleaner.schema import Config
|
10 |
+
|
11 |
+
from lama_cleaner.helper import (
|
12 |
+
load_model,
|
13 |
+
get_cache_path_by_url,
|
14 |
+
norm_img,
|
15 |
+
boxes_from_mask,
|
16 |
+
resize_max_size,
|
17 |
+
)
|
18 |
+
from lama_cleaner.model.base import InpaintModel
|
19 |
+
from torch import conv2d, nn
|
20 |
+
import torch.nn.functional as F
|
21 |
+
|
22 |
+
from lama_cleaner.model.utils import (
|
23 |
+
setup_filter,
|
24 |
+
_parse_scaling,
|
25 |
+
_parse_padding,
|
26 |
+
Conv2dLayer,
|
27 |
+
FullyConnectedLayer,
|
28 |
+
MinibatchStdLayer,
|
29 |
+
activation_funcs,
|
30 |
+
conv2d_resample,
|
31 |
+
bias_act,
|
32 |
+
upsample2d,
|
33 |
+
normalize_2nd_moment,
|
34 |
+
downsample2d,
|
35 |
+
)
|
36 |
+
|
37 |
+
|
38 |
+
def upfirdn2d(x, f, up=1, down=1, padding=0, flip_filter=False, gain=1, impl="cuda"):
|
39 |
+
assert isinstance(x, torch.Tensor)
|
40 |
+
return _upfirdn2d_ref(
|
41 |
+
x, f, up=up, down=down, padding=padding, flip_filter=flip_filter, gain=gain
|
42 |
+
)
|
43 |
+
|
44 |
+
|
45 |
+
def _upfirdn2d_ref(x, f, up=1, down=1, padding=0, flip_filter=False, gain=1):
|
46 |
+
"""Slow reference implementation of `upfirdn2d()` using standard PyTorch ops."""
|
47 |
+
# Validate arguments.
|
48 |
+
assert isinstance(x, torch.Tensor) and x.ndim == 4
|
49 |
+
if f is None:
|
50 |
+
f = torch.ones([1, 1], dtype=torch.float32, device=x.device)
|
51 |
+
assert isinstance(f, torch.Tensor) and f.ndim in [1, 2]
|
52 |
+
assert f.dtype == torch.float32 and not f.requires_grad
|
53 |
+
batch_size, num_channels, in_height, in_width = x.shape
|
54 |
+
upx, upy = _parse_scaling(up)
|
55 |
+
downx, downy = _parse_scaling(down)
|
56 |
+
padx0, padx1, pady0, pady1 = _parse_padding(padding)
|
57 |
+
|
58 |
+
# Upsample by inserting zeros.
|
59 |
+
x = x.reshape([batch_size, num_channels, in_height, 1, in_width, 1])
|
60 |
+
x = torch.nn.functional.pad(x, [0, upx - 1, 0, 0, 0, upy - 1])
|
61 |
+
x = x.reshape([batch_size, num_channels, in_height * upy, in_width * upx])
|
62 |
+
|
63 |
+
# Pad or crop.
|
64 |
+
x = torch.nn.functional.pad(
|
65 |
+
x, [max(padx0, 0), max(padx1, 0), max(pady0, 0), max(pady1, 0)]
|
66 |
+
)
|
67 |
+
x = x[
|
68 |
+
:,
|
69 |
+
:,
|
70 |
+
max(-pady0, 0) : x.shape[2] - max(-pady1, 0),
|
71 |
+
max(-padx0, 0) : x.shape[3] - max(-padx1, 0),
|
72 |
+
]
|
73 |
+
|
74 |
+
# Setup filter.
|
75 |
+
f = f * (gain ** (f.ndim / 2))
|
76 |
+
f = f.to(x.dtype)
|
77 |
+
if not flip_filter:
|
78 |
+
f = f.flip(list(range(f.ndim)))
|
79 |
+
|
80 |
+
# Convolve with the filter.
|
81 |
+
f = f[np.newaxis, np.newaxis].repeat([num_channels, 1] + [1] * f.ndim)
|
82 |
+
if f.ndim == 4:
|
83 |
+
x = conv2d(input=x, weight=f, groups=num_channels)
|
84 |
+
else:
|
85 |
+
x = conv2d(input=x, weight=f.unsqueeze(2), groups=num_channels)
|
86 |
+
x = conv2d(input=x, weight=f.unsqueeze(3), groups=num_channels)
|
87 |
+
|
88 |
+
# Downsample by throwing away pixels.
|
89 |
+
x = x[:, :, ::downy, ::downx]
|
90 |
+
return x
|
91 |
+
|
92 |
+
|
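# --- Added usage sketch (not part of the original file; shapes are assumptions) ---
# upfirdn2d()/_upfirdn2d_ref() above fuse three steps: zero-insertion upsampling,
# FIR filtering with `f`, and strided downsampling. A minimal way to exercise it
# with the same [1, 3, 3, 1] low-pass filter used by the blocks below:
def _upfirdn2d_usage_sketch():
    x = torch.randn(1, 3, 16, 16)       # dummy [N, C, H, W] input
    f = setup_filter([1, 3, 3, 1])      # normalized 4x4 low-pass filter
    # 2x upsample + filter; with padding=1 the output is close to (not exactly)
    # twice the input resolution, and gain=4 compensates for the inserted zeros.
    return upfirdn2d(x, f, up=2, padding=1, gain=4)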
93 |
+
class EncoderEpilogue(torch.nn.Module):
|
94 |
+
def __init__(
|
95 |
+
self,
|
96 |
+
in_channels, # Number of input channels.
|
97 |
+
cmap_dim, # Dimensionality of mapped conditioning label, 0 = no label.
|
98 |
+
z_dim, # Output Latent (Z) dimensionality.
|
99 |
+
resolution, # Resolution of this block.
|
100 |
+
img_channels, # Number of input color channels.
|
101 |
+
architecture="resnet", # Architecture: 'orig', 'skip', 'resnet'.
|
102 |
+
mbstd_group_size=4, # Group size for the minibatch standard deviation layer, None = entire minibatch.
|
103 |
+
mbstd_num_channels=1, # Number of features for the minibatch standard deviation layer, 0 = disable.
|
104 |
+
activation="lrelu", # Activation function: 'relu', 'lrelu', etc.
|
105 |
+
conv_clamp=None, # Clamp the output of convolution layers to +-X, None = disable clamping.
|
106 |
+
):
|
107 |
+
assert architecture in ["orig", "skip", "resnet"]
|
108 |
+
super().__init__()
|
109 |
+
self.in_channels = in_channels
|
110 |
+
self.cmap_dim = cmap_dim
|
111 |
+
self.resolution = resolution
|
112 |
+
self.img_channels = img_channels
|
113 |
+
self.architecture = architecture
|
114 |
+
|
115 |
+
if architecture == "skip":
|
116 |
+
self.fromrgb = Conv2dLayer(
|
117 |
+
self.img_channels, in_channels, kernel_size=1, activation=activation
|
118 |
+
)
|
119 |
+
self.mbstd = (
|
120 |
+
MinibatchStdLayer(
|
121 |
+
group_size=mbstd_group_size, num_channels=mbstd_num_channels
|
122 |
+
)
|
123 |
+
if mbstd_num_channels > 0
|
124 |
+
else None
|
125 |
+
)
|
126 |
+
self.conv = Conv2dLayer(
|
127 |
+
in_channels + mbstd_num_channels,
|
128 |
+
in_channels,
|
129 |
+
kernel_size=3,
|
130 |
+
activation=activation,
|
131 |
+
conv_clamp=conv_clamp,
|
132 |
+
)
|
133 |
+
self.fc = FullyConnectedLayer(
|
134 |
+
in_channels * (resolution**2), z_dim, activation=activation
|
135 |
+
)
|
136 |
+
self.dropout = torch.nn.Dropout(p=0.5)
|
137 |
+
|
138 |
+
def forward(self, x, cmap, force_fp32=False):
|
139 |
+
_ = force_fp32 # unused
|
140 |
+
dtype = torch.float32
|
141 |
+
memory_format = torch.contiguous_format
|
142 |
+
|
143 |
+
# FromRGB.
|
144 |
+
x = x.to(dtype=dtype, memory_format=memory_format)
|
145 |
+
|
146 |
+
# Main layers.
|
147 |
+
if self.mbstd is not None:
|
148 |
+
x = self.mbstd(x)
|
149 |
+
const_e = self.conv(x)
|
150 |
+
x = self.fc(const_e.flatten(1))
|
151 |
+
x = self.dropout(x)
|
152 |
+
|
153 |
+
# Conditioning.
|
154 |
+
if self.cmap_dim > 0:
|
155 |
+
x = (x * cmap).sum(dim=1, keepdim=True) * (1 / np.sqrt(self.cmap_dim))
|
156 |
+
|
157 |
+
assert x.dtype == dtype
|
158 |
+
return x, const_e
|
159 |
+
|
160 |
+
|
161 |
+
class EncoderBlock(torch.nn.Module):
|
162 |
+
def __init__(
|
163 |
+
self,
|
164 |
+
in_channels, # Number of input channels, 0 = first block.
|
165 |
+
tmp_channels, # Number of intermediate channels.
|
166 |
+
out_channels, # Number of output channels.
|
167 |
+
resolution, # Resolution of this block.
|
168 |
+
img_channels, # Number of input color channels.
|
169 |
+
first_layer_idx, # Index of the first layer.
|
170 |
+
architecture="skip", # Architecture: 'orig', 'skip', 'resnet'.
|
171 |
+
activation="lrelu", # Activation function: 'relu', 'lrelu', etc.
|
172 |
+
resample_filter=[
|
173 |
+
1,
|
174 |
+
3,
|
175 |
+
3,
|
176 |
+
1,
|
177 |
+
], # Low-pass filter to apply when resampling activations.
|
178 |
+
conv_clamp=None, # Clamp the output of convolution layers to +-X, None = disable clamping.
|
179 |
+
use_fp16=False, # Use FP16 for this block?
|
180 |
+
fp16_channels_last=False, # Use channels-last memory format with FP16?
|
181 |
+
freeze_layers=0, # Freeze-D: Number of layers to freeze.
|
182 |
+
):
|
183 |
+
assert in_channels in [0, tmp_channels]
|
184 |
+
assert architecture in ["orig", "skip", "resnet"]
|
185 |
+
super().__init__()
|
186 |
+
self.in_channels = in_channels
|
187 |
+
self.resolution = resolution
|
188 |
+
self.img_channels = img_channels + 1
|
189 |
+
self.first_layer_idx = first_layer_idx
|
190 |
+
self.architecture = architecture
|
191 |
+
self.use_fp16 = use_fp16
|
192 |
+
self.channels_last = use_fp16 and fp16_channels_last
|
193 |
+
self.register_buffer("resample_filter", setup_filter(resample_filter))
|
194 |
+
|
195 |
+
self.num_layers = 0
|
196 |
+
|
197 |
+
def trainable_gen():
|
198 |
+
while True:
|
199 |
+
layer_idx = self.first_layer_idx + self.num_layers
|
200 |
+
trainable = layer_idx >= freeze_layers
|
201 |
+
self.num_layers += 1
|
202 |
+
yield trainable
|
203 |
+
|
204 |
+
trainable_iter = trainable_gen()
|
205 |
+
|
206 |
+
if in_channels == 0:
|
207 |
+
self.fromrgb = Conv2dLayer(
|
208 |
+
self.img_channels,
|
209 |
+
tmp_channels,
|
210 |
+
kernel_size=1,
|
211 |
+
activation=activation,
|
212 |
+
trainable=next(trainable_iter),
|
213 |
+
conv_clamp=conv_clamp,
|
214 |
+
channels_last=self.channels_last,
|
215 |
+
)
|
216 |
+
|
217 |
+
self.conv0 = Conv2dLayer(
|
218 |
+
tmp_channels,
|
219 |
+
tmp_channels,
|
220 |
+
kernel_size=3,
|
221 |
+
activation=activation,
|
222 |
+
trainable=next(trainable_iter),
|
223 |
+
conv_clamp=conv_clamp,
|
224 |
+
channels_last=self.channels_last,
|
225 |
+
)
|
226 |
+
|
227 |
+
self.conv1 = Conv2dLayer(
|
228 |
+
tmp_channels,
|
229 |
+
out_channels,
|
230 |
+
kernel_size=3,
|
231 |
+
activation=activation,
|
232 |
+
down=2,
|
233 |
+
trainable=next(trainable_iter),
|
234 |
+
resample_filter=resample_filter,
|
235 |
+
conv_clamp=conv_clamp,
|
236 |
+
channels_last=self.channels_last,
|
237 |
+
)
|
238 |
+
|
239 |
+
if architecture == "resnet":
|
240 |
+
self.skip = Conv2dLayer(
|
241 |
+
tmp_channels,
|
242 |
+
out_channels,
|
243 |
+
kernel_size=1,
|
244 |
+
bias=False,
|
245 |
+
down=2,
|
246 |
+
trainable=next(trainable_iter),
|
247 |
+
resample_filter=resample_filter,
|
248 |
+
channels_last=self.channels_last,
|
249 |
+
)
|
250 |
+
|
251 |
+
def forward(self, x, img, force_fp32=False):
|
252 |
+
# dtype = torch.float16 if self.use_fp16 and not force_fp32 else torch.float32
|
253 |
+
dtype = torch.float32
|
254 |
+
memory_format = (
|
255 |
+
torch.channels_last
|
256 |
+
if self.channels_last and not force_fp32
|
257 |
+
else torch.contiguous_format
|
258 |
+
)
|
259 |
+
|
260 |
+
# Input.
|
261 |
+
if x is not None:
|
262 |
+
x = x.to(dtype=dtype, memory_format=memory_format)
|
263 |
+
|
264 |
+
# FromRGB.
|
265 |
+
if self.in_channels == 0:
|
266 |
+
img = img.to(dtype=dtype, memory_format=memory_format)
|
267 |
+
y = self.fromrgb(img)
|
268 |
+
x = x + y if x is not None else y
|
269 |
+
img = (
|
270 |
+
downsample2d(img, self.resample_filter)
|
271 |
+
if self.architecture == "skip"
|
272 |
+
else None
|
273 |
+
)
|
274 |
+
|
275 |
+
# Main layers.
|
276 |
+
if self.architecture == "resnet":
|
277 |
+
y = self.skip(x, gain=np.sqrt(0.5))
|
278 |
+
x = self.conv0(x)
|
279 |
+
feat = x.clone()
|
280 |
+
x = self.conv1(x, gain=np.sqrt(0.5))
|
281 |
+
x = y.add_(x)
|
282 |
+
else:
|
283 |
+
x = self.conv0(x)
|
284 |
+
feat = x.clone()
|
285 |
+
x = self.conv1(x)
|
286 |
+
|
287 |
+
assert x.dtype == dtype
|
288 |
+
return x, img, feat
|
289 |
+
|
290 |
+
|
291 |
+
class EncoderNetwork(torch.nn.Module):
|
292 |
+
def __init__(
|
293 |
+
self,
|
294 |
+
c_dim, # Conditioning label (C) dimensionality.
|
295 |
+
z_dim, # Input latent (Z) dimensionality.
|
296 |
+
img_resolution, # Input resolution.
|
297 |
+
img_channels, # Number of input color channels.
|
298 |
+
architecture="orig", # Architecture: 'orig', 'skip', 'resnet'.
|
299 |
+
channel_base=16384, # Overall multiplier for the number of channels.
|
300 |
+
channel_max=512, # Maximum number of channels in any layer.
|
301 |
+
num_fp16_res=0, # Use FP16 for the N highest resolutions.
|
302 |
+
conv_clamp=None, # Clamp the output of convolution layers to +-X, None = disable clamping.
|
303 |
+
cmap_dim=None, # Dimensionality of mapped conditioning label, None = default.
|
304 |
+
block_kwargs={}, # Arguments for DiscriminatorBlock.
|
305 |
+
mapping_kwargs={}, # Arguments for MappingNetwork.
|
306 |
+
epilogue_kwargs={}, # Arguments for EncoderEpilogue.
|
307 |
+
):
|
308 |
+
super().__init__()
|
309 |
+
self.c_dim = c_dim
|
310 |
+
self.z_dim = z_dim
|
311 |
+
self.img_resolution = img_resolution
|
312 |
+
self.img_resolution_log2 = int(np.log2(img_resolution))
|
313 |
+
self.img_channels = img_channels
|
314 |
+
self.block_resolutions = [
|
315 |
+
2**i for i in range(self.img_resolution_log2, 2, -1)
|
316 |
+
]
|
317 |
+
channels_dict = {
|
318 |
+
res: min(channel_base // res, channel_max)
|
319 |
+
for res in self.block_resolutions + [4]
|
320 |
+
}
|
321 |
+
fp16_resolution = max(2 ** (self.img_resolution_log2 + 1 - num_fp16_res), 8)
|
322 |
+
|
323 |
+
if cmap_dim is None:
|
324 |
+
cmap_dim = channels_dict[4]
|
325 |
+
if c_dim == 0:
|
326 |
+
cmap_dim = 0
|
327 |
+
|
328 |
+
common_kwargs = dict(
|
329 |
+
img_channels=img_channels, architecture=architecture, conv_clamp=conv_clamp
|
330 |
+
)
|
331 |
+
cur_layer_idx = 0
|
332 |
+
for res in self.block_resolutions:
|
333 |
+
in_channels = channels_dict[res] if res < img_resolution else 0
|
334 |
+
tmp_channels = channels_dict[res]
|
335 |
+
out_channels = channels_dict[res // 2]
|
336 |
+
use_fp16 = res >= fp16_resolution
|
337 |
+
use_fp16 = False  # overrides the line above: encoder blocks are always built in FP32
|
338 |
+
block = EncoderBlock(
|
339 |
+
in_channels,
|
340 |
+
tmp_channels,
|
341 |
+
out_channels,
|
342 |
+
resolution=res,
|
343 |
+
first_layer_idx=cur_layer_idx,
|
344 |
+
use_fp16=use_fp16,
|
345 |
+
**block_kwargs,
|
346 |
+
**common_kwargs,
|
347 |
+
)
|
348 |
+
setattr(self, f"b{res}", block)
|
349 |
+
cur_layer_idx += block.num_layers
|
350 |
+
if c_dim > 0:
|
351 |
+
self.mapping = MappingNetwork(
|
352 |
+
z_dim=0,
|
353 |
+
c_dim=c_dim,
|
354 |
+
w_dim=cmap_dim,
|
355 |
+
num_ws=None,
|
356 |
+
w_avg_beta=None,
|
357 |
+
**mapping_kwargs,
|
358 |
+
)
|
359 |
+
self.b4 = EncoderEpilogue(
|
360 |
+
channels_dict[4],
|
361 |
+
cmap_dim=cmap_dim,
|
362 |
+
z_dim=z_dim * 2,
|
363 |
+
resolution=4,
|
364 |
+
**epilogue_kwargs,
|
365 |
+
**common_kwargs,
|
366 |
+
)
|
367 |
+
|
368 |
+
def forward(self, img, c, **block_kwargs):
|
369 |
+
x = None
|
370 |
+
feats = {}
|
371 |
+
for res in self.block_resolutions:
|
372 |
+
block = getattr(self, f"b{res}")
|
373 |
+
x, img, feat = block(x, img, **block_kwargs)
|
374 |
+
feats[res] = feat
|
375 |
+
|
376 |
+
cmap = None
|
377 |
+
if self.c_dim > 0:
|
378 |
+
cmap = self.mapping(None, c)
|
379 |
+
x, const_e = self.b4(x, cmap)
|
380 |
+
feats[4] = const_e
|
381 |
+
|
382 |
+
B, _ = x.shape
|
383 |
+
z = torch.zeros(
|
384 |
+
(B, self.z_dim), requires_grad=False, dtype=x.dtype, device=x.device
|
385 |
+
) ## Noise for Co-Modulation
|
386 |
+
return x, z, feats
|
387 |
+
|
388 |
+
|
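# --- Added note (not part of the original file) ---
# EncoderNetwork.forward() returns the three pieces the decoder side consumes:
#   x_global : bottleneck code from EncoderEpilogue, later concatenated with ws to
#              co-modulate every SynthesisLayer,
#   z        : an all-zero [B, z_dim] tensor (the stochastic latent is disabled),
#   feats    : dict mapping each resolution (4, 8, ..., img_resolution) to the
#              encoder feature map reused as a skip connection in SynthesisBlock.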
389 |
+
def fma(a, b, c): # => a * b + c
|
390 |
+
return _FusedMultiplyAdd.apply(a, b, c)
|
391 |
+
|
392 |
+
|
393 |
+
class _FusedMultiplyAdd(torch.autograd.Function): # a * b + c
|
394 |
+
@staticmethod
|
395 |
+
def forward(ctx, a, b, c): # pylint: disable=arguments-differ
|
396 |
+
out = torch.addcmul(c, a, b)
|
397 |
+
ctx.save_for_backward(a, b)
|
398 |
+
ctx.c_shape = c.shape
|
399 |
+
return out
|
400 |
+
|
401 |
+
@staticmethod
|
402 |
+
def backward(ctx, dout): # pylint: disable=arguments-differ
|
403 |
+
a, b = ctx.saved_tensors
|
404 |
+
c_shape = ctx.c_shape
|
405 |
+
da = None
|
406 |
+
db = None
|
407 |
+
dc = None
|
408 |
+
|
409 |
+
if ctx.needs_input_grad[0]:
|
410 |
+
da = _unbroadcast(dout * b, a.shape)
|
411 |
+
|
412 |
+
if ctx.needs_input_grad[1]:
|
413 |
+
db = _unbroadcast(dout * a, b.shape)
|
414 |
+
|
415 |
+
if ctx.needs_input_grad[2]:
|
416 |
+
dc = _unbroadcast(dout, c_shape)
|
417 |
+
|
418 |
+
return da, db, dc
|
419 |
+
|
420 |
+
|
421 |
+
def _unbroadcast(x, shape):
|
422 |
+
extra_dims = x.ndim - len(shape)
|
423 |
+
assert extra_dims >= 0
|
424 |
+
dim = [
|
425 |
+
i
|
426 |
+
for i in range(x.ndim)
|
427 |
+
if x.shape[i] > 1 and (i < extra_dims or shape[i - extra_dims] == 1)
|
428 |
+
]
|
429 |
+
if len(dim):
|
430 |
+
x = x.sum(dim=dim, keepdim=True)
|
431 |
+
if extra_dims:
|
432 |
+
x = x.reshape(-1, *x.shape[extra_dims + 1 :])
|
433 |
+
assert x.shape == shape
|
434 |
+
return x
|
435 |
+
|
436 |
+
|
437 |
+
def modulated_conv2d(
|
438 |
+
x, # Input tensor of shape [batch_size, in_channels, in_height, in_width].
|
439 |
+
weight, # Weight tensor of shape [out_channels, in_channels, kernel_height, kernel_width].
|
440 |
+
styles, # Modulation coefficients of shape [batch_size, in_channels].
|
441 |
+
noise=None, # Optional noise tensor to add to the output activations.
|
442 |
+
up=1, # Integer upsampling factor.
|
443 |
+
down=1, # Integer downsampling factor.
|
444 |
+
padding=0, # Padding with respect to the upsampled image.
|
445 |
+
resample_filter=None,
|
446 |
+
# Low-pass filter to apply when resampling activations. Must be prepared beforehand by calling upfirdn2d.setup_filter().
|
447 |
+
demodulate=True, # Apply weight demodulation?
|
448 |
+
flip_weight=True, # False = convolution, True = correlation (matches torch.nn.functional.conv2d).
|
449 |
+
fused_modconv=True, # Perform modulation, convolution, and demodulation as a single fused operation?
|
450 |
+
):
|
451 |
+
batch_size = x.shape[0]
|
452 |
+
out_channels, in_channels, kh, kw = weight.shape
|
453 |
+
|
454 |
+
# Pre-normalize inputs to avoid FP16 overflow.
|
455 |
+
if x.dtype == torch.float16 and demodulate:
|
456 |
+
weight = weight * (
|
457 |
+
1
|
458 |
+
/ np.sqrt(in_channels * kh * kw)
|
459 |
+
/ weight.norm(float("inf"), dim=[1, 2, 3], keepdim=True)
|
460 |
+
) # max_Ikk
|
461 |
+
styles = styles / styles.norm(float("inf"), dim=1, keepdim=True) # max_I
|
462 |
+
|
463 |
+
# Calculate per-sample weights and demodulation coefficients.
|
464 |
+
w = None
|
465 |
+
dcoefs = None
|
466 |
+
if demodulate or fused_modconv:
|
467 |
+
w = weight.unsqueeze(0) # [NOIkk]
|
468 |
+
w = w * styles.reshape(batch_size, 1, -1, 1, 1) # [NOIkk]
|
469 |
+
if demodulate:
|
470 |
+
dcoefs = (w.square().sum(dim=[2, 3, 4]) + 1e-8).rsqrt() # [NO]
|
471 |
+
if demodulate and fused_modconv:
|
472 |
+
w = w * dcoefs.reshape(batch_size, -1, 1, 1, 1) # [NOIkk]
|
473 |
+
# Execute by scaling the activations before and after the convolution.
|
474 |
+
if not fused_modconv:
|
475 |
+
x = x * styles.to(x.dtype).reshape(batch_size, -1, 1, 1)
|
476 |
+
x = conv2d_resample(  # imported as a function above; the attribute-style call would fail
|
477 |
+
x=x,
|
478 |
+
w=weight.to(x.dtype),
|
479 |
+
f=resample_filter,
|
480 |
+
up=up,
|
481 |
+
down=down,
|
482 |
+
padding=padding,
|
483 |
+
flip_weight=flip_weight,
|
484 |
+
)
|
485 |
+
if demodulate and noise is not None:
|
486 |
+
x = fma(
|
487 |
+
x, dcoefs.to(x.dtype).reshape(batch_size, -1, 1, 1), noise.to(x.dtype)
|
488 |
+
)
|
489 |
+
elif demodulate:
|
490 |
+
x = x * dcoefs.to(x.dtype).reshape(batch_size, -1, 1, 1)
|
491 |
+
elif noise is not None:
|
492 |
+
x = x.add_(noise.to(x.dtype))
|
493 |
+
return x
|
494 |
+
|
495 |
+
# Execute as one fused op using grouped convolution.
|
496 |
+
batch_size = int(batch_size)
|
497 |
+
x = x.reshape(1, -1, *x.shape[2:])
|
498 |
+
w = w.reshape(-1, in_channels, kh, kw)
|
499 |
+
x = conv2d_resample(
|
500 |
+
x=x,
|
501 |
+
w=w.to(x.dtype),
|
502 |
+
f=resample_filter,
|
503 |
+
up=up,
|
504 |
+
down=down,
|
505 |
+
padding=padding,
|
506 |
+
groups=batch_size,
|
507 |
+
flip_weight=flip_weight,
|
508 |
+
)
|
509 |
+
x = x.reshape(batch_size, -1, *x.shape[2:])
|
510 |
+
if noise is not None:
|
511 |
+
x = x.add_(noise)
|
512 |
+
return x
|
513 |
+
|
514 |
+
|
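# --- Added usage sketch (not part of the original file; shapes are assumptions) ---
# modulated_conv2d() scales the shared weight per sample with `styles`, optionally
# demodulates it, and (in the fused path) runs the whole batch as one grouped conv:
def _modulated_conv2d_usage_sketch():
    x = torch.randn(2, 8, 16, 16)        # [batch, in_channels, H, W]
    weight = torch.randn(16, 8, 3, 3)    # [out_channels, in_channels, kh, kw]
    styles = torch.randn(2, 8)           # per-sample, per-input-channel scales
    y = modulated_conv2d(x=x, weight=weight, styles=styles, padding=1)
    return y.shape                       # torch.Size([2, 16, 16, 16])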
515 |
+
class SynthesisLayer(torch.nn.Module):
|
516 |
+
def __init__(
|
517 |
+
self,
|
518 |
+
in_channels, # Number of input channels.
|
519 |
+
out_channels, # Number of output channels.
|
520 |
+
w_dim, # Intermediate latent (W) dimensionality.
|
521 |
+
resolution, # Resolution of this layer.
|
522 |
+
kernel_size=3, # Convolution kernel size.
|
523 |
+
up=1, # Integer upsampling factor.
|
524 |
+
use_noise=True, # Enable noise input?
|
525 |
+
activation="lrelu", # Activation function: 'relu', 'lrelu', etc.
|
526 |
+
resample_filter=[
|
527 |
+
1,
|
528 |
+
3,
|
529 |
+
3,
|
530 |
+
1,
|
531 |
+
], # Low-pass filter to apply when resampling activations.
|
532 |
+
conv_clamp=None, # Clamp the output of convolution layers to +-X, None = disable clamping.
|
533 |
+
channels_last=False, # Use channels_last format for the weights?
|
534 |
+
):
|
535 |
+
super().__init__()
|
536 |
+
self.resolution = resolution
|
537 |
+
self.up = up
|
538 |
+
self.use_noise = use_noise
|
539 |
+
self.activation = activation
|
540 |
+
self.conv_clamp = conv_clamp
|
541 |
+
self.register_buffer("resample_filter", setup_filter(resample_filter))
|
542 |
+
self.padding = kernel_size // 2
|
543 |
+
self.act_gain = activation_funcs[activation].def_gain
|
544 |
+
|
545 |
+
self.affine = FullyConnectedLayer(w_dim, in_channels, bias_init=1)
|
546 |
+
memory_format = (
|
547 |
+
torch.channels_last if channels_last else torch.contiguous_format
|
548 |
+
)
|
549 |
+
self.weight = torch.nn.Parameter(
|
550 |
+
torch.randn([out_channels, in_channels, kernel_size, kernel_size]).to(
|
551 |
+
memory_format=memory_format
|
552 |
+
)
|
553 |
+
)
|
554 |
+
if use_noise:
|
555 |
+
self.register_buffer("noise_const", torch.randn([resolution, resolution]))
|
556 |
+
self.noise_strength = torch.nn.Parameter(torch.zeros([]))
|
557 |
+
self.bias = torch.nn.Parameter(torch.zeros([out_channels]))
|
558 |
+
|
559 |
+
def forward(self, x, w, noise_mode="none", fused_modconv=True, gain=1):
|
560 |
+
assert noise_mode in ["random", "const", "none"]
|
561 |
+
in_resolution = self.resolution // self.up
|
562 |
+
styles = self.affine(w)
|
563 |
+
|
564 |
+
noise = None
|
565 |
+
if self.use_noise and noise_mode == "random":
|
566 |
+
noise = (
|
567 |
+
torch.randn(
|
568 |
+
[x.shape[0], 1, self.resolution, self.resolution], device=x.device
|
569 |
+
)
|
570 |
+
* self.noise_strength
|
571 |
+
)
|
572 |
+
if self.use_noise and noise_mode == "const":
|
573 |
+
noise = self.noise_const * self.noise_strength
|
574 |
+
|
575 |
+
flip_weight = self.up == 1 # slightly faster
|
576 |
+
x = modulated_conv2d(
|
577 |
+
x=x,
|
578 |
+
weight=self.weight,
|
579 |
+
styles=styles,
|
580 |
+
noise=noise,
|
581 |
+
up=self.up,
|
582 |
+
padding=self.padding,
|
583 |
+
resample_filter=self.resample_filter,
|
584 |
+
flip_weight=flip_weight,
|
585 |
+
fused_modconv=fused_modconv,
|
586 |
+
)
|
587 |
+
|
588 |
+
act_gain = self.act_gain * gain
|
589 |
+
act_clamp = self.conv_clamp * gain if self.conv_clamp is not None else None
|
590 |
+
x = F.leaky_relu(x, negative_slope=0.2, inplace=False)
|
591 |
+
if act_gain != 1:
|
592 |
+
x = x * act_gain
|
593 |
+
if act_clamp is not None:
|
594 |
+
x = x.clamp(-act_clamp, act_clamp)
|
595 |
+
return x
|
596 |
+
|
597 |
+
|
598 |
+
class ToRGBLayer(torch.nn.Module):
|
599 |
+
def __init__(
|
600 |
+
self,
|
601 |
+
in_channels,
|
602 |
+
out_channels,
|
603 |
+
w_dim,
|
604 |
+
kernel_size=1,
|
605 |
+
conv_clamp=None,
|
606 |
+
channels_last=False,
|
607 |
+
):
|
608 |
+
super().__init__()
|
609 |
+
self.conv_clamp = conv_clamp
|
610 |
+
self.affine = FullyConnectedLayer(w_dim, in_channels, bias_init=1)
|
611 |
+
memory_format = (
|
612 |
+
torch.channels_last if channels_last else torch.contiguous_format
|
613 |
+
)
|
614 |
+
self.weight = torch.nn.Parameter(
|
615 |
+
torch.randn([out_channels, in_channels, kernel_size, kernel_size]).to(
|
616 |
+
memory_format=memory_format
|
617 |
+
)
|
618 |
+
)
|
619 |
+
self.bias = torch.nn.Parameter(torch.zeros([out_channels]))
|
620 |
+
self.weight_gain = 1 / np.sqrt(in_channels * (kernel_size**2))
|
621 |
+
|
622 |
+
def forward(self, x, w, fused_modconv=True):
|
623 |
+
styles = self.affine(w) * self.weight_gain
|
624 |
+
x = modulated_conv2d(
|
625 |
+
x=x,
|
626 |
+
weight=self.weight,
|
627 |
+
styles=styles,
|
628 |
+
demodulate=False,
|
629 |
+
fused_modconv=fused_modconv,
|
630 |
+
)
|
631 |
+
x = bias_act(x, self.bias.to(x.dtype), clamp=self.conv_clamp)
|
632 |
+
return x
|
633 |
+
|
634 |
+
|
635 |
+
class SynthesisForeword(torch.nn.Module):
|
636 |
+
def __init__(
|
637 |
+
self,
|
638 |
+
z_dim, # Output Latent (Z) dimensionality.
|
639 |
+
resolution, # Resolution of this block.
|
640 |
+
in_channels,
|
641 |
+
img_channels, # Number of input color channels.
|
642 |
+
architecture="skip", # Architecture: 'orig', 'skip', 'resnet'.
|
643 |
+
activation="lrelu", # Activation function: 'relu', 'lrelu', etc.
|
644 |
+
):
|
645 |
+
super().__init__()
|
646 |
+
self.in_channels = in_channels
|
647 |
+
self.z_dim = z_dim
|
648 |
+
self.resolution = resolution
|
649 |
+
self.img_channels = img_channels
|
650 |
+
self.architecture = architecture
|
651 |
+
|
652 |
+
self.fc = FullyConnectedLayer(
|
653 |
+
self.z_dim, (self.z_dim // 2) * 4 * 4, activation=activation
|
654 |
+
)
|
655 |
+
self.conv = SynthesisLayer(
|
656 |
+
self.in_channels, self.in_channels, w_dim=(z_dim // 2) * 3, resolution=4
|
657 |
+
)
|
658 |
+
|
659 |
+
if architecture == "skip":
|
660 |
+
self.torgb = ToRGBLayer(
|
661 |
+
self.in_channels,
|
662 |
+
self.img_channels,
|
663 |
+
kernel_size=1,
|
664 |
+
w_dim=(z_dim // 2) * 3,
|
665 |
+
)
|
666 |
+
|
667 |
+
def forward(self, x, ws, feats, img, force_fp32=False):
|
668 |
+
_ = force_fp32 # unused
|
669 |
+
dtype = torch.float32
|
670 |
+
memory_format = torch.contiguous_format
|
671 |
+
|
672 |
+
x_global = x.clone()
|
673 |
+
# ToRGB.
|
674 |
+
x = self.fc(x)
|
675 |
+
x = x.view(-1, self.z_dim // 2, 4, 4)
|
676 |
+
x = x.to(dtype=dtype, memory_format=memory_format)
|
677 |
+
|
678 |
+
# Main layers.
|
679 |
+
x_skip = feats[4].clone()
|
680 |
+
x = x + x_skip
|
681 |
+
|
682 |
+
mod_vector = []
|
683 |
+
mod_vector.append(ws[:, 0])
|
684 |
+
mod_vector.append(x_global.clone())
|
685 |
+
mod_vector = torch.cat(mod_vector, dim=1)
|
686 |
+
|
687 |
+
x = self.conv(x, mod_vector)
|
688 |
+
|
689 |
+
mod_vector = []
|
690 |
+
mod_vector.append(ws[:, 2 * 2 - 3])
|
691 |
+
mod_vector.append(x_global.clone())
|
692 |
+
mod_vector = torch.cat(mod_vector, dim=1)
|
693 |
+
|
694 |
+
if self.architecture == "skip":
|
695 |
+
img = self.torgb(x, mod_vector)
|
696 |
+
img = img.to(dtype=torch.float32, memory_format=torch.contiguous_format)
|
697 |
+
|
698 |
+
assert x.dtype == dtype
|
699 |
+
return x, img
|
700 |
+
|
701 |
+
|
702 |
+
class SELayer(nn.Module):
|
703 |
+
def __init__(self, channel, reduction=16):
|
704 |
+
super(SELayer, self).__init__()
|
705 |
+
self.avg_pool = nn.AdaptiveAvgPool2d(1)
|
706 |
+
self.fc = nn.Sequential(
|
707 |
+
nn.Linear(channel, channel // reduction, bias=False),
|
708 |
+
nn.ReLU(inplace=False),
|
709 |
+
nn.Linear(channel // reduction, channel, bias=False),
|
710 |
+
nn.Sigmoid(),
|
711 |
+
)
|
712 |
+
|
713 |
+
def forward(self, x):
|
714 |
+
b, c, _, _ = x.size()
|
715 |
+
y = self.avg_pool(x).view(b, c)
|
716 |
+
y = self.fc(y).view(b, c, 1, 1)
|
717 |
+
res = x * y.expand_as(x)
|
718 |
+
return res
|
719 |
+
|
720 |
+
|
721 |
+
class FourierUnit(nn.Module):
|
722 |
+
def __init__(
|
723 |
+
self,
|
724 |
+
in_channels,
|
725 |
+
out_channels,
|
726 |
+
groups=1,
|
727 |
+
spatial_scale_factor=None,
|
728 |
+
spatial_scale_mode="bilinear",
|
729 |
+
spectral_pos_encoding=False,
|
730 |
+
use_se=False,
|
731 |
+
se_kwargs=None,
|
732 |
+
ffc3d=False,
|
733 |
+
fft_norm="ortho",
|
734 |
+
):
|
735 |
+
# bn_layer not used
|
736 |
+
super(FourierUnit, self).__init__()
|
737 |
+
self.groups = groups
|
738 |
+
|
739 |
+
self.conv_layer = torch.nn.Conv2d(
|
740 |
+
in_channels=in_channels * 2 + (2 if spectral_pos_encoding else 0),
|
741 |
+
out_channels=out_channels * 2,
|
742 |
+
kernel_size=1,
|
743 |
+
stride=1,
|
744 |
+
padding=0,
|
745 |
+
groups=self.groups,
|
746 |
+
bias=False,
|
747 |
+
)
|
748 |
+
self.relu = torch.nn.ReLU(inplace=False)
|
749 |
+
|
750 |
+
# squeeze and excitation block
|
751 |
+
self.use_se = use_se
|
752 |
+
if use_se:
|
753 |
+
if se_kwargs is None:
|
754 |
+
se_kwargs = {}
|
755 |
+
self.se = SELayer(self.conv_layer.in_channels, **se_kwargs)
|
756 |
+
|
757 |
+
self.spatial_scale_factor = spatial_scale_factor
|
758 |
+
self.spatial_scale_mode = spatial_scale_mode
|
759 |
+
self.spectral_pos_encoding = spectral_pos_encoding
|
760 |
+
self.ffc3d = ffc3d
|
761 |
+
self.fft_norm = fft_norm
|
762 |
+
|
763 |
+
def forward(self, x):
|
764 |
+
batch = x.shape[0]
|
765 |
+
|
766 |
+
if self.spatial_scale_factor is not None:
|
767 |
+
orig_size = x.shape[-2:]
|
768 |
+
x = F.interpolate(
|
769 |
+
x,
|
770 |
+
scale_factor=self.spatial_scale_factor,
|
771 |
+
mode=self.spatial_scale_mode,
|
772 |
+
align_corners=False,
|
773 |
+
)
|
774 |
+
|
775 |
+
r_size = x.size()
|
776 |
+
# (batch, c, h, w/2+1, 2)
|
777 |
+
fft_dim = (-3, -2, -1) if self.ffc3d else (-2, -1)
|
778 |
+
ffted = fft.rfftn(x, dim=fft_dim, norm=self.fft_norm)
|
779 |
+
ffted = torch.stack((ffted.real, ffted.imag), dim=-1)
|
780 |
+
ffted = ffted.permute(0, 1, 4, 2, 3).contiguous() # (batch, c, 2, h, w/2+1)
|
781 |
+
ffted = ffted.view(
|
782 |
+
(
|
783 |
+
batch,
|
784 |
+
-1,
|
785 |
+
)
|
786 |
+
+ ffted.size()[3:]
|
787 |
+
)
|
788 |
+
|
789 |
+
if self.spectral_pos_encoding:
|
790 |
+
height, width = ffted.shape[-2:]
|
791 |
+
coords_vert = (
|
792 |
+
torch.linspace(0, 1, height)[None, None, :, None]
|
793 |
+
.expand(batch, 1, height, width)
|
794 |
+
.to(ffted)
|
795 |
+
)
|
796 |
+
coords_hor = (
|
797 |
+
torch.linspace(0, 1, width)[None, None, None, :]
|
798 |
+
.expand(batch, 1, height, width)
|
799 |
+
.to(ffted)
|
800 |
+
)
|
801 |
+
ffted = torch.cat((coords_vert, coords_hor, ffted), dim=1)
|
802 |
+
|
803 |
+
if self.use_se:
|
804 |
+
ffted = self.se(ffted)
|
805 |
+
|
806 |
+
ffted = self.conv_layer(ffted) # (batch, c*2, h, w/2+1)
|
807 |
+
ffted = self.relu(ffted)
|
808 |
+
|
809 |
+
ffted = (
|
810 |
+
ffted.view(
|
811 |
+
(
|
812 |
+
batch,
|
813 |
+
-1,
|
814 |
+
2,
|
815 |
+
)
|
816 |
+
+ ffted.size()[2:]
|
817 |
+
)
|
818 |
+
.permute(0, 1, 3, 4, 2)
|
819 |
+
.contiguous()
|
820 |
+
) # (batch,c, t, h, w/2+1, 2)
|
821 |
+
ffted = torch.complex(ffted[..., 0], ffted[..., 1])
|
822 |
+
|
823 |
+
ifft_shape_slice = x.shape[-3:] if self.ffc3d else x.shape[-2:]
|
824 |
+
output = torch.fft.irfftn(
|
825 |
+
ffted, s=ifft_shape_slice, dim=fft_dim, norm=self.fft_norm
|
826 |
+
)
|
827 |
+
|
828 |
+
if self.spatial_scale_factor is not None:
|
829 |
+
output = F.interpolate(
|
830 |
+
output,
|
831 |
+
size=orig_size,
|
832 |
+
mode=self.spatial_scale_mode,
|
833 |
+
align_corners=False,
|
834 |
+
)
|
835 |
+
|
836 |
+
return output
|
837 |
+
|
838 |
+
|
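# --- Added note (not part of the original file) ---
# FourierUnit.forward() gives a convolution a global receptive field in three steps:
#   1. fft.rfftn over H, W turns [B, C, H, W] into a complex [B, C, H, W//2 + 1] spectrum;
#   2. real and imaginary parts are folded into channels ([B, 2C, H, W//2 + 1]) and
#      mixed by the 1x1 conv_layer + ReLU;
#   3. torch.fft.irfftn maps the result back to the original spatial size.
# Every output pixel therefore depends on the whole input, which is what lets the
# FFC-based blocks below fill in large masked regions coherently.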
839 |
+
class SpectralTransform(nn.Module):
|
840 |
+
def __init__(
|
841 |
+
self,
|
842 |
+
in_channels,
|
843 |
+
out_channels,
|
844 |
+
stride=1,
|
845 |
+
groups=1,
|
846 |
+
enable_lfu=True,
|
847 |
+
**fu_kwargs,
|
848 |
+
):
|
849 |
+
# bn_layer not used
|
850 |
+
super(SpectralTransform, self).__init__()
|
851 |
+
self.enable_lfu = enable_lfu
|
852 |
+
if stride == 2:
|
853 |
+
self.downsample = nn.AvgPool2d(kernel_size=(2, 2), stride=2)
|
854 |
+
else:
|
855 |
+
self.downsample = nn.Identity()
|
856 |
+
|
857 |
+
self.stride = stride
|
858 |
+
self.conv1 = nn.Sequential(
|
859 |
+
nn.Conv2d(
|
860 |
+
in_channels, out_channels // 2, kernel_size=1, groups=groups, bias=False
|
861 |
+
),
|
862 |
+
# nn.BatchNorm2d(out_channels // 2),
|
863 |
+
nn.ReLU(inplace=True),
|
864 |
+
)
|
865 |
+
self.fu = FourierUnit(out_channels // 2, out_channels // 2, groups, **fu_kwargs)
|
866 |
+
if self.enable_lfu:
|
867 |
+
self.lfu = FourierUnit(out_channels // 2, out_channels // 2, groups)
|
868 |
+
self.conv2 = torch.nn.Conv2d(
|
869 |
+
out_channels // 2, out_channels, kernel_size=1, groups=groups, bias=False
|
870 |
+
)
|
871 |
+
|
872 |
+
def forward(self, x):
|
873 |
+
|
874 |
+
x = self.downsample(x)
|
875 |
+
x = self.conv1(x)
|
876 |
+
output = self.fu(x)
|
877 |
+
|
878 |
+
if self.enable_lfu:
|
879 |
+
n, c, h, w = x.shape
|
880 |
+
split_no = 2
|
881 |
+
split_s = h // split_no
|
882 |
+
xs = torch.cat(
|
883 |
+
torch.split(x[:, : c // 4], split_s, dim=-2), dim=1
|
884 |
+
).contiguous()
|
885 |
+
xs = torch.cat(torch.split(xs, split_s, dim=-1), dim=1).contiguous()
|
886 |
+
xs = self.lfu(xs)
|
887 |
+
xs = xs.repeat(1, 1, split_no, split_no).contiguous()
|
888 |
+
else:
|
889 |
+
xs = 0
|
890 |
+
|
891 |
+
output = self.conv2(x + output + xs)
|
892 |
+
|
893 |
+
return output
|
894 |
+
|
895 |
+
|
896 |
+
class FFC(nn.Module):
|
897 |
+
def __init__(
|
898 |
+
self,
|
899 |
+
in_channels,
|
900 |
+
out_channels,
|
901 |
+
kernel_size,
|
902 |
+
ratio_gin,
|
903 |
+
ratio_gout,
|
904 |
+
stride=1,
|
905 |
+
padding=0,
|
906 |
+
dilation=1,
|
907 |
+
groups=1,
|
908 |
+
bias=False,
|
909 |
+
enable_lfu=True,
|
910 |
+
padding_type="reflect",
|
911 |
+
gated=False,
|
912 |
+
**spectral_kwargs,
|
913 |
+
):
|
914 |
+
super(FFC, self).__init__()
|
915 |
+
|
916 |
+
assert stride == 1 or stride == 2, "Stride should be 1 or 2."
|
917 |
+
self.stride = stride
|
918 |
+
|
919 |
+
in_cg = int(in_channels * ratio_gin)
|
920 |
+
in_cl = in_channels - in_cg
|
921 |
+
out_cg = int(out_channels * ratio_gout)
|
922 |
+
out_cl = out_channels - out_cg
|
923 |
+
# groups_g = 1 if groups == 1 else int(groups * ratio_gout)
|
924 |
+
# groups_l = 1 if groups == 1 else groups - groups_g
|
925 |
+
|
926 |
+
self.ratio_gin = ratio_gin
|
927 |
+
self.ratio_gout = ratio_gout
|
928 |
+
self.global_in_num = in_cg
|
929 |
+
|
930 |
+
module = nn.Identity if in_cl == 0 or out_cl == 0 else nn.Conv2d
|
931 |
+
self.convl2l = module(
|
932 |
+
in_cl,
|
933 |
+
out_cl,
|
934 |
+
kernel_size,
|
935 |
+
stride,
|
936 |
+
padding,
|
937 |
+
dilation,
|
938 |
+
groups,
|
939 |
+
bias,
|
940 |
+
padding_mode=padding_type,
|
941 |
+
)
|
942 |
+
module = nn.Identity if in_cl == 0 or out_cg == 0 else nn.Conv2d
|
943 |
+
self.convl2g = module(
|
944 |
+
in_cl,
|
945 |
+
out_cg,
|
946 |
+
kernel_size,
|
947 |
+
stride,
|
948 |
+
padding,
|
949 |
+
dilation,
|
950 |
+
groups,
|
951 |
+
bias,
|
952 |
+
padding_mode=padding_type,
|
953 |
+
)
|
954 |
+
module = nn.Identity if in_cg == 0 or out_cl == 0 else nn.Conv2d
|
955 |
+
self.convg2l = module(
|
956 |
+
in_cg,
|
957 |
+
out_cl,
|
958 |
+
kernel_size,
|
959 |
+
stride,
|
960 |
+
padding,
|
961 |
+
dilation,
|
962 |
+
groups,
|
963 |
+
bias,
|
964 |
+
padding_mode=padding_type,
|
965 |
+
)
|
966 |
+
module = nn.Identity if in_cg == 0 or out_cg == 0 else SpectralTransform
|
967 |
+
self.convg2g = module(
|
968 |
+
in_cg,
|
969 |
+
out_cg,
|
970 |
+
stride,
|
971 |
+
1 if groups == 1 else groups // 2,
|
972 |
+
enable_lfu,
|
973 |
+
**spectral_kwargs,
|
974 |
+
)
|
975 |
+
|
976 |
+
self.gated = gated
|
977 |
+
module = (
|
978 |
+
nn.Identity if in_cg == 0 or out_cl == 0 or not self.gated else nn.Conv2d
|
979 |
+
)
|
980 |
+
self.gate = module(in_channels, 2, 1)
|
981 |
+
|
982 |
+
def forward(self, x, fname=None):
|
983 |
+
x_l, x_g = x if type(x) is tuple else (x, 0)
|
984 |
+
out_xl, out_xg = 0, 0
|
985 |
+
|
986 |
+
if self.gated:
|
987 |
+
total_input_parts = [x_l]
|
988 |
+
if torch.is_tensor(x_g):
|
989 |
+
total_input_parts.append(x_g)
|
990 |
+
total_input = torch.cat(total_input_parts, dim=1)
|
991 |
+
|
992 |
+
gates = torch.sigmoid(self.gate(total_input))
|
993 |
+
g2l_gate, l2g_gate = gates.chunk(2, dim=1)
|
994 |
+
else:
|
995 |
+
g2l_gate, l2g_gate = 1, 1
|
996 |
+
|
997 |
+
spec_x = self.convg2g(x_g)
|
998 |
+
|
999 |
+
if self.ratio_gout != 1:
|
1000 |
+
out_xl = self.convl2l(x_l) + self.convg2l(x_g) * g2l_gate
|
1001 |
+
if self.ratio_gout != 0:
|
1002 |
+
out_xg = self.convl2g(x_l) * l2g_gate + spec_x
|
1003 |
+
|
1004 |
+
return out_xl, out_xg
|
1005 |
+
|
1006 |
+
|
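# --- Added sketch (not part of the original file; the 64-channel figure is an assumption) ---
# FFC splits its channels into a "local" branch (plain convolutions) and a "global"
# branch (SpectralTransform), with l2g / g2l cross connections. The split mirrors
# the arithmetic in FFC.__init__ above:
def _ffc_channel_split_sketch(in_channels=64, ratio_gin=0.75):
    in_cg = int(in_channels * ratio_gin)   # 48 channels go to the global branch
    in_cl = in_channels - in_cg            # 16 channels stay in the local branch
    return in_cl, in_cg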
1007 |
+
class FFC_BN_ACT(nn.Module):
|
1008 |
+
def __init__(
|
1009 |
+
self,
|
1010 |
+
in_channels,
|
1011 |
+
out_channels,
|
1012 |
+
kernel_size,
|
1013 |
+
ratio_gin,
|
1014 |
+
ratio_gout,
|
1015 |
+
stride=1,
|
1016 |
+
padding=0,
|
1017 |
+
dilation=1,
|
1018 |
+
groups=1,
|
1019 |
+
bias=False,
|
1020 |
+
norm_layer=nn.SyncBatchNorm,
|
1021 |
+
activation_layer=nn.Identity,
|
1022 |
+
padding_type="reflect",
|
1023 |
+
enable_lfu=True,
|
1024 |
+
**kwargs,
|
1025 |
+
):
|
1026 |
+
super(FFC_BN_ACT, self).__init__()
|
1027 |
+
self.ffc = FFC(
|
1028 |
+
in_channels,
|
1029 |
+
out_channels,
|
1030 |
+
kernel_size,
|
1031 |
+
ratio_gin,
|
1032 |
+
ratio_gout,
|
1033 |
+
stride,
|
1034 |
+
padding,
|
1035 |
+
dilation,
|
1036 |
+
groups,
|
1037 |
+
bias,
|
1038 |
+
enable_lfu,
|
1039 |
+
padding_type=padding_type,
|
1040 |
+
**kwargs,
|
1041 |
+
)
|
1042 |
+
lnorm = nn.Identity if ratio_gout == 1 else norm_layer
|
1043 |
+
gnorm = nn.Identity if ratio_gout == 0 else norm_layer
|
1044 |
+
global_channels = int(out_channels * ratio_gout)
|
1045 |
+
# self.bn_l = lnorm(out_channels - global_channels)
|
1046 |
+
# self.bn_g = gnorm(global_channels)
|
1047 |
+
|
1048 |
+
lact = nn.Identity if ratio_gout == 1 else activation_layer
|
1049 |
+
gact = nn.Identity if ratio_gout == 0 else activation_layer
|
1050 |
+
self.act_l = lact(inplace=True)
|
1051 |
+
self.act_g = gact(inplace=True)
|
1052 |
+
|
1053 |
+
def forward(self, x, fname=None):
|
1054 |
+
x_l, x_g = self.ffc(
|
1055 |
+
x,
|
1056 |
+
fname=fname,
|
1057 |
+
)
|
1058 |
+
x_l = self.act_l(x_l)
|
1059 |
+
x_g = self.act_g(x_g)
|
1060 |
+
return x_l, x_g
|
1061 |
+
|
1062 |
+
|
1063 |
+
class FFCResnetBlock(nn.Module):
|
1064 |
+
def __init__(
|
1065 |
+
self,
|
1066 |
+
dim,
|
1067 |
+
padding_type,
|
1068 |
+
norm_layer,
|
1069 |
+
activation_layer=nn.ReLU,
|
1070 |
+
dilation=1,
|
1071 |
+
spatial_transform_kwargs=None,
|
1072 |
+
inline=False,
|
1073 |
+
ratio_gin=0.75,
|
1074 |
+
ratio_gout=0.75,
|
1075 |
+
):
|
1076 |
+
super().__init__()
|
1077 |
+
self.conv1 = FFC_BN_ACT(
|
1078 |
+
dim,
|
1079 |
+
dim,
|
1080 |
+
kernel_size=3,
|
1081 |
+
padding=dilation,
|
1082 |
+
dilation=dilation,
|
1083 |
+
norm_layer=norm_layer,
|
1084 |
+
activation_layer=activation_layer,
|
1085 |
+
padding_type=padding_type,
|
1086 |
+
ratio_gin=ratio_gin,
|
1087 |
+
ratio_gout=ratio_gout,
|
1088 |
+
)
|
1089 |
+
self.conv2 = FFC_BN_ACT(
|
1090 |
+
dim,
|
1091 |
+
dim,
|
1092 |
+
kernel_size=3,
|
1093 |
+
padding=dilation,
|
1094 |
+
dilation=dilation,
|
1095 |
+
norm_layer=norm_layer,
|
1096 |
+
activation_layer=activation_layer,
|
1097 |
+
padding_type=padding_type,
|
1098 |
+
ratio_gin=ratio_gin,
|
1099 |
+
ratio_gout=ratio_gout,
|
1100 |
+
)
|
1101 |
+
self.inline = inline
|
1102 |
+
|
1103 |
+
def forward(self, x, fname=None):
|
1104 |
+
if self.inline:
|
1105 |
+
x_l, x_g = (
|
1106 |
+
x[:, : -self.conv1.ffc.global_in_num],
|
1107 |
+
x[:, -self.conv1.ffc.global_in_num :],
|
1108 |
+
)
|
1109 |
+
else:
|
1110 |
+
x_l, x_g = x if type(x) is tuple else (x, 0)
|
1111 |
+
|
1112 |
+
id_l, id_g = x_l, x_g
|
1113 |
+
|
1114 |
+
x_l, x_g = self.conv1((x_l, x_g), fname=fname)
|
1115 |
+
x_l, x_g = self.conv2((x_l, x_g), fname=fname)
|
1116 |
+
|
1117 |
+
x_l, x_g = id_l + x_l, id_g + x_g
|
1118 |
+
out = x_l, x_g
|
1119 |
+
if self.inline:
|
1120 |
+
out = torch.cat(out, dim=1)
|
1121 |
+
return out
|
1122 |
+
|
1123 |
+
|
1124 |
+
class ConcatTupleLayer(nn.Module):
|
1125 |
+
def forward(self, x):
|
1126 |
+
assert isinstance(x, tuple)
|
1127 |
+
x_l, x_g = x
|
1128 |
+
assert torch.is_tensor(x_l) or torch.is_tensor(x_g)
|
1129 |
+
if not torch.is_tensor(x_g):
|
1130 |
+
return x_l
|
1131 |
+
return torch.cat(x, dim=1)
|
1132 |
+
|
1133 |
+
|
1134 |
+
class FFCBlock(torch.nn.Module):
|
1135 |
+
def __init__(
|
1136 |
+
self,
|
1137 |
+
dim, # Number of output/input channels.
|
1138 |
+
kernel_size, # Width and height of the convolution kernel.
|
1139 |
+
padding,
|
1140 |
+
ratio_gin=0.75,
|
1141 |
+
ratio_gout=0.75,
|
1142 |
+
activation="linear", # Activation function: 'relu', 'lrelu', etc.
|
1143 |
+
):
|
1144 |
+
super().__init__()
|
1145 |
+
if activation == "linear":
|
1146 |
+
self.activation = nn.Identity
|
1147 |
+
else:
|
1148 |
+
self.activation = nn.ReLU
|
1149 |
+
self.padding = padding
|
1150 |
+
self.kernel_size = kernel_size
|
1151 |
+
self.ffc_block = FFCResnetBlock(
|
1152 |
+
dim=dim,
|
1153 |
+
padding_type="reflect",
|
1154 |
+
norm_layer=nn.SyncBatchNorm,
|
1155 |
+
activation_layer=self.activation,
|
1156 |
+
dilation=1,
|
1157 |
+
ratio_gin=ratio_gin,
|
1158 |
+
ratio_gout=ratio_gout,
|
1159 |
+
)
|
1160 |
+
|
1161 |
+
self.concat_layer = ConcatTupleLayer()
|
1162 |
+
|
1163 |
+
def forward(self, gen_ft, mask, fname=None):
|
1164 |
+
x = gen_ft.float()
|
1165 |
+
|
1166 |
+
x_l, x_g = (
|
1167 |
+
x[:, : -self.ffc_block.conv1.ffc.global_in_num],
|
1168 |
+
x[:, -self.ffc_block.conv1.ffc.global_in_num :],
|
1169 |
+
)
|
1170 |
+
id_l, id_g = x_l, x_g
|
1171 |
+
|
1172 |
+
x_l, x_g = self.ffc_block((x_l, x_g), fname=fname)
|
1173 |
+
x_l, x_g = id_l + x_l, id_g + x_g
|
1174 |
+
x = self.concat_layer((x_l, x_g))
|
1175 |
+
|
1176 |
+
return x + gen_ft.float()
|
1177 |
+
|
1178 |
+
|
1179 |
+
class FFCSkipLayer(torch.nn.Module):
|
1180 |
+
def __init__(
|
1181 |
+
self,
|
1182 |
+
dim, # Number of input/output channels.
|
1183 |
+
kernel_size=3, # Convolution kernel size.
|
1184 |
+
ratio_gin=0.75,
|
1185 |
+
ratio_gout=0.75,
|
1186 |
+
):
|
1187 |
+
super().__init__()
|
1188 |
+
self.padding = kernel_size // 2
|
1189 |
+
|
1190 |
+
self.ffc_act = FFCBlock(
|
1191 |
+
dim=dim,
|
1192 |
+
kernel_size=kernel_size,
|
1193 |
+
activation=nn.ReLU,
|
1194 |
+
padding=self.padding,
|
1195 |
+
ratio_gin=ratio_gin,
|
1196 |
+
ratio_gout=ratio_gout,
|
1197 |
+
)
|
1198 |
+
|
1199 |
+
def forward(self, gen_ft, mask, fname=None):
|
1200 |
+
x = self.ffc_act(gen_ft, mask, fname=fname)
|
1201 |
+
return x
|
1202 |
+
|
1203 |
+
|
1204 |
+
class SynthesisBlock(torch.nn.Module):
|
1205 |
+
def __init__(
|
1206 |
+
self,
|
1207 |
+
in_channels, # Number of input channels, 0 = first block.
|
1208 |
+
out_channels, # Number of output channels.
|
1209 |
+
w_dim, # Intermediate latent (W) dimensionality.
|
1210 |
+
resolution, # Resolution of this block.
|
1211 |
+
img_channels, # Number of output color channels.
|
1212 |
+
is_last, # Is this the last block?
|
1213 |
+
architecture="skip", # Architecture: 'orig', 'skip', 'resnet'.
|
1214 |
+
resample_filter=[
|
1215 |
+
1,
|
1216 |
+
3,
|
1217 |
+
3,
|
1218 |
+
1,
|
1219 |
+
], # Low-pass filter to apply when resampling activations.
|
1220 |
+
conv_clamp=None, # Clamp the output of convolution layers to +-X, None = disable clamping.
|
1221 |
+
use_fp16=False, # Use FP16 for this block?
|
1222 |
+
fp16_channels_last=False, # Use channels-last memory format with FP16?
|
1223 |
+
**layer_kwargs, # Arguments for SynthesisLayer.
|
1224 |
+
):
|
1225 |
+
assert architecture in ["orig", "skip", "resnet"]
|
1226 |
+
super().__init__()
|
1227 |
+
self.in_channels = in_channels
|
1228 |
+
self.w_dim = w_dim
|
1229 |
+
self.resolution = resolution
|
1230 |
+
self.img_channels = img_channels
|
1231 |
+
self.is_last = is_last
|
1232 |
+
self.architecture = architecture
|
1233 |
+
self.use_fp16 = use_fp16
|
1234 |
+
self.channels_last = use_fp16 and fp16_channels_last
|
1235 |
+
self.register_buffer("resample_filter", setup_filter(resample_filter))
|
1236 |
+
self.num_conv = 0
|
1237 |
+
self.num_torgb = 0
|
1238 |
+
self.res_ffc = {4: 0, 8: 0, 16: 0, 32: 1, 64: 1, 128: 1, 256: 1, 512: 1}
|
1239 |
+
|
1240 |
+
if in_channels != 0 and resolution >= 8:
|
1241 |
+
self.ffc_skip = nn.ModuleList()
|
1242 |
+
for _ in range(self.res_ffc[resolution]):
|
1243 |
+
self.ffc_skip.append(FFCSkipLayer(dim=out_channels))
|
1244 |
+
|
1245 |
+
if in_channels == 0:
|
1246 |
+
self.const = torch.nn.Parameter(
|
1247 |
+
torch.randn([out_channels, resolution, resolution])
|
1248 |
+
)
|
1249 |
+
|
1250 |
+
if in_channels != 0:
|
1251 |
+
self.conv0 = SynthesisLayer(
|
1252 |
+
in_channels,
|
1253 |
+
out_channels,
|
1254 |
+
w_dim=w_dim * 3,
|
1255 |
+
resolution=resolution,
|
1256 |
+
up=2,
|
1257 |
+
resample_filter=resample_filter,
|
1258 |
+
conv_clamp=conv_clamp,
|
1259 |
+
channels_last=self.channels_last,
|
1260 |
+
**layer_kwargs,
|
1261 |
+
)
|
1262 |
+
self.num_conv += 1
|
1263 |
+
|
1264 |
+
self.conv1 = SynthesisLayer(
|
1265 |
+
out_channels,
|
1266 |
+
out_channels,
|
1267 |
+
w_dim=w_dim * 3,
|
1268 |
+
resolution=resolution,
|
1269 |
+
conv_clamp=conv_clamp,
|
1270 |
+
channels_last=self.channels_last,
|
1271 |
+
**layer_kwargs,
|
1272 |
+
)
|
1273 |
+
self.num_conv += 1
|
1274 |
+
|
1275 |
+
if is_last or architecture == "skip":
|
1276 |
+
self.torgb = ToRGBLayer(
|
1277 |
+
out_channels,
|
1278 |
+
img_channels,
|
1279 |
+
w_dim=w_dim * 3,
|
1280 |
+
conv_clamp=conv_clamp,
|
1281 |
+
channels_last=self.channels_last,
|
1282 |
+
)
|
1283 |
+
self.num_torgb += 1
|
1284 |
+
|
1285 |
+
if in_channels != 0 and architecture == "resnet":
|
1286 |
+
self.skip = Conv2dLayer(
|
1287 |
+
in_channels,
|
1288 |
+
out_channels,
|
1289 |
+
kernel_size=1,
|
1290 |
+
bias=False,
|
1291 |
+
up=2,
|
1292 |
+
resample_filter=resample_filter,
|
1293 |
+
channels_last=self.channels_last,
|
1294 |
+
)
|
1295 |
+
|
1296 |
+
def forward(
|
1297 |
+
self,
|
1298 |
+
x,
|
1299 |
+
mask,
|
1300 |
+
feats,
|
1301 |
+
img,
|
1302 |
+
ws,
|
1303 |
+
fname=None,
|
1304 |
+
force_fp32=False,
|
1305 |
+
fused_modconv=None,
|
1306 |
+
**layer_kwargs,
|
1307 |
+
):
|
1308 |
+
dtype = torch.float16 if self.use_fp16 and not force_fp32 else torch.float32
|
1309 |
+
dtype = torch.float32  # overrides the FP16 selection above: synthesis always runs in FP32
|
1310 |
+
memory_format = (
|
1311 |
+
torch.channels_last
|
1312 |
+
if self.channels_last and not force_fp32
|
1313 |
+
else torch.contiguous_format
|
1314 |
+
)
|
1315 |
+
if fused_modconv is None:
|
1316 |
+
fused_modconv = (not self.training) and (
|
1317 |
+
dtype == torch.float32 or int(x.shape[0]) == 1
|
1318 |
+
)
|
1319 |
+
|
1320 |
+
x = x.to(dtype=dtype, memory_format=memory_format)
|
1321 |
+
x_skip = (
|
1322 |
+
feats[self.resolution].clone().to(dtype=dtype, memory_format=memory_format)
|
1323 |
+
)
|
1324 |
+
|
1325 |
+
# Main layers.
|
1326 |
+
if self.in_channels == 0:
|
1327 |
+
x = self.conv1(x, ws[1], fused_modconv=fused_modconv, **layer_kwargs)
|
1328 |
+
elif self.architecture == "resnet":
|
1329 |
+
y = self.skip(x, gain=np.sqrt(0.5))
|
1330 |
+
x = self.conv0(
|
1331 |
+
x, ws[0].clone(), fused_modconv=fused_modconv, **layer_kwargs
|
1332 |
+
)
|
1333 |
+
if len(self.ffc_skip) > 0:
|
1334 |
+
mask = F.interpolate(
|
1335 |
+
mask,
|
1336 |
+
size=x_skip.shape[2:],
|
1337 |
+
)
|
1338 |
+
z = x + x_skip
|
1339 |
+
for fres in self.ffc_skip:
|
1340 |
+
z = fres(z, mask)
|
1341 |
+
x = x + z
|
1342 |
+
else:
|
1343 |
+
x = x + x_skip
|
1344 |
+
x = self.conv1(
|
1345 |
+
x,
|
1346 |
+
ws[1].clone(),
|
1347 |
+
fused_modconv=fused_modconv,
|
1348 |
+
gain=np.sqrt(0.5),
|
1349 |
+
**layer_kwargs,
|
1350 |
+
)
|
1351 |
+
x = y.add_(x)
|
1352 |
+
else:
|
1353 |
+
x = self.conv0(
|
1354 |
+
x, ws[0].clone(), fused_modconv=fused_modconv, **layer_kwargs
|
1355 |
+
)
|
1356 |
+
if len(self.ffc_skip) > 0:
|
1357 |
+
mask = F.interpolate(
|
1358 |
+
mask,
|
1359 |
+
size=x_skip.shape[2:],
|
1360 |
+
)
|
1361 |
+
z = x + x_skip
|
1362 |
+
for fres in self.ffc_skip:
|
1363 |
+
z = fres(z, mask)
|
1364 |
+
x = x + z
|
1365 |
+
else:
|
1366 |
+
x = x + x_skip
|
1367 |
+
x = self.conv1(
|
1368 |
+
x, ws[1].clone(), fused_modconv=fused_modconv, **layer_kwargs
|
1369 |
+
)
|
1370 |
+
# ToRGB.
|
1371 |
+
if img is not None:
|
1372 |
+
img = upsample2d(img, self.resample_filter)
|
1373 |
+
if self.is_last or self.architecture == "skip":
|
1374 |
+
y = self.torgb(x, ws[2].clone(), fused_modconv=fused_modconv)
|
1375 |
+
y = y.to(dtype=torch.float32, memory_format=torch.contiguous_format)
|
1376 |
+
img = img.add_(y) if img is not None else y
|
1377 |
+
|
1378 |
+
x = x.to(dtype=dtype)
|
1379 |
+
assert x.dtype == dtype
|
1380 |
+
assert img is None or img.dtype == torch.float32
|
1381 |
+
return x, img
|
1382 |
+
|
1383 |
+
|
1384 |
+
class SynthesisNetwork(torch.nn.Module):
|
1385 |
+
def __init__(
|
1386 |
+
self,
|
1387 |
+
w_dim, # Intermediate latent (W) dimensionality.
|
1388 |
+
z_dim, # Output Latent (Z) dimensionality.
|
1389 |
+
img_resolution, # Output image resolution.
|
1390 |
+
img_channels, # Number of color channels.
|
1391 |
+
channel_base=16384, # Overall multiplier for the number of channels.
|
1392 |
+
channel_max=512, # Maximum number of channels in any layer.
|
1393 |
+
num_fp16_res=0, # Use FP16 for the N highest resolutions.
|
1394 |
+
**block_kwargs, # Arguments for SynthesisBlock.
|
1395 |
+
):
|
1396 |
+
assert img_resolution >= 4 and img_resolution & (img_resolution - 1) == 0
|
1397 |
+
super().__init__()
|
1398 |
+
self.w_dim = w_dim
|
1399 |
+
self.img_resolution = img_resolution
|
1400 |
+
self.img_resolution_log2 = int(np.log2(img_resolution))
|
1401 |
+
self.img_channels = img_channels
|
1402 |
+
self.block_resolutions = [
|
1403 |
+
2**i for i in range(3, self.img_resolution_log2 + 1)
|
1404 |
+
]
|
1405 |
+
channels_dict = {
|
1406 |
+
res: min(channel_base // res, channel_max) for res in self.block_resolutions
|
1407 |
+
}
|
1408 |
+
fp16_resolution = max(2 ** (self.img_resolution_log2 + 1 - num_fp16_res), 8)
|
1409 |
+
|
1410 |
+
self.foreword = SynthesisForeword(
|
1411 |
+
img_channels=img_channels,
|
1412 |
+
in_channels=min(channel_base // 4, channel_max),
|
1413 |
+
z_dim=z_dim * 2,
|
1414 |
+
resolution=4,
|
1415 |
+
)
|
1416 |
+
|
1417 |
+
self.num_ws = self.img_resolution_log2 * 2 - 2
|
1418 |
+
for res in self.block_resolutions:
|
1419 |
+
if res // 2 in channels_dict.keys():
|
1420 |
+
in_channels = channels_dict[res // 2] if res > 4 else 0
|
1421 |
+
else:
|
1422 |
+
in_channels = min(channel_base // (res // 2), channel_max)
|
1423 |
+
out_channels = channels_dict[res]
|
1424 |
+
use_fp16 = res >= fp16_resolution
|
1425 |
+
use_fp16 = False
|
1426 |
+
is_last = res == self.img_resolution
|
1427 |
+
block = SynthesisBlock(
|
1428 |
+
in_channels,
|
1429 |
+
out_channels,
|
1430 |
+
w_dim=w_dim,
|
1431 |
+
resolution=res,
|
1432 |
+
img_channels=img_channels,
|
1433 |
+
is_last=is_last,
|
1434 |
+
use_fp16=use_fp16,
|
1435 |
+
**block_kwargs,
|
1436 |
+
)
|
1437 |
+
setattr(self, f"b{res}", block)
|
1438 |
+
|
1439 |
+
def forward(self, x_global, mask, feats, ws, fname=None, **block_kwargs):
|
1440 |
+
|
1441 |
+
img = None
|
1442 |
+
|
1443 |
+
x, img = self.foreword(x_global, ws, feats, img)
|
1444 |
+
|
1445 |
+
for res in self.block_resolutions:
|
1446 |
+
block = getattr(self, f"b{res}")
|
1447 |
+
mod_vector0 = []
|
1448 |
+
mod_vector0.append(ws[:, int(np.log2(res)) * 2 - 5])
|
1449 |
+
mod_vector0.append(x_global.clone())
|
1450 |
+
mod_vector0 = torch.cat(mod_vector0, dim=1)
|
1451 |
+
|
1452 |
+
mod_vector1 = []
|
1453 |
+
mod_vector1.append(ws[:, int(np.log2(res)) * 2 - 4])
|
1454 |
+
mod_vector1.append(x_global.clone())
|
1455 |
+
mod_vector1 = torch.cat(mod_vector1, dim=1)
|
1456 |
+
|
1457 |
+
mod_vector_rgb = []
|
1458 |
+
mod_vector_rgb.append(ws[:, int(np.log2(res)) * 2 - 3])
|
1459 |
+
mod_vector_rgb.append(x_global.clone())
|
1460 |
+
mod_vector_rgb = torch.cat(mod_vector_rgb, dim=1)
|
1461 |
+
x, img = block(
|
1462 |
+
x,
|
1463 |
+
mask,
|
1464 |
+
feats,
|
1465 |
+
img,
|
1466 |
+
(mod_vector0, mod_vector1, mod_vector_rgb),
|
1467 |
+
fname=fname,
|
1468 |
+
**block_kwargs,
|
1469 |
+
)
|
1470 |
+
return img
|
1471 |
+
|
1472 |
+
|
1473 |
+
class MappingNetwork(torch.nn.Module):
|
1474 |
+
def __init__(
|
1475 |
+
self,
|
1476 |
+
z_dim, # Input latent (Z) dimensionality, 0 = no latent.
|
1477 |
+
c_dim, # Conditioning label (C) dimensionality, 0 = no label.
|
1478 |
+
w_dim, # Intermediate latent (W) dimensionality.
|
1479 |
+
num_ws, # Number of intermediate latents to output, None = do not broadcast.
|
1480 |
+
num_layers=8, # Number of mapping layers.
|
1481 |
+
embed_features=None, # Label embedding dimensionality, None = same as w_dim.
|
1482 |
+
layer_features=None, # Number of intermediate features in the mapping layers, None = same as w_dim.
|
1483 |
+
activation="lrelu", # Activation function: 'relu', 'lrelu', etc.
|
1484 |
+
lr_multiplier=0.01, # Learning rate multiplier for the mapping layers.
|
1485 |
+
w_avg_beta=0.995, # Decay for tracking the moving average of W during training, None = do not track.
|
1486 |
+
):
|
1487 |
+
super().__init__()
|
1488 |
+
self.z_dim = z_dim
|
1489 |
+
self.c_dim = c_dim
|
1490 |
+
self.w_dim = w_dim
|
1491 |
+
self.num_ws = num_ws
|
1492 |
+
self.num_layers = num_layers
|
1493 |
+
self.w_avg_beta = w_avg_beta
|
1494 |
+
|
1495 |
+
if embed_features is None:
|
1496 |
+
embed_features = w_dim
|
1497 |
+
if c_dim == 0:
|
1498 |
+
embed_features = 0
|
1499 |
+
if layer_features is None:
|
1500 |
+
layer_features = w_dim
|
1501 |
+
features_list = (
|
1502 |
+
[z_dim + embed_features] + [layer_features] * (num_layers - 1) + [w_dim]
|
1503 |
+
)
|
1504 |
+
|
1505 |
+
if c_dim > 0:
|
1506 |
+
self.embed = FullyConnectedLayer(c_dim, embed_features)
|
1507 |
+
for idx in range(num_layers):
|
1508 |
+
in_features = features_list[idx]
|
1509 |
+
out_features = features_list[idx + 1]
|
1510 |
+
layer = FullyConnectedLayer(
|
1511 |
+
in_features,
|
1512 |
+
out_features,
|
1513 |
+
activation=activation,
|
1514 |
+
lr_multiplier=lr_multiplier,
|
1515 |
+
)
|
1516 |
+
setattr(self, f"fc{idx}", layer)
|
1517 |
+
|
1518 |
+
if num_ws is not None and w_avg_beta is not None:
|
1519 |
+
self.register_buffer("w_avg", torch.zeros([w_dim]))
|
1520 |
+
|
1521 |
+
def forward(
|
1522 |
+
self, z, c, truncation_psi=1, truncation_cutoff=None, skip_w_avg_update=False
|
1523 |
+
):
|
1524 |
+
# Embed, normalize, and concat inputs.
|
1525 |
+
x = None
|
1526 |
+
with torch.autograd.profiler.record_function("input"):
|
1527 |
+
if self.z_dim > 0:
|
1528 |
+
x = normalize_2nd_moment(z.to(torch.float32))
|
1529 |
+
if self.c_dim > 0:
|
1530 |
+
y = normalize_2nd_moment(self.embed(c.to(torch.float32)))
|
1531 |
+
x = torch.cat([x, y], dim=1) if x is not None else y
|
1532 |
+
|
1533 |
+
# Main layers.
|
1534 |
+
for idx in range(self.num_layers):
|
1535 |
+
layer = getattr(self, f"fc{idx}")
|
1536 |
+
x = layer(x)
|
1537 |
+
|
1538 |
+
# Update moving average of W.
|
1539 |
+
if self.w_avg_beta is not None and self.training and not skip_w_avg_update:
|
1540 |
+
with torch.autograd.profiler.record_function("update_w_avg"):
|
1541 |
+
self.w_avg.copy_(
|
1542 |
+
x.detach().mean(dim=0).lerp(self.w_avg, self.w_avg_beta)
|
1543 |
+
)
|
1544 |
+
|
1545 |
+
# Broadcast.
|
1546 |
+
if self.num_ws is not None:
|
1547 |
+
with torch.autograd.profiler.record_function("broadcast"):
|
1548 |
+
x = x.unsqueeze(1).repeat([1, self.num_ws, 1])
|
1549 |
+
|
1550 |
+
# Apply truncation.
|
1551 |
+
if truncation_psi != 1:
|
1552 |
+
with torch.autograd.profiler.record_function("truncate"):
|
1553 |
+
assert self.w_avg_beta is not None
|
1554 |
+
if self.num_ws is None or truncation_cutoff is None:
|
1555 |
+
x = self.w_avg.lerp(x, truncation_psi)
|
1556 |
+
else:
|
1557 |
+
x[:, :truncation_cutoff] = self.w_avg.lerp(
|
1558 |
+
x[:, :truncation_cutoff], truncation_psi
|
1559 |
+
)
|
1560 |
+
return x
|
1561 |
+
|
1562 |
+
|
1563 |
+
class Generator(torch.nn.Module):
|
1564 |
+
def __init__(
|
1565 |
+
self,
|
1566 |
+
z_dim, # Input latent (Z) dimensionality.
|
1567 |
+
c_dim, # Conditioning label (C) dimensionality.
|
1568 |
+
w_dim, # Intermediate latent (W) dimensionality.
|
1569 |
+
img_resolution, # Output resolution.
|
1570 |
+
img_channels, # Number of output color channels.
|
1571 |
+
encoder_kwargs={}, # Arguments for EncoderNetwork.
|
1572 |
+
mapping_kwargs={}, # Arguments for MappingNetwork.
|
1573 |
+
synthesis_kwargs={}, # Arguments for SynthesisNetwork.
|
1574 |
+
):
|
1575 |
+
super().__init__()
|
1576 |
+
self.z_dim = z_dim
|
1577 |
+
self.c_dim = c_dim
|
1578 |
+
self.w_dim = w_dim
|
1579 |
+
self.img_resolution = img_resolution
|
1580 |
+
self.img_channels = img_channels
|
1581 |
+
self.encoder = EncoderNetwork(
|
1582 |
+
c_dim=c_dim,
|
1583 |
+
z_dim=z_dim,
|
1584 |
+
img_resolution=img_resolution,
|
1585 |
+
img_channels=img_channels,
|
1586 |
+
**encoder_kwargs,
|
1587 |
+
)
|
1588 |
+
self.synthesis = SynthesisNetwork(
|
1589 |
+
z_dim=z_dim,
|
1590 |
+
w_dim=w_dim,
|
1591 |
+
img_resolution=img_resolution,
|
1592 |
+
img_channels=img_channels,
|
1593 |
+
**synthesis_kwargs,
|
1594 |
+
)
|
1595 |
+
self.num_ws = self.synthesis.num_ws
|
1596 |
+
self.mapping = MappingNetwork(
|
1597 |
+
z_dim=z_dim, c_dim=c_dim, w_dim=w_dim, num_ws=self.num_ws, **mapping_kwargs
|
1598 |
+
)
|
1599 |
+
|
1600 |
+
def forward(
|
1601 |
+
self,
|
1602 |
+
img,
|
1603 |
+
c,
|
1604 |
+
fname=None,
|
1605 |
+
truncation_psi=1,
|
1606 |
+
truncation_cutoff=None,
|
1607 |
+
**synthesis_kwargs,
|
1608 |
+
):
|
1609 |
+
mask = img[:, -1].unsqueeze(1)
|
1610 |
+
x_global, z, feats = self.encoder(img, c)
|
1611 |
+
ws = self.mapping(
|
1612 |
+
z, c, truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff
|
1613 |
+
)
|
1614 |
+
img = self.synthesis(x_global, mask, feats, ws, fname=fname, **synthesis_kwargs)
|
1615 |
+
return img
|
1616 |
+
|
1617 |
+
|
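# --- Added note (not part of the original file) ---
# Generator.forward() treats the last channel of its input as the mask (img[:, -1]),
# and the encoder is built with img_channels + 1 inputs (see EncoderBlock.__init__).
# FcF.forward() further down packs the network input as
# torch.cat([0.5 - mask, erased_img], dim=1): a 4-channel tensor combining one
# mask-derived channel with the 3 masked RGB channels.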
1618 |
+
FCF_MODEL_URL = os.environ.get(
|
1619 |
+
"FCF_MODEL_URL",
|
1620 |
+
"https://github.com/Sanster/models/releases/download/add_fcf/places_512_G.pth",
|
1621 |
+
)
|
1622 |
+
FCF_MODEL_MD5 = os.environ.get("FCF_MODEL_MD5", "3323152bc01bf1c56fd8aba74435a211")
|
1623 |
+
|
1624 |
+
|
1625 |
+
class FcF(InpaintModel):
|
1626 |
+
name = "fcf"
|
1627 |
+
min_size = 512
|
1628 |
+
pad_mod = 512
|
1629 |
+
pad_to_square = True
|
1630 |
+
|
1631 |
+
def init_model(self, device, **kwargs):
|
1632 |
+
seed = 0
|
1633 |
+
random.seed(seed)
|
1634 |
+
np.random.seed(seed)
|
1635 |
+
torch.manual_seed(seed)
|
1636 |
+
torch.cuda.manual_seed_all(seed)
|
1637 |
+
torch.backends.cudnn.deterministic = True
|
1638 |
+
torch.backends.cudnn.benchmark = False
|
1639 |
+
|
1640 |
+
kwargs = {
|
1641 |
+
"channel_base": 1 * 32768,
|
1642 |
+
"channel_max": 512,
|
1643 |
+
"num_fp16_res": 4,
|
1644 |
+
"conv_clamp": 256,
|
1645 |
+
}
|
1646 |
+
G = Generator(
|
1647 |
+
z_dim=512,
|
1648 |
+
c_dim=0,
|
1649 |
+
w_dim=512,
|
1650 |
+
img_resolution=512,
|
1651 |
+
img_channels=3,
|
1652 |
+
synthesis_kwargs=kwargs,
|
1653 |
+
encoder_kwargs=kwargs,
|
1654 |
+
mapping_kwargs={"num_layers": 2},
|
1655 |
+
)
|
1656 |
+
self.model = load_model(G, FCF_MODEL_URL, device, FCF_MODEL_MD5)
|
1657 |
+
self.label = torch.zeros([1, self.model.c_dim], device=device)
|
1658 |
+
|
1659 |
+
@staticmethod
|
1660 |
+
def is_downloaded() -> bool:
|
1661 |
+
return os.path.exists(get_cache_path_by_url(FCF_MODEL_URL))
|
1662 |
+
|
1663 |
+
@torch.no_grad()
|
1664 |
+
def __call__(self, image, mask, config: Config):
|
1665 |
+
"""
|
1666 |
+
images: [H, W, C] RGB, not normalized
|
1667 |
+
masks: [H, W]
|
1668 |
+
return: BGR IMAGE
|
1669 |
+
"""
|
1670 |
+
if image.shape[0] == 512 and image.shape[1] == 512:
|
1671 |
+
return self._pad_forward(image, mask, config)
|
1672 |
+
|
1673 |
+
boxes = boxes_from_mask(mask)
|
1674 |
+
crop_result = []
|
1675 |
+
config.hd_strategy_crop_margin = 128
|
1676 |
+
for box in boxes:
|
1677 |
+
crop_image, crop_mask, crop_box = self._crop_box(image, mask, box, config)
|
1678 |
+
origin_size = crop_image.shape[:2]
|
1679 |
+
resize_image = resize_max_size(crop_image, size_limit=512)
|
1680 |
+
resize_mask = resize_max_size(crop_mask, size_limit=512)
|
1681 |
+
inpaint_result = self._pad_forward(resize_image, resize_mask, config)
|
1682 |
+
|
1683 |
+
# only paste masked area result
|
1684 |
+
inpaint_result = cv2.resize(
|
1685 |
+
inpaint_result,
|
1686 |
+
(origin_size[1], origin_size[0]),
|
1687 |
+
interpolation=cv2.INTER_CUBIC,
|
1688 |
+
)
|
1689 |
+
|
1690 |
+
original_pixel_indices = crop_mask < 127
|
1691 |
+
inpaint_result[original_pixel_indices] = crop_image[:, :, ::-1][
|
1692 |
+
original_pixel_indices
|
1693 |
+
]
|
1694 |
+
|
1695 |
+
crop_result.append((inpaint_result, crop_box))
|
1696 |
+
|
1697 |
+
inpaint_result = image[:, :, ::-1]
|
1698 |
+
for crop_image, crop_box in crop_result:
|
1699 |
+
x1, y1, x2, y2 = crop_box
|
1700 |
+
inpaint_result[y1:y2, x1:x2, :] = crop_image
|
1701 |
+
|
1702 |
+
return inpaint_result
|
1703 |
+
|
1704 |
+
def forward(self, image, mask, config: Config):
|
1705 |
+
"""Input images and output images have same size
|
1706 |
+
images: [H, W, C] RGB
|
1707 |
+
masks: [H, W] mask area == 255
|
1708 |
+
return: BGR IMAGE
|
1709 |
+
"""
|
1710 |
+
|
1711 |
+
image = norm_img(image) # [0, 1]
|
1712 |
+
image = image * 2 - 1 # [0, 1] -> [-1, 1]
|
1713 |
+
mask = (mask > 120) * 255
|
1714 |
+
mask = norm_img(mask)
|
1715 |
+
|
1716 |
+
image = torch.from_numpy(image).unsqueeze(0).to(self.device)
|
1717 |
+
mask = torch.from_numpy(mask).unsqueeze(0).to(self.device)
|
1718 |
+
|
1719 |
+
erased_img = image * (1 - mask)
|
1720 |
+
input_image = torch.cat([0.5 - mask, erased_img], dim=1)
|
1721 |
+
|
1722 |
+
output = self.model(
|
1723 |
+
input_image, self.label, truncation_psi=0.1, noise_mode="none"
|
1724 |
+
)
|
1725 |
+
output = (
|
1726 |
+
(output.permute(0, 2, 3, 1) * 127.5 + 127.5)
|
1727 |
+
.round()
|
1728 |
+
.clamp(0, 255)
|
1729 |
+
.to(torch.uint8)
|
1730 |
+
)
|
1731 |
+
output = output[0].cpu().numpy()
|
1732 |
+
cur_res = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
|
1733 |
+
return cur_res
|
lama_cleaner/model/instruct_pix2pix.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import PIL.Image
|
2 |
+
import cv2
|
3 |
+
import torch
|
4 |
+
from loguru import logger
|
5 |
+
|
6 |
+
from lama_cleaner.model.base import DiffusionInpaintModel
|
7 |
+
from lama_cleaner.model.utils import set_seed
|
8 |
+
from lama_cleaner.schema import Config
|
9 |
+
|
10 |
+
|
11 |
+
class InstructPix2Pix(DiffusionInpaintModel):
|
12 |
+
name = "instruct_pix2pix"
|
13 |
+
pad_mod = 8
|
14 |
+
min_size = 512
|
15 |
+
|
16 |
+
def init_model(self, device: torch.device, **kwargs):
|
17 |
+
from diffusers import StableDiffusionInstructPix2PixPipeline
|
18 |
+
fp16 = not kwargs.get('no_half', False)
|
19 |
+
|
20 |
+
model_kwargs = {"local_files_only": kwargs.get('local_files_only', False)}
|
21 |
+
if kwargs['disable_nsfw'] or kwargs.get('cpu_offload', False):
|
22 |
+
logger.info("Disable Stable Diffusion Model NSFW checker")
|
23 |
+
model_kwargs.update(dict(
|
24 |
+
safety_checker=None,
|
25 |
+
feature_extractor=None,
|
26 |
+
requires_safety_checker=False
|
27 |
+
))
|
28 |
+
|
29 |
+
use_gpu = device == torch.device('cuda') and torch.cuda.is_available()
|
30 |
+
torch_dtype = torch.float16 if use_gpu and fp16 else torch.float32
|
31 |
+
self.model = StableDiffusionInstructPix2PixPipeline.from_pretrained(
|
32 |
+
"timbrooks/instruct-pix2pix",
|
33 |
+
revision="fp16" if use_gpu and fp16 else "main",
|
34 |
+
torch_dtype=torch_dtype,
|
35 |
+
**model_kwargs
|
36 |
+
)
|
37 |
+
|
38 |
+
self.model.enable_attention_slicing()
|
39 |
+
if kwargs.get('enable_xformers', False):
|
40 |
+
self.model.enable_xformers_memory_efficient_attention()
|
41 |
+
|
42 |
+
if kwargs.get('cpu_offload', False) and use_gpu:
|
43 |
+
logger.info("Enable sequential cpu offload")
|
44 |
+
self.model.enable_sequential_cpu_offload(gpu_id=0)
|
45 |
+
else:
|
46 |
+
self.model = self.model.to(device)
|
47 |
+
|
48 |
+
def forward(self, image, mask, config: Config):
|
49 |
+
"""Input image and output image have same size
|
50 |
+
image: [H, W, C] RGB
|
51 |
+
mask: [H, W, 1] 255 means area to repaint
|
52 |
+
return: BGR IMAGE
|
53 |
+
edit = pipe(prompt, image=image, num_inference_steps=20, image_guidance_scale=1.5, guidance_scale=7).images[0]
|
54 |
+
"""
|
55 |
+
output = self.model(
|
56 |
+
image=PIL.Image.fromarray(image),
|
57 |
+
prompt=config.prompt,
|
58 |
+
negative_prompt=config.negative_prompt,
|
59 |
+
num_inference_steps=config.p2p_steps,
|
60 |
+
image_guidance_scale=config.p2p_image_guidance_scale,
|
61 |
+
guidance_scale=config.p2p_guidance_scale,
|
62 |
+
output_type="np.array",
|
63 |
+
generator=torch.manual_seed(config.sd_seed)
|
64 |
+
).images[0]
|
65 |
+
|
66 |
+
output = (output * 255).round().astype("uint8")
|
67 |
+
output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
|
68 |
+
return output
|
69 |
+
|
70 |
+
#
|
71 |
+
# def forward_post_process(self, result, image, mask, config):
|
72 |
+
# if config.sd_match_histograms:
|
73 |
+
# result = self._match_histograms(result, image[:, :, ::-1], mask)
|
74 |
+
#
|
75 |
+
# if config.sd_mask_blur != 0:
|
76 |
+
# k = 2 * config.sd_mask_blur + 1
|
77 |
+
# mask = cv2.GaussianBlur(mask, (k, k), 0)
|
78 |
+
# return result, image, mask
|
79 |
+
|
80 |
+
@staticmethod
|
81 |
+
def is_downloaded() -> bool:
|
82 |
+
# model will be downloaded when app start, and can't switch in frontend settings
|
83 |
+
return True
|
lama_cleaner/model/lama.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import cv2
|
4 |
+
import numpy as np
|
5 |
+
import torch
|
6 |
+
|
7 |
+
from lama_cleaner.helper import (
|
8 |
+
norm_img,
|
9 |
+
get_cache_path_by_url,
|
10 |
+
load_jit_model,
|
11 |
+
)
|
12 |
+
from lama_cleaner.model.base import InpaintModel
|
13 |
+
from lama_cleaner.schema import Config
|
14 |
+
|
15 |
+
LAMA_MODEL_URL = os.environ.get(
|
16 |
+
"LAMA_MODEL_URL",
|
17 |
+
"https://github.com/Sanster/models/releases/download/add_big_lama/big-lama.pt",
|
18 |
+
)
|
19 |
+
LAMA_MODEL_MD5 = os.environ.get("LAMA_MODEL_MD5", "e3aa4aaa15225a33ec84f9f4bc47e500")
|
20 |
+
|
21 |
+
|
22 |
+
class LaMa(InpaintModel):
|
23 |
+
name = "lama"
|
24 |
+
pad_mod = 8
|
25 |
+
|
26 |
+
def init_model(self, device, **kwargs):
|
27 |
+
self.model = load_jit_model(LAMA_MODEL_URL, device, LAMA_MODEL_MD5).eval()
|
28 |
+
|
29 |
+
@staticmethod
|
30 |
+
def is_downloaded() -> bool:
|
31 |
+
return os.path.exists(get_cache_path_by_url(LAMA_MODEL_URL))
|
32 |
+
|
33 |
+
def forward(self, image, mask, config: Config):
|
34 |
+
"""Input image and output image have same size
|
35 |
+
image: [H, W, C] RGB
|
36 |
+
mask: [H, W]
|
37 |
+
return: BGR IMAGE
|
38 |
+
"""
|
39 |
+
image = norm_img(image)
|
40 |
+
mask = norm_img(mask)
|
41 |
+
|
42 |
+
mask = (mask > 0) * 1
|
43 |
+
image = torch.from_numpy(image).unsqueeze(0).to(self.device)
|
44 |
+
mask = torch.from_numpy(mask).unsqueeze(0).to(self.device)
|
45 |
+
|
46 |
+
inpainted_image = self.model(image, mask)
|
47 |
+
|
48 |
+
cur_res = inpainted_image[0].permute(1, 2, 0).detach().cpu().numpy()
|
49 |
+
cur_res = np.clip(cur_res * 255, 0, 255).astype("uint8")
|
50 |
+
cur_res = cv2.cvtColor(cur_res, cv2.COLOR_RGB2BGR)
|
51 |
+
return cur_res
|
lama_cleaner/model/ldm.py
ADDED
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from functools import wraps
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
import torch
|
6 |
+
import torch.nn as nn
|
7 |
+
|
8 |
+
from lama_cleaner.helper import get_cache_path_by_url, load_jit_model, norm_img
|
9 |
+
from lama_cleaner.model.base import InpaintModel
|
10 |
+
from lama_cleaner.model.ddim_sampler import DDIMSampler
|
11 |
+
from lama_cleaner.model.plms_sampler import PLMSSampler
|
12 |
+
from lama_cleaner.model.utils import make_beta_schedule, timestep_embedding
|
13 |
+
from lama_cleaner.schema import Config, LDMSampler
|
14 |
+
|
15 |
+
# torch.manual_seed(42)
|
16 |
+
|
17 |
+
|
18 |
+
def conditional_autocast(func):
|
19 |
+
@wraps(func)
|
20 |
+
def wrapper(*args, **kwargs):
|
21 |
+
if torch.cuda.is_available():
|
22 |
+
with torch.cuda.amp.autocast():
|
23 |
+
return func(*args, **kwargs)
|
24 |
+
else:
|
25 |
+
return func(*args, **kwargs)
|
26 |
+
return wrapper
|
27 |
+
|
28 |
+
|
29 |
+
LDM_ENCODE_MODEL_URL = os.environ.get(
|
30 |
+
"LDM_ENCODE_MODEL_URL",
|
31 |
+
"https://github.com/Sanster/models/releases/download/add_ldm/cond_stage_model_encode.pt",
|
32 |
+
)
|
33 |
+
LDM_ENCODE_MODEL_MD5 = os.environ.get(
|
34 |
+
"LDM_ENCODE_MODEL_MD5", "23239fc9081956a3e70de56472b3f296"
|
35 |
+
)
|
36 |
+
|
37 |
+
LDM_DECODE_MODEL_URL = os.environ.get(
|
38 |
+
"LDM_DECODE_MODEL_URL",
|
39 |
+
"https://github.com/Sanster/models/releases/download/add_ldm/cond_stage_model_decode.pt",
|
40 |
+
)
|
41 |
+
LDM_DECODE_MODEL_MD5 = os.environ.get(
|
42 |
+
"LDM_DECODE_MODEL_MD5", "fe419cd15a750d37a4733589d0d3585c"
|
43 |
+
)
|
44 |
+
|
45 |
+
LDM_DIFFUSION_MODEL_URL = os.environ.get(
|
46 |
+
"LDM_DIFFUSION_MODEL_URL",
|
47 |
+
"https://github.com/Sanster/models/releases/download/add_ldm/diffusion.pt",
|
48 |
+
)
|
49 |
+
|
50 |
+
LDM_DIFFUSION_MODEL_MD5 = os.environ.get(
|
51 |
+
"LDM_DIFFUSION_MODEL_MD5", "b0afda12bf790c03aba2a7431f11d22d"
|
52 |
+
)
|
53 |
+
|
54 |
+
|
55 |
+
class DDPM(nn.Module):
|
56 |
+
# classic DDPM with Gaussian diffusion, in image space
|
57 |
+
def __init__(
|
58 |
+
self,
|
59 |
+
device,
|
60 |
+
timesteps=1000,
|
61 |
+
beta_schedule="linear",
|
62 |
+
linear_start=0.0015,
|
63 |
+
linear_end=0.0205,
|
64 |
+
cosine_s=0.008,
|
65 |
+
original_elbo_weight=0.0,
|
66 |
+
v_posterior=0.0, # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta
|
67 |
+
l_simple_weight=1.0,
|
68 |
+
parameterization="eps", # all assuming fixed variance schedules
|
69 |
+
use_positional_encodings=False,
|
70 |
+
):
|
71 |
+
super().__init__()
|
72 |
+
self.device = device
|
73 |
+
self.parameterization = parameterization
|
74 |
+
self.use_positional_encodings = use_positional_encodings
|
75 |
+
|
76 |
+
self.v_posterior = v_posterior
|
77 |
+
self.original_elbo_weight = original_elbo_weight
|
78 |
+
self.l_simple_weight = l_simple_weight
|
79 |
+
|
80 |
+
self.register_schedule(
|
81 |
+
beta_schedule=beta_schedule,
|
82 |
+
timesteps=timesteps,
|
83 |
+
linear_start=linear_start,
|
84 |
+
linear_end=linear_end,
|
85 |
+
cosine_s=cosine_s,
|
86 |
+
)
|
87 |
+
|
88 |
+
def register_schedule(
|
89 |
+
self,
|
90 |
+
given_betas=None,
|
91 |
+
beta_schedule="linear",
|
92 |
+
timesteps=1000,
|
93 |
+
linear_start=1e-4,
|
94 |
+
linear_end=2e-2,
|
95 |
+
cosine_s=8e-3,
|
96 |
+
):
|
97 |
+
betas = make_beta_schedule(
|
98 |
+
self.device,
|
99 |
+
beta_schedule,
|
100 |
+
timesteps,
|
101 |
+
linear_start=linear_start,
|
102 |
+
linear_end=linear_end,
|
103 |
+
cosine_s=cosine_s,
|
104 |
+
)
|
105 |
+
alphas = 1.0 - betas
|
106 |
+
alphas_cumprod = np.cumprod(alphas, axis=0)
|
107 |
+
alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1])
|
108 |
+
|
109 |
+
(timesteps,) = betas.shape
|
110 |
+
self.num_timesteps = int(timesteps)
|
111 |
+
self.linear_start = linear_start
|
112 |
+
self.linear_end = linear_end
|
113 |
+
assert (
|
114 |
+
alphas_cumprod.shape[0] == self.num_timesteps
|
115 |
+
), "alphas have to be defined for each timestep"
|
116 |
+
|
117 |
+
def to_torch(x): return torch.tensor(x, dtype=torch.float32).to(self.device)
|
118 |
+
|
119 |
+
self.register_buffer("betas", to_torch(betas))
|
120 |
+
self.register_buffer("alphas_cumprod", to_torch(alphas_cumprod))
|
121 |
+
self.register_buffer("alphas_cumprod_prev", to_torch(alphas_cumprod_prev))
|
122 |
+
|
123 |
+
# calculations for diffusion q(x_t | x_{t-1}) and others
|
124 |
+
self.register_buffer("sqrt_alphas_cumprod", to_torch(np.sqrt(alphas_cumprod)))
|
125 |
+
self.register_buffer(
|
126 |
+
"sqrt_one_minus_alphas_cumprod", to_torch(np.sqrt(1.0 - alphas_cumprod))
|
127 |
+
)
|
128 |
+
self.register_buffer(
|
129 |
+
"log_one_minus_alphas_cumprod", to_torch(np.log(1.0 - alphas_cumprod))
|
130 |
+
)
|
131 |
+
self.register_buffer(
|
132 |
+
"sqrt_recip_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod))
|
133 |
+
)
|
134 |
+
self.register_buffer(
|
135 |
+
"sqrt_recipm1_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod - 1))
|
136 |
+
)
|
137 |
+
|
138 |
+
# calculations for posterior q(x_{t-1} | x_t, x_0)
|
139 |
+
posterior_variance = (1 - self.v_posterior) * betas * (
|
140 |
+
1.0 - alphas_cumprod_prev
|
141 |
+
) / (1.0 - alphas_cumprod) + self.v_posterior * betas
|
142 |
+
# above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
|
143 |
+
self.register_buffer("posterior_variance", to_torch(posterior_variance))
|
144 |
+
# below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
|
145 |
+
self.register_buffer(
|
146 |
+
"posterior_log_variance_clipped",
|
147 |
+
to_torch(np.log(np.maximum(posterior_variance, 1e-20))),
|
148 |
+
)
|
149 |
+
self.register_buffer(
|
150 |
+
"posterior_mean_coef1",
|
151 |
+
to_torch(betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod)),
|
152 |
+
)
|
153 |
+
self.register_buffer(
|
154 |
+
"posterior_mean_coef2",
|
155 |
+
to_torch(
|
156 |
+
(1.0 - alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - alphas_cumprod)
|
157 |
+
),
|
158 |
+
)
|
159 |
+
|
160 |
+
if self.parameterization == "eps":
|
161 |
+
lvlb_weights = self.betas**2 / (
|
162 |
+
2
|
163 |
+
* self.posterior_variance
|
164 |
+
* to_torch(alphas)
|
165 |
+
* (1 - self.alphas_cumprod)
|
166 |
+
)
|
167 |
+
elif self.parameterization == "x0":
|
168 |
+
lvlb_weights = (
|
169 |
+
0.5
|
170 |
+
* np.sqrt(torch.Tensor(alphas_cumprod))
|
171 |
+
/ (2.0 * 1 - torch.Tensor(alphas_cumprod))
|
172 |
+
)
|
173 |
+
else:
|
174 |
+
raise NotImplementedError("mu not supported")
|
175 |
+
# TODO how to choose this term
|
176 |
+
lvlb_weights[0] = lvlb_weights[1]
|
177 |
+
self.register_buffer("lvlb_weights", lvlb_weights, persistent=False)
|
178 |
+
assert not torch.isnan(self.lvlb_weights).all()
|
179 |
+
|
180 |
+
|
181 |
+
class LatentDiffusion(DDPM):
|
182 |
+
def __init__(
|
183 |
+
self,
|
184 |
+
diffusion_model,
|
185 |
+
device,
|
186 |
+
cond_stage_key="image",
|
187 |
+
cond_stage_trainable=False,
|
188 |
+
concat_mode=True,
|
189 |
+
scale_factor=1.0,
|
190 |
+
scale_by_std=False,
|
191 |
+
*args,
|
192 |
+
**kwargs,
|
193 |
+
):
|
194 |
+
self.num_timesteps_cond = 1
|
195 |
+
self.scale_by_std = scale_by_std
|
196 |
+
super().__init__(device, *args, **kwargs)
|
197 |
+
self.diffusion_model = diffusion_model
|
198 |
+
self.concat_mode = concat_mode
|
199 |
+
self.cond_stage_trainable = cond_stage_trainable
|
200 |
+
self.cond_stage_key = cond_stage_key
|
201 |
+
self.num_downs = 2
|
202 |
+
self.scale_factor = scale_factor
|
203 |
+
|
204 |
+
def make_cond_schedule(
|
205 |
+
self,
|
206 |
+
):
|
207 |
+
self.cond_ids = torch.full(
|
208 |
+
size=(self.num_timesteps,),
|
209 |
+
fill_value=self.num_timesteps - 1,
|
210 |
+
dtype=torch.long,
|
211 |
+
)
|
212 |
+
ids = torch.round(
|
213 |
+
torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)
|
214 |
+
).long()
|
215 |
+
self.cond_ids[: self.num_timesteps_cond] = ids
|
216 |
+
|
217 |
+
def register_schedule(
|
218 |
+
self,
|
219 |
+
given_betas=None,
|
220 |
+
beta_schedule="linear",
|
221 |
+
timesteps=1000,
|
222 |
+
linear_start=1e-4,
|
223 |
+
linear_end=2e-2,
|
224 |
+
cosine_s=8e-3,
|
225 |
+
):
|
226 |
+
super().register_schedule(
|
227 |
+
given_betas, beta_schedule, timesteps, linear_start, linear_end, cosine_s
|
228 |
+
)
|
229 |
+
|
230 |
+
self.shorten_cond_schedule = self.num_timesteps_cond > 1
|
231 |
+
if self.shorten_cond_schedule:
|
232 |
+
self.make_cond_schedule()
|
233 |
+
|
234 |
+
def apply_model(self, x_noisy, t, cond):
|
235 |
+
# x_recon = self.model(x_noisy, t, cond['c_concat'][0]) # cond['c_concat'][0].shape 1,4,128,128
|
236 |
+
t_emb = timestep_embedding(x_noisy.device, t, 256, repeat_only=False)
|
237 |
+
x_recon = self.diffusion_model(x_noisy, t_emb, cond)
|
238 |
+
return x_recon
|
239 |
+
|
240 |
+
|
241 |
+
class LDM(InpaintModel):
|
242 |
+
name = "ldm"
|
243 |
+
pad_mod = 32
|
244 |
+
|
245 |
+
def __init__(self, device, fp16: bool = True, **kwargs):
|
246 |
+
self.fp16 = fp16
|
247 |
+
super().__init__(device)
|
248 |
+
self.device = device
|
249 |
+
|
250 |
+
def init_model(self, device, **kwargs):
|
251 |
+
self.diffusion_model = load_jit_model(
|
252 |
+
LDM_DIFFUSION_MODEL_URL, device, LDM_DIFFUSION_MODEL_MD5
|
253 |
+
)
|
254 |
+
self.cond_stage_model_decode = load_jit_model(
|
255 |
+
LDM_DECODE_MODEL_URL, device, LDM_DECODE_MODEL_MD5
|
256 |
+
)
|
257 |
+
self.cond_stage_model_encode = load_jit_model(
|
258 |
+
LDM_ENCODE_MODEL_URL, device, LDM_ENCODE_MODEL_MD5
|
259 |
+
)
|
260 |
+
if self.fp16 and "cuda" in str(device):
|
261 |
+
self.diffusion_model = self.diffusion_model.half()
|
262 |
+
self.cond_stage_model_decode = self.cond_stage_model_decode.half()
|
263 |
+
self.cond_stage_model_encode = self.cond_stage_model_encode.half()
|
264 |
+
|
265 |
+
self.model = LatentDiffusion(self.diffusion_model, device)
|
266 |
+
|
267 |
+
@staticmethod
|
268 |
+
def is_downloaded() -> bool:
|
269 |
+
model_paths = [
|
270 |
+
get_cache_path_by_url(LDM_DIFFUSION_MODEL_URL),
|
271 |
+
get_cache_path_by_url(LDM_DECODE_MODEL_URL),
|
272 |
+
get_cache_path_by_url(LDM_ENCODE_MODEL_URL),
|
273 |
+
]
|
274 |
+
return all([os.path.exists(it) for it in model_paths])
|
275 |
+
|
276 |
+
@conditional_autocast
|
277 |
+
def forward(self, image, mask, config: Config):
|
278 |
+
"""
|
279 |
+
image: [H, W, C] RGB
|
280 |
+
mask: [H, W, 1]
|
281 |
+
return: BGR IMAGE
|
282 |
+
"""
|
283 |
+
# image [1,3,512,512] float32
|
284 |
+
# mask: [1,1,512,512] float32
|
285 |
+
# masked_image: [1,3,512,512] float32
|
286 |
+
if config.ldm_sampler == LDMSampler.ddim:
|
287 |
+
sampler = DDIMSampler(self.model)
|
288 |
+
elif config.ldm_sampler == LDMSampler.plms:
|
289 |
+
sampler = PLMSSampler(self.model)
|
290 |
+
else:
|
291 |
+
raise ValueError()
|
292 |
+
|
293 |
+
steps = config.ldm_steps
|
294 |
+
image = norm_img(image)
|
295 |
+
mask = norm_img(mask)
|
296 |
+
|
297 |
+
mask[mask < 0.5] = 0
|
298 |
+
mask[mask >= 0.5] = 1
|
299 |
+
|
300 |
+
image = torch.from_numpy(image).unsqueeze(0).to(self.device)
|
301 |
+
mask = torch.from_numpy(mask).unsqueeze(0).to(self.device)
|
302 |
+
masked_image = (1 - mask) * image
|
303 |
+
|
304 |
+
mask = self._norm(mask)
|
305 |
+
masked_image = self._norm(masked_image)
|
306 |
+
|
307 |
+
c = self.cond_stage_model_encode(masked_image)
|
308 |
+
torch.cuda.empty_cache()
|
309 |
+
|
310 |
+
cc = torch.nn.functional.interpolate(mask, size=c.shape[-2:]) # 1,1,128,128
|
311 |
+
c = torch.cat((c, cc), dim=1) # 1,4,128,128
|
312 |
+
|
313 |
+
shape = (c.shape[1] - 1,) + c.shape[2:]
|
314 |
+
samples_ddim = sampler.sample(
|
315 |
+
steps=steps, conditioning=c, batch_size=c.shape[0], shape=shape
|
316 |
+
)
|
317 |
+
torch.cuda.empty_cache()
|
318 |
+
x_samples_ddim = self.cond_stage_model_decode(
|
319 |
+
samples_ddim
|
320 |
+
) # samples_ddim: 1, 3, 128, 128 float32
|
321 |
+
torch.cuda.empty_cache()
|
322 |
+
|
323 |
+
# image = torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)
|
324 |
+
# mask = torch.clamp((mask + 1.0) / 2.0, min=0.0, max=1.0)
|
325 |
+
inpainted_image = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
|
326 |
+
|
327 |
+
# inpainted = (1 - mask) * image + mask * predicted_image
|
328 |
+
inpainted_image = inpainted_image.cpu().numpy().transpose(0, 2, 3, 1)[0] * 255
|
329 |
+
inpainted_image = inpainted_image.astype(np.uint8)[:, :, ::-1]
|
330 |
+
return inpainted_image
|
331 |
+
|
332 |
+
def _norm(self, tensor):
|
333 |
+
return tensor * 2.0 - 1.0
|
lama_cleaner/model/manga.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import random
|
3 |
+
|
4 |
+
import cv2
|
5 |
+
import numpy as np
|
6 |
+
import torch
|
7 |
+
import time
|
8 |
+
from loguru import logger
|
9 |
+
|
10 |
+
from lama_cleaner.helper import get_cache_path_by_url, load_jit_model
|
11 |
+
from lama_cleaner.model.base import InpaintModel
|
12 |
+
from lama_cleaner.schema import Config
|
13 |
+
|
14 |
+
|
15 |
+
MANGA_INPAINTOR_MODEL_URL = os.environ.get(
|
16 |
+
"MANGA_INPAINTOR_MODEL_URL",
|
17 |
+
"https://github.com/Sanster/models/releases/download/manga/manga_inpaintor.jit",
|
18 |
+
)
|
19 |
+
MANGA_INPAINTOR_MODEL_MD5 = os.environ.get(
|
20 |
+
"MANGA_INPAINTOR_MODEL_MD5", "7d8b269c4613b6b3768af714610da86c"
|
21 |
+
)
|
22 |
+
|
23 |
+
MANGA_LINE_MODEL_URL = os.environ.get(
|
24 |
+
"MANGA_LINE_MODEL_URL",
|
25 |
+
"https://github.com/Sanster/models/releases/download/manga/erika.jit",
|
26 |
+
)
|
27 |
+
MANGA_LINE_MODEL_MD5 = os.environ.get(
|
28 |
+
"MANGA_LINE_MODEL_MD5", "0c926d5a4af8450b0d00bc5b9a095644"
|
29 |
+
)
|
30 |
+
|
31 |
+
|
32 |
+
class Manga(InpaintModel):
|
33 |
+
name = "manga"
|
34 |
+
pad_mod = 16
|
35 |
+
|
36 |
+
def init_model(self, device, **kwargs):
|
37 |
+
self.inpaintor_model = load_jit_model(
|
38 |
+
MANGA_INPAINTOR_MODEL_URL, device, MANGA_INPAINTOR_MODEL_MD5
|
39 |
+
)
|
40 |
+
self.line_model = load_jit_model(
|
41 |
+
MANGA_LINE_MODEL_URL, device, MANGA_LINE_MODEL_MD5
|
42 |
+
)
|
43 |
+
self.seed = 42
|
44 |
+
|
45 |
+
@staticmethod
|
46 |
+
def is_downloaded() -> bool:
|
47 |
+
model_paths = [
|
48 |
+
get_cache_path_by_url(MANGA_INPAINTOR_MODEL_URL),
|
49 |
+
get_cache_path_by_url(MANGA_LINE_MODEL_URL),
|
50 |
+
]
|
51 |
+
return all([os.path.exists(it) for it in model_paths])
|
52 |
+
|
53 |
+
def forward(self, image, mask, config: Config):
|
54 |
+
"""
|
55 |
+
image: [H, W, C] RGB
|
56 |
+
mask: [H, W, 1]
|
57 |
+
return: BGR IMAGE
|
58 |
+
"""
|
59 |
+
seed = self.seed
|
60 |
+
random.seed(seed)
|
61 |
+
np.random.seed(seed)
|
62 |
+
torch.manual_seed(seed)
|
63 |
+
torch.cuda.manual_seed_all(seed)
|
64 |
+
|
65 |
+
gray_img = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
|
66 |
+
gray_img = torch.from_numpy(
|
67 |
+
gray_img[np.newaxis, np.newaxis, :, :].astype(np.float32)
|
68 |
+
).to(self.device)
|
69 |
+
start = time.time()
|
70 |
+
lines = self.line_model(gray_img)
|
71 |
+
torch.cuda.empty_cache()
|
72 |
+
lines = torch.clamp(lines, 0, 255)
|
73 |
+
logger.info(f"erika_model time: {time.time() - start}")
|
74 |
+
|
75 |
+
mask = torch.from_numpy(mask[np.newaxis, :, :, :]).to(self.device)
|
76 |
+
mask = mask.permute(0, 3, 1, 2)
|
77 |
+
mask = torch.where(mask > 0.5, 1.0, 0.0)
|
78 |
+
noise = torch.randn_like(mask)
|
79 |
+
ones = torch.ones_like(mask)
|
80 |
+
|
81 |
+
gray_img = gray_img / 255 * 2 - 1.0
|
82 |
+
lines = lines / 255 * 2 - 1.0
|
83 |
+
|
84 |
+
start = time.time()
|
85 |
+
inpainted_image = self.inpaintor_model(gray_img, lines, mask, noise, ones)
|
86 |
+
logger.info(f"image_inpaintor_model time: {time.time() - start}")
|
87 |
+
|
88 |
+
cur_res = inpainted_image[0].permute(1, 2, 0).detach().cpu().numpy()
|
89 |
+
cur_res = (cur_res * 127.5 + 127.5).astype(np.uint8)
|
90 |
+
cur_res = cv2.cvtColor(cur_res, cv2.COLOR_GRAY2BGR)
|
91 |
+
return cur_res
|
lama_cleaner/model/mat.py
ADDED
@@ -0,0 +1,1935 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import random
|
3 |
+
|
4 |
+
import cv2
|
5 |
+
import numpy as np
|
6 |
+
import torch
|
7 |
+
import torch.nn as nn
|
8 |
+
import torch.nn.functional as F
|
9 |
+
import torch.utils.checkpoint as checkpoint
|
10 |
+
|
11 |
+
from lama_cleaner.helper import load_model, get_cache_path_by_url, norm_img
|
12 |
+
from lama_cleaner.model.base import InpaintModel
|
13 |
+
from lama_cleaner.model.utils import (
|
14 |
+
setup_filter,
|
15 |
+
Conv2dLayer,
|
16 |
+
FullyConnectedLayer,
|
17 |
+
conv2d_resample,
|
18 |
+
bias_act,
|
19 |
+
upsample2d,
|
20 |
+
activation_funcs,
|
21 |
+
MinibatchStdLayer,
|
22 |
+
to_2tuple,
|
23 |
+
normalize_2nd_moment,
|
24 |
+
set_seed,
|
25 |
+
)
|
26 |
+
from lama_cleaner.schema import Config
|
27 |
+
|
28 |
+
|
29 |
+
class ModulatedConv2d(nn.Module):
|
30 |
+
def __init__(
|
31 |
+
self,
|
32 |
+
in_channels, # Number of input channels.
|
33 |
+
out_channels, # Number of output channels.
|
34 |
+
kernel_size, # Width and height of the convolution kernel.
|
35 |
+
style_dim, # dimension of the style code
|
36 |
+
demodulate=True, # perfrom demodulation
|
37 |
+
up=1, # Integer upsampling factor.
|
38 |
+
down=1, # Integer downsampling factor.
|
39 |
+
resample_filter=[
|
40 |
+
1,
|
41 |
+
3,
|
42 |
+
3,
|
43 |
+
1,
|
44 |
+
], # Low-pass filter to apply when resampling activations.
|
45 |
+
conv_clamp=None, # Clamp the output to +-X, None = disable clamping.
|
46 |
+
):
|
47 |
+
super().__init__()
|
48 |
+
self.demodulate = demodulate
|
49 |
+
|
50 |
+
self.weight = torch.nn.Parameter(
|
51 |
+
torch.randn([1, out_channels, in_channels, kernel_size, kernel_size])
|
52 |
+
)
|
53 |
+
self.out_channels = out_channels
|
54 |
+
self.kernel_size = kernel_size
|
55 |
+
self.weight_gain = 1 / np.sqrt(in_channels * (kernel_size ** 2))
|
56 |
+
self.padding = self.kernel_size // 2
|
57 |
+
self.up = up
|
58 |
+
self.down = down
|
59 |
+
self.register_buffer("resample_filter", setup_filter(resample_filter))
|
60 |
+
self.conv_clamp = conv_clamp
|
61 |
+
|
62 |
+
self.affine = FullyConnectedLayer(style_dim, in_channels, bias_init=1)
|
63 |
+
|
64 |
+
def forward(self, x, style):
|
65 |
+
batch, in_channels, height, width = x.shape
|
66 |
+
style = self.affine(style).view(batch, 1, in_channels, 1, 1)
|
67 |
+
weight = self.weight * self.weight_gain * style
|
68 |
+
|
69 |
+
if self.demodulate:
|
70 |
+
decoefs = (weight.pow(2).sum(dim=[2, 3, 4]) + 1e-8).rsqrt()
|
71 |
+
weight = weight * decoefs.view(batch, self.out_channels, 1, 1, 1)
|
72 |
+
|
73 |
+
weight = weight.view(
|
74 |
+
batch * self.out_channels, in_channels, self.kernel_size, self.kernel_size
|
75 |
+
)
|
76 |
+
x = x.view(1, batch * in_channels, height, width)
|
77 |
+
x = conv2d_resample(
|
78 |
+
x=x,
|
79 |
+
w=weight,
|
80 |
+
f=self.resample_filter,
|
81 |
+
up=self.up,
|
82 |
+
down=self.down,
|
83 |
+
padding=self.padding,
|
84 |
+
groups=batch,
|
85 |
+
)
|
86 |
+
out = x.view(batch, self.out_channels, *x.shape[2:])
|
87 |
+
|
88 |
+
return out
|
89 |
+
|
90 |
+
|
91 |
+
class StyleConv(torch.nn.Module):
|
92 |
+
def __init__(
|
93 |
+
self,
|
94 |
+
in_channels, # Number of input channels.
|
95 |
+
out_channels, # Number of output channels.
|
96 |
+
style_dim, # Intermediate latent (W) dimensionality.
|
97 |
+
resolution, # Resolution of this layer.
|
98 |
+
kernel_size=3, # Convolution kernel size.
|
99 |
+
up=1, # Integer upsampling factor.
|
100 |
+
use_noise=False, # Enable noise input?
|
101 |
+
activation="lrelu", # Activation function: 'relu', 'lrelu', etc.
|
102 |
+
resample_filter=[
|
103 |
+
1,
|
104 |
+
3,
|
105 |
+
3,
|
106 |
+
1,
|
107 |
+
], # Low-pass filter to apply when resampling activations.
|
108 |
+
conv_clamp=None, # Clamp the output of convolution layers to +-X, None = disable clamping.
|
109 |
+
demodulate=True, # perform demodulation
|
110 |
+
):
|
111 |
+
super().__init__()
|
112 |
+
|
113 |
+
self.conv = ModulatedConv2d(
|
114 |
+
in_channels=in_channels,
|
115 |
+
out_channels=out_channels,
|
116 |
+
kernel_size=kernel_size,
|
117 |
+
style_dim=style_dim,
|
118 |
+
demodulate=demodulate,
|
119 |
+
up=up,
|
120 |
+
resample_filter=resample_filter,
|
121 |
+
conv_clamp=conv_clamp,
|
122 |
+
)
|
123 |
+
|
124 |
+
self.use_noise = use_noise
|
125 |
+
self.resolution = resolution
|
126 |
+
if use_noise:
|
127 |
+
self.register_buffer("noise_const", torch.randn([resolution, resolution]))
|
128 |
+
self.noise_strength = torch.nn.Parameter(torch.zeros([]))
|
129 |
+
|
130 |
+
self.bias = torch.nn.Parameter(torch.zeros([out_channels]))
|
131 |
+
self.activation = activation
|
132 |
+
self.act_gain = activation_funcs[activation].def_gain
|
133 |
+
self.conv_clamp = conv_clamp
|
134 |
+
|
135 |
+
def forward(self, x, style, noise_mode="random", gain=1):
|
136 |
+
x = self.conv(x, style)
|
137 |
+
|
138 |
+
assert noise_mode in ["random", "const", "none"]
|
139 |
+
|
140 |
+
if self.use_noise:
|
141 |
+
if noise_mode == "random":
|
142 |
+
xh, xw = x.size()[-2:]
|
143 |
+
noise = (
|
144 |
+
torch.randn([x.shape[0], 1, xh, xw], device=x.device)
|
145 |
+
* self.noise_strength
|
146 |
+
)
|
147 |
+
if noise_mode == "const":
|
148 |
+
noise = self.noise_const * self.noise_strength
|
149 |
+
x = x + noise
|
150 |
+
|
151 |
+
act_gain = self.act_gain * gain
|
152 |
+
act_clamp = self.conv_clamp * gain if self.conv_clamp is not None else None
|
153 |
+
out = bias_act(
|
154 |
+
x, self.bias, act=self.activation, gain=act_gain, clamp=act_clamp
|
155 |
+
)
|
156 |
+
|
157 |
+
return out
|
158 |
+
|
159 |
+
|
160 |
+
class ToRGB(torch.nn.Module):
|
161 |
+
def __init__(
|
162 |
+
self,
|
163 |
+
in_channels,
|
164 |
+
out_channels,
|
165 |
+
style_dim,
|
166 |
+
kernel_size=1,
|
167 |
+
resample_filter=[1, 3, 3, 1],
|
168 |
+
conv_clamp=None,
|
169 |
+
demodulate=False,
|
170 |
+
):
|
171 |
+
super().__init__()
|
172 |
+
|
173 |
+
self.conv = ModulatedConv2d(
|
174 |
+
in_channels=in_channels,
|
175 |
+
out_channels=out_channels,
|
176 |
+
kernel_size=kernel_size,
|
177 |
+
style_dim=style_dim,
|
178 |
+
demodulate=demodulate,
|
179 |
+
resample_filter=resample_filter,
|
180 |
+
conv_clamp=conv_clamp,
|
181 |
+
)
|
182 |
+
self.bias = torch.nn.Parameter(torch.zeros([out_channels]))
|
183 |
+
self.register_buffer("resample_filter", setup_filter(resample_filter))
|
184 |
+
self.conv_clamp = conv_clamp
|
185 |
+
|
186 |
+
def forward(self, x, style, skip=None):
|
187 |
+
x = self.conv(x, style)
|
188 |
+
out = bias_act(x, self.bias, clamp=self.conv_clamp)
|
189 |
+
|
190 |
+
if skip is not None:
|
191 |
+
if skip.shape != out.shape:
|
192 |
+
skip = upsample2d(skip, self.resample_filter)
|
193 |
+
out = out + skip
|
194 |
+
|
195 |
+
return out
|
196 |
+
|
197 |
+
|
198 |
+
def get_style_code(a, b):
|
199 |
+
return torch.cat([a, b], dim=1)
|
200 |
+
|
201 |
+
|
202 |
+
class DecBlockFirst(nn.Module):
|
203 |
+
def __init__(
|
204 |
+
self,
|
205 |
+
in_channels,
|
206 |
+
out_channels,
|
207 |
+
activation,
|
208 |
+
style_dim,
|
209 |
+
use_noise,
|
210 |
+
demodulate,
|
211 |
+
img_channels,
|
212 |
+
):
|
213 |
+
super().__init__()
|
214 |
+
self.fc = FullyConnectedLayer(
|
215 |
+
in_features=in_channels * 2,
|
216 |
+
out_features=in_channels * 4 ** 2,
|
217 |
+
activation=activation,
|
218 |
+
)
|
219 |
+
self.conv = StyleConv(
|
220 |
+
in_channels=in_channels,
|
221 |
+
out_channels=out_channels,
|
222 |
+
style_dim=style_dim,
|
223 |
+
resolution=4,
|
224 |
+
kernel_size=3,
|
225 |
+
use_noise=use_noise,
|
226 |
+
activation=activation,
|
227 |
+
demodulate=demodulate,
|
228 |
+
)
|
229 |
+
self.toRGB = ToRGB(
|
230 |
+
in_channels=out_channels,
|
231 |
+
out_channels=img_channels,
|
232 |
+
style_dim=style_dim,
|
233 |
+
kernel_size=1,
|
234 |
+
demodulate=False,
|
235 |
+
)
|
236 |
+
|
237 |
+
def forward(self, x, ws, gs, E_features, noise_mode="random"):
|
238 |
+
x = self.fc(x).view(x.shape[0], -1, 4, 4)
|
239 |
+
x = x + E_features[2]
|
240 |
+
style = get_style_code(ws[:, 0], gs)
|
241 |
+
x = self.conv(x, style, noise_mode=noise_mode)
|
242 |
+
style = get_style_code(ws[:, 1], gs)
|
243 |
+
img = self.toRGB(x, style, skip=None)
|
244 |
+
|
245 |
+
return x, img
|
246 |
+
|
247 |
+
|
248 |
+
class DecBlockFirstV2(nn.Module):
|
249 |
+
def __init__(
|
250 |
+
self,
|
251 |
+
in_channels,
|
252 |
+
out_channels,
|
253 |
+
activation,
|
254 |
+
style_dim,
|
255 |
+
use_noise,
|
256 |
+
demodulate,
|
257 |
+
img_channels,
|
258 |
+
):
|
259 |
+
super().__init__()
|
260 |
+
self.conv0 = Conv2dLayer(
|
261 |
+
in_channels=in_channels,
|
262 |
+
out_channels=in_channels,
|
263 |
+
kernel_size=3,
|
264 |
+
activation=activation,
|
265 |
+
)
|
266 |
+
self.conv1 = StyleConv(
|
267 |
+
in_channels=in_channels,
|
268 |
+
out_channels=out_channels,
|
269 |
+
style_dim=style_dim,
|
270 |
+
resolution=4,
|
271 |
+
kernel_size=3,
|
272 |
+
use_noise=use_noise,
|
273 |
+
activation=activation,
|
274 |
+
demodulate=demodulate,
|
275 |
+
)
|
276 |
+
self.toRGB = ToRGB(
|
277 |
+
in_channels=out_channels,
|
278 |
+
out_channels=img_channels,
|
279 |
+
style_dim=style_dim,
|
280 |
+
kernel_size=1,
|
281 |
+
demodulate=False,
|
282 |
+
)
|
283 |
+
|
284 |
+
def forward(self, x, ws, gs, E_features, noise_mode="random"):
|
285 |
+
# x = self.fc(x).view(x.shape[0], -1, 4, 4)
|
286 |
+
x = self.conv0(x)
|
287 |
+
x = x + E_features[2]
|
288 |
+
style = get_style_code(ws[:, 0], gs)
|
289 |
+
x = self.conv1(x, style, noise_mode=noise_mode)
|
290 |
+
style = get_style_code(ws[:, 1], gs)
|
291 |
+
img = self.toRGB(x, style, skip=None)
|
292 |
+
|
293 |
+
return x, img
|
294 |
+
|
295 |
+
|
296 |
+
class DecBlock(nn.Module):
|
297 |
+
def __init__(
|
298 |
+
self,
|
299 |
+
res,
|
300 |
+
in_channels,
|
301 |
+
out_channels,
|
302 |
+
activation,
|
303 |
+
style_dim,
|
304 |
+
use_noise,
|
305 |
+
demodulate,
|
306 |
+
img_channels,
|
307 |
+
): # res = 2, ..., resolution_log2
|
308 |
+
super().__init__()
|
309 |
+
self.res = res
|
310 |
+
|
311 |
+
self.conv0 = StyleConv(
|
312 |
+
in_channels=in_channels,
|
313 |
+
out_channels=out_channels,
|
314 |
+
style_dim=style_dim,
|
315 |
+
resolution=2 ** res,
|
316 |
+
kernel_size=3,
|
317 |
+
up=2,
|
318 |
+
use_noise=use_noise,
|
319 |
+
activation=activation,
|
320 |
+
demodulate=demodulate,
|
321 |
+
)
|
322 |
+
self.conv1 = StyleConv(
|
323 |
+
in_channels=out_channels,
|
324 |
+
out_channels=out_channels,
|
325 |
+
style_dim=style_dim,
|
326 |
+
resolution=2 ** res,
|
327 |
+
kernel_size=3,
|
328 |
+
use_noise=use_noise,
|
329 |
+
activation=activation,
|
330 |
+
demodulate=demodulate,
|
331 |
+
)
|
332 |
+
self.toRGB = ToRGB(
|
333 |
+
in_channels=out_channels,
|
334 |
+
out_channels=img_channels,
|
335 |
+
style_dim=style_dim,
|
336 |
+
kernel_size=1,
|
337 |
+
demodulate=False,
|
338 |
+
)
|
339 |
+
|
340 |
+
def forward(self, x, img, ws, gs, E_features, noise_mode="random"):
|
341 |
+
style = get_style_code(ws[:, self.res * 2 - 5], gs)
|
342 |
+
x = self.conv0(x, style, noise_mode=noise_mode)
|
343 |
+
x = x + E_features[self.res]
|
344 |
+
style = get_style_code(ws[:, self.res * 2 - 4], gs)
|
345 |
+
x = self.conv1(x, style, noise_mode=noise_mode)
|
346 |
+
style = get_style_code(ws[:, self.res * 2 - 3], gs)
|
347 |
+
img = self.toRGB(x, style, skip=img)
|
348 |
+
|
349 |
+
return x, img
|
350 |
+
|
351 |
+
|
352 |
+
class MappingNet(torch.nn.Module):
|
353 |
+
def __init__(
|
354 |
+
self,
|
355 |
+
z_dim, # Input latent (Z) dimensionality, 0 = no latent.
|
356 |
+
c_dim, # Conditioning label (C) dimensionality, 0 = no label.
|
357 |
+
w_dim, # Intermediate latent (W) dimensionality.
|
358 |
+
num_ws, # Number of intermediate latents to output, None = do not broadcast.
|
359 |
+
num_layers=8, # Number of mapping layers.
|
360 |
+
embed_features=None, # Label embedding dimensionality, None = same as w_dim.
|
361 |
+
layer_features=None, # Number of intermediate features in the mapping layers, None = same as w_dim.
|
362 |
+
activation="lrelu", # Activation function: 'relu', 'lrelu', etc.
|
363 |
+
lr_multiplier=0.01, # Learning rate multiplier for the mapping layers.
|
364 |
+
w_avg_beta=0.995, # Decay for tracking the moving average of W during training, None = do not track.
|
365 |
+
torch_dtype=torch.float32,
|
366 |
+
):
|
367 |
+
super().__init__()
|
368 |
+
self.z_dim = z_dim
|
369 |
+
self.c_dim = c_dim
|
370 |
+
self.w_dim = w_dim
|
371 |
+
self.num_ws = num_ws
|
372 |
+
self.num_layers = num_layers
|
373 |
+
self.w_avg_beta = w_avg_beta
|
374 |
+
self.torch_dtype = torch_dtype
|
375 |
+
|
376 |
+
if embed_features is None:
|
377 |
+
embed_features = w_dim
|
378 |
+
if c_dim == 0:
|
379 |
+
embed_features = 0
|
380 |
+
if layer_features is None:
|
381 |
+
layer_features = w_dim
|
382 |
+
features_list = (
|
383 |
+
[z_dim + embed_features] + [layer_features] * (num_layers - 1) + [w_dim]
|
384 |
+
)
|
385 |
+
|
386 |
+
if c_dim > 0:
|
387 |
+
self.embed = FullyConnectedLayer(c_dim, embed_features)
|
388 |
+
for idx in range(num_layers):
|
389 |
+
in_features = features_list[idx]
|
390 |
+
out_features = features_list[idx + 1]
|
391 |
+
layer = FullyConnectedLayer(
|
392 |
+
in_features,
|
393 |
+
out_features,
|
394 |
+
activation=activation,
|
395 |
+
lr_multiplier=lr_multiplier,
|
396 |
+
)
|
397 |
+
setattr(self, f"fc{idx}", layer)
|
398 |
+
|
399 |
+
if num_ws is not None and w_avg_beta is not None:
|
400 |
+
self.register_buffer("w_avg", torch.zeros([w_dim]))
|
401 |
+
|
402 |
+
def forward(
|
403 |
+
self, z, c, truncation_psi=1, truncation_cutoff=None, skip_w_avg_update=False
|
404 |
+
):
|
405 |
+
# Embed, normalize, and concat inputs.
|
406 |
+
x = None
|
407 |
+
if self.z_dim > 0:
|
408 |
+
x = normalize_2nd_moment(z)
|
409 |
+
if self.c_dim > 0:
|
410 |
+
y = normalize_2nd_moment(self.embed(c))
|
411 |
+
x = torch.cat([x, y], dim=1) if x is not None else y
|
412 |
+
|
413 |
+
# Main layers.
|
414 |
+
for idx in range(self.num_layers):
|
415 |
+
layer = getattr(self, f"fc{idx}")
|
416 |
+
x = layer(x)
|
417 |
+
|
418 |
+
# Update moving average of W.
|
419 |
+
if self.w_avg_beta is not None and self.training and not skip_w_avg_update:
|
420 |
+
self.w_avg.copy_(x.detach().mean(dim=0).lerp(self.w_avg, self.w_avg_beta))
|
421 |
+
|
422 |
+
# Broadcast.
|
423 |
+
if self.num_ws is not None:
|
424 |
+
x = x.unsqueeze(1).repeat([1, self.num_ws, 1])
|
425 |
+
|
426 |
+
# Apply truncation.
|
427 |
+
if truncation_psi != 1:
|
428 |
+
assert self.w_avg_beta is not None
|
429 |
+
if self.num_ws is None or truncation_cutoff is None:
|
430 |
+
x = self.w_avg.lerp(x, truncation_psi)
|
431 |
+
else:
|
432 |
+
x[:, :truncation_cutoff] = self.w_avg.lerp(
|
433 |
+
x[:, :truncation_cutoff], truncation_psi
|
434 |
+
)
|
435 |
+
|
436 |
+
return x
|
437 |
+
|
438 |
+
|
439 |
+
class DisFromRGB(nn.Module):
|
440 |
+
def __init__(
|
441 |
+
self, in_channels, out_channels, activation
|
442 |
+
): # res = 2, ..., resolution_log2
|
443 |
+
super().__init__()
|
444 |
+
self.conv = Conv2dLayer(
|
445 |
+
in_channels=in_channels,
|
446 |
+
out_channels=out_channels,
|
447 |
+
kernel_size=1,
|
448 |
+
activation=activation,
|
449 |
+
)
|
450 |
+
|
451 |
+
def forward(self, x):
|
452 |
+
return self.conv(x)
|
453 |
+
|
454 |
+
|
455 |
+
class DisBlock(nn.Module):
|
456 |
+
def __init__(
|
457 |
+
self, in_channels, out_channels, activation
|
458 |
+
): # res = 2, ..., resolution_log2
|
459 |
+
super().__init__()
|
460 |
+
self.conv0 = Conv2dLayer(
|
461 |
+
in_channels=in_channels,
|
462 |
+
out_channels=in_channels,
|
463 |
+
kernel_size=3,
|
464 |
+
activation=activation,
|
465 |
+
)
|
466 |
+
self.conv1 = Conv2dLayer(
|
467 |
+
in_channels=in_channels,
|
468 |
+
out_channels=out_channels,
|
469 |
+
kernel_size=3,
|
470 |
+
down=2,
|
471 |
+
activation=activation,
|
472 |
+
)
|
473 |
+
self.skip = Conv2dLayer(
|
474 |
+
in_channels=in_channels,
|
475 |
+
out_channels=out_channels,
|
476 |
+
kernel_size=1,
|
477 |
+
down=2,
|
478 |
+
bias=False,
|
479 |
+
)
|
480 |
+
|
481 |
+
def forward(self, x):
|
482 |
+
skip = self.skip(x, gain=np.sqrt(0.5))
|
483 |
+
x = self.conv0(x)
|
484 |
+
x = self.conv1(x, gain=np.sqrt(0.5))
|
485 |
+
out = skip + x
|
486 |
+
|
487 |
+
return out
|
488 |
+
|
489 |
+
|
490 |
+
class Discriminator(torch.nn.Module):
|
491 |
+
def __init__(
|
492 |
+
self,
|
493 |
+
c_dim, # Conditioning label (C) dimensionality.
|
494 |
+
img_resolution, # Input resolution.
|
495 |
+
img_channels, # Number of input color channels.
|
496 |
+
channel_base=32768, # Overall multiplier for the number of channels.
|
497 |
+
channel_max=512, # Maximum number of channels in any layer.
|
498 |
+
channel_decay=1,
|
499 |
+
cmap_dim=None, # Dimensionality of mapped conditioning label, None = default.
|
500 |
+
activation="lrelu",
|
501 |
+
mbstd_group_size=4, # Group size for the minibatch standard deviation layer, None = entire minibatch.
|
502 |
+
mbstd_num_channels=1, # Number of features for the minibatch standard deviation layer, 0 = disable.
|
503 |
+
):
|
504 |
+
super().__init__()
|
505 |
+
self.c_dim = c_dim
|
506 |
+
self.img_resolution = img_resolution
|
507 |
+
self.img_channels = img_channels
|
508 |
+
|
509 |
+
resolution_log2 = int(np.log2(img_resolution))
|
510 |
+
assert img_resolution == 2 ** resolution_log2 and img_resolution >= 4
|
511 |
+
self.resolution_log2 = resolution_log2
|
512 |
+
|
513 |
+
def nf(stage):
|
514 |
+
return np.clip(
|
515 |
+
int(channel_base / 2 ** (stage * channel_decay)), 1, channel_max
|
516 |
+
)
|
517 |
+
|
518 |
+
if cmap_dim == None:
|
519 |
+
cmap_dim = nf(2)
|
520 |
+
if c_dim == 0:
|
521 |
+
cmap_dim = 0
|
522 |
+
self.cmap_dim = cmap_dim
|
523 |
+
|
524 |
+
if c_dim > 0:
|
525 |
+
self.mapping = MappingNet(
|
526 |
+
z_dim=0, c_dim=c_dim, w_dim=cmap_dim, num_ws=None, w_avg_beta=None
|
527 |
+
)
|
528 |
+
|
529 |
+
Dis = [DisFromRGB(img_channels + 1, nf(resolution_log2), activation)]
|
530 |
+
for res in range(resolution_log2, 2, -1):
|
531 |
+
Dis.append(DisBlock(nf(res), nf(res - 1), activation))
|
532 |
+
|
533 |
+
if mbstd_num_channels > 0:
|
534 |
+
Dis.append(
|
535 |
+
MinibatchStdLayer(
|
536 |
+
group_size=mbstd_group_size, num_channels=mbstd_num_channels
|
537 |
+
)
|
538 |
+
)
|
539 |
+
Dis.append(
|
540 |
+
Conv2dLayer(
|
541 |
+
nf(2) + mbstd_num_channels, nf(2), kernel_size=3, activation=activation
|
542 |
+
)
|
543 |
+
)
|
544 |
+
self.Dis = nn.Sequential(*Dis)
|
545 |
+
|
546 |
+
self.fc0 = FullyConnectedLayer(nf(2) * 4 ** 2, nf(2), activation=activation)
|
547 |
+
self.fc1 = FullyConnectedLayer(nf(2), 1 if cmap_dim == 0 else cmap_dim)
|
548 |
+
|
549 |
+
def forward(self, images_in, masks_in, c):
|
550 |
+
x = torch.cat([masks_in - 0.5, images_in], dim=1)
|
551 |
+
x = self.Dis(x)
|
552 |
+
x = self.fc1(self.fc0(x.flatten(start_dim=1)))
|
553 |
+
|
554 |
+
if self.c_dim > 0:
|
555 |
+
cmap = self.mapping(None, c)
|
556 |
+
|
557 |
+
if self.cmap_dim > 0:
|
558 |
+
x = (x * cmap).sum(dim=1, keepdim=True) * (1 / np.sqrt(self.cmap_dim))
|
559 |
+
|
560 |
+
return x
|
561 |
+
|
562 |
+
|
563 |
+
def nf(stage, channel_base=32768, channel_decay=1.0, channel_max=512):
|
564 |
+
NF = {512: 64, 256: 128, 128: 256, 64: 512, 32: 512, 16: 512, 8: 512, 4: 512}
|
565 |
+
return NF[2 ** stage]
|
566 |
+
|
567 |
+
|
568 |
+
class Mlp(nn.Module):
|
569 |
+
def __init__(
|
570 |
+
self,
|
571 |
+
in_features,
|
572 |
+
hidden_features=None,
|
573 |
+
out_features=None,
|
574 |
+
act_layer=nn.GELU,
|
575 |
+
drop=0.0,
|
576 |
+
):
|
577 |
+
super().__init__()
|
578 |
+
out_features = out_features or in_features
|
579 |
+
hidden_features = hidden_features or in_features
|
580 |
+
self.fc1 = FullyConnectedLayer(
|
581 |
+
in_features=in_features, out_features=hidden_features, activation="lrelu"
|
582 |
+
)
|
583 |
+
self.fc2 = FullyConnectedLayer(
|
584 |
+
in_features=hidden_features, out_features=out_features
|
585 |
+
)
|
586 |
+
|
587 |
+
def forward(self, x):
|
588 |
+
x = self.fc1(x)
|
589 |
+
x = self.fc2(x)
|
590 |
+
return x
|
591 |
+
|
592 |
+
|
593 |
+
def window_partition(x, window_size):
|
594 |
+
"""
|
595 |
+
Args:
|
596 |
+
x: (B, H, W, C)
|
597 |
+
window_size (int): window size
|
598 |
+
Returns:
|
599 |
+
windows: (num_windows*B, window_size, window_size, C)
|
600 |
+
"""
|
601 |
+
B, H, W, C = x.shape
|
602 |
+
x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
|
603 |
+
windows = (
|
604 |
+
x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
|
605 |
+
)
|
606 |
+
return windows
|
607 |
+
|
608 |
+
|
609 |
+
def window_reverse(windows, window_size: int, H: int, W: int):
|
610 |
+
"""
|
611 |
+
Args:
|
612 |
+
windows: (num_windows*B, window_size, window_size, C)
|
613 |
+
window_size (int): Window size
|
614 |
+
H (int): Height of image
|
615 |
+
W (int): Width of image
|
616 |
+
Returns:
|
617 |
+
x: (B, H, W, C)
|
618 |
+
"""
|
619 |
+
B = int(windows.shape[0] / (H * W / window_size / window_size))
|
620 |
+
# B = windows.shape[0] / (H * W / window_size / window_size)
|
621 |
+
x = windows.view(
|
622 |
+
B, H // window_size, W // window_size, window_size, window_size, -1
|
623 |
+
)
|
624 |
+
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
|
625 |
+
return x
|
626 |
+
|
627 |
+
|
628 |
+
class Conv2dLayerPartial(nn.Module):
|
629 |
+
def __init__(
|
630 |
+
self,
|
631 |
+
in_channels, # Number of input channels.
|
632 |
+
out_channels, # Number of output channels.
|
633 |
+
kernel_size, # Width and height of the convolution kernel.
|
634 |
+
bias=True, # Apply additive bias before the activation function?
|
635 |
+
activation="linear", # Activation function: 'relu', 'lrelu', etc.
|
636 |
+
up=1, # Integer upsampling factor.
|
637 |
+
down=1, # Integer downsampling factor.
|
638 |
+
resample_filter=[
|
639 |
+
1,
|
640 |
+
3,
|
641 |
+
3,
|
642 |
+
1,
|
643 |
+
], # Low-pass filter to apply when resampling activations.
|
644 |
+
conv_clamp=None, # Clamp the output to +-X, None = disable clamping.
|
645 |
+
trainable=True, # Update the weights of this layer during training?
|
646 |
+
):
|
647 |
+
super().__init__()
|
648 |
+
self.conv = Conv2dLayer(
|
649 |
+
in_channels,
|
650 |
+
out_channels,
|
651 |
+
kernel_size,
|
652 |
+
bias,
|
653 |
+
activation,
|
654 |
+
up,
|
655 |
+
down,
|
656 |
+
resample_filter,
|
657 |
+
conv_clamp,
|
658 |
+
trainable,
|
659 |
+
)
|
660 |
+
|
661 |
+
self.weight_maskUpdater = torch.ones(1, 1, kernel_size, kernel_size)
|
662 |
+
self.slide_winsize = kernel_size ** 2
|
663 |
+
self.stride = down
|
664 |
+
self.padding = kernel_size // 2 if kernel_size % 2 == 1 else 0
|
665 |
+
|
666 |
+
def forward(self, x, mask=None):
|
667 |
+
if mask is not None:
|
668 |
+
with torch.no_grad():
|
669 |
+
if self.weight_maskUpdater.type() != x.type():
|
670 |
+
self.weight_maskUpdater = self.weight_maskUpdater.to(x)
|
671 |
+
update_mask = F.conv2d(
|
672 |
+
mask,
|
673 |
+
self.weight_maskUpdater,
|
674 |
+
bias=None,
|
675 |
+
stride=self.stride,
|
676 |
+
padding=self.padding,
|
677 |
+
)
|
678 |
+
mask_ratio = self.slide_winsize / (update_mask.to(torch.float32) + 1e-8)
|
679 |
+
update_mask = torch.clamp(update_mask, 0, 1) # 0 or 1
|
680 |
+
mask_ratio = torch.mul(mask_ratio, update_mask).to(x.dtype)
|
681 |
+
x = self.conv(x)
|
682 |
+
x = torch.mul(x, mask_ratio)
|
683 |
+
return x, update_mask
|
684 |
+
else:
|
685 |
+
x = self.conv(x)
|
686 |
+
return x, None
|
687 |
+
|
688 |
+
|
689 |
+
class WindowAttention(nn.Module):
|
690 |
+
r"""Window based multi-head self attention (W-MSA) module with relative position bias.
|
691 |
+
It supports both of shifted and non-shifted window.
|
692 |
+
Args:
|
693 |
+
dim (int): Number of input channels.
|
694 |
+
window_size (tuple[int]): The height and width of the window.
|
695 |
+
num_heads (int): Number of attention heads.
|
696 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
697 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
|
698 |
+
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
|
699 |
+
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
|
700 |
+
"""
|
701 |
+
|
702 |
+
def __init__(
|
703 |
+
self,
|
704 |
+
dim,
|
705 |
+
window_size,
|
706 |
+
num_heads,
|
707 |
+
down_ratio=1,
|
708 |
+
qkv_bias=True,
|
709 |
+
qk_scale=None,
|
710 |
+
attn_drop=0.0,
|
711 |
+
proj_drop=0.0,
|
712 |
+
):
|
713 |
+
super().__init__()
|
714 |
+
self.dim = dim
|
715 |
+
self.window_size = window_size # Wh, Ww
|
716 |
+
self.num_heads = num_heads
|
717 |
+
head_dim = dim // num_heads
|
718 |
+
self.scale = qk_scale or head_dim ** -0.5
|
719 |
+
|
720 |
+
self.q = FullyConnectedLayer(in_features=dim, out_features=dim)
|
721 |
+
self.k = FullyConnectedLayer(in_features=dim, out_features=dim)
|
722 |
+
self.v = FullyConnectedLayer(in_features=dim, out_features=dim)
|
723 |
+
self.proj = FullyConnectedLayer(in_features=dim, out_features=dim)
|
724 |
+
|
725 |
+
self.softmax = nn.Softmax(dim=-1)
|
726 |
+
|
727 |
+
def forward(self, x, mask_windows=None, mask=None):
|
728 |
+
"""
|
729 |
+
Args:
|
730 |
+
x: input features with shape of (num_windows*B, N, C)
|
731 |
+
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
|
732 |
+
"""
|
733 |
+
B_, N, C = x.shape
|
734 |
+
norm_x = F.normalize(x, p=2.0, dim=-1, eps=torch.finfo(x.dtype).eps)
|
735 |
+
q = (
|
736 |
+
self.q(norm_x)
|
737 |
+
.reshape(B_, N, self.num_heads, C // self.num_heads)
|
738 |
+
.permute(0, 2, 1, 3)
|
739 |
+
)
|
740 |
+
k = (
|
741 |
+
self.k(norm_x)
|
742 |
+
.view(B_, -1, self.num_heads, C // self.num_heads)
|
743 |
+
.permute(0, 2, 3, 1)
|
744 |
+
)
|
745 |
+
v = (
|
746 |
+
self.v(x)
|
747 |
+
.view(B_, -1, self.num_heads, C // self.num_heads)
|
748 |
+
.permute(0, 2, 1, 3)
|
749 |
+
)
|
750 |
+
|
751 |
+
attn = (q @ k) * self.scale
|
752 |
+
|
753 |
+
if mask is not None:
|
754 |
+
nW = mask.shape[0]
|
755 |
+
attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(
|
756 |
+
1
|
757 |
+
).unsqueeze(0)
|
758 |
+
attn = attn.view(-1, self.num_heads, N, N)
|
759 |
+
|
760 |
+
if mask_windows is not None:
|
761 |
+
attn_mask_windows = mask_windows.squeeze(-1).unsqueeze(1).unsqueeze(1)
|
762 |
+
attn = attn + attn_mask_windows.masked_fill(
|
763 |
+
attn_mask_windows == 0, float(-100.0)
|
764 |
+
).masked_fill(attn_mask_windows == 1, float(0.0))
|
765 |
+
with torch.no_grad():
|
766 |
+
mask_windows = torch.clamp(
|
767 |
+
torch.sum(mask_windows, dim=1, keepdim=True), 0, 1
|
768 |
+
).repeat(1, N, 1)
|
769 |
+
|
770 |
+
attn = self.softmax(attn)
|
771 |
+
x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
|
772 |
+
x = self.proj(x)
|
773 |
+
return x, mask_windows
|
774 |
+
|
775 |
+
|
776 |
+
class SwinTransformerBlock(nn.Module):
|
777 |
+
r"""Swin Transformer Block.
|
778 |
+
Args:
|
779 |
+
dim (int): Number of input channels.
|
780 |
+
input_resolution (tuple[int]): Input resulotion.
|
781 |
+
num_heads (int): Number of attention heads.
|
782 |
+
window_size (int): Window size.
|
783 |
+
shift_size (int): Shift size for SW-MSA.
|
784 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
785 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
786 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
|
787 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
788 |
+
attn_drop (float, optional): Attention dropout rate. Default: 0.0
|
789 |
+
drop_path (float, optional): Stochastic depth rate. Default: 0.0
|
790 |
+
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
|
791 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
792 |
+
"""
|
793 |
+
|
794 |
+
def __init__(
|
795 |
+
self,
|
796 |
+
dim,
|
797 |
+
input_resolution,
|
798 |
+
num_heads,
|
799 |
+
down_ratio=1,
|
800 |
+
window_size=7,
|
801 |
+
shift_size=0,
|
802 |
+
mlp_ratio=4.0,
|
803 |
+
qkv_bias=True,
|
804 |
+
qk_scale=None,
|
805 |
+
drop=0.0,
|
806 |
+
attn_drop=0.0,
|
807 |
+
drop_path=0.0,
|
808 |
+
act_layer=nn.GELU,
|
809 |
+
norm_layer=nn.LayerNorm,
|
810 |
+
):
|
811 |
+
super().__init__()
|
812 |
+
self.dim = dim
|
813 |
+
self.input_resolution = input_resolution
|
814 |
+
self.num_heads = num_heads
|
815 |
+
self.window_size = window_size
|
816 |
+
self.shift_size = shift_size
|
817 |
+
self.mlp_ratio = mlp_ratio
|
818 |
+
if min(self.input_resolution) <= self.window_size:
|
819 |
+
# if window size is larger than input resolution, we don't partition windows
|
820 |
+
self.shift_size = 0
|
821 |
+
self.window_size = min(self.input_resolution)
|
822 |
+
assert (
|
823 |
+
0 <= self.shift_size < self.window_size
|
824 |
+
), "shift_size must in 0-window_size"
|
825 |
+
|
826 |
+
if self.shift_size > 0:
|
827 |
+
down_ratio = 1
|
828 |
+
self.attn = WindowAttention(
|
829 |
+
dim,
|
830 |
+
window_size=to_2tuple(self.window_size),
|
831 |
+
num_heads=num_heads,
|
832 |
+
down_ratio=down_ratio,
|
833 |
+
qkv_bias=qkv_bias,
|
834 |
+
qk_scale=qk_scale,
|
835 |
+
attn_drop=attn_drop,
|
836 |
+
proj_drop=drop,
|
837 |
+
)
|
838 |
+
|
839 |
+
self.fuse = FullyConnectedLayer(
|
840 |
+
in_features=dim * 2, out_features=dim, activation="lrelu"
|
841 |
+
)
|
842 |
+
|
843 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
844 |
+
self.mlp = Mlp(
|
845 |
+
in_features=dim,
|
846 |
+
hidden_features=mlp_hidden_dim,
|
847 |
+
act_layer=act_layer,
|
848 |
+
drop=drop,
|
849 |
+
)
|
850 |
+
|
851 |
+
if self.shift_size > 0:
|
852 |
+
attn_mask = self.calculate_mask(self.input_resolution)
|
853 |
+
else:
|
854 |
+
attn_mask = None
|
855 |
+
|
856 |
+
self.register_buffer("attn_mask", attn_mask)
|
857 |
+
|
858 |
+
def calculate_mask(self, x_size):
|
859 |
+
# calculate attention mask for SW-MSA
|
860 |
+
H, W = x_size
|
861 |
+
img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
|
862 |
+
h_slices = (
|
863 |
+
slice(0, -self.window_size),
|
864 |
+
slice(-self.window_size, -self.shift_size),
|
865 |
+
slice(-self.shift_size, None),
|
866 |
+
)
|
867 |
+
w_slices = (
|
868 |
+
slice(0, -self.window_size),
|
869 |
+
slice(-self.window_size, -self.shift_size),
|
870 |
+
slice(-self.shift_size, None),
|
871 |
+
)
|
872 |
+
cnt = 0
|
873 |
+
for h in h_slices:
|
874 |
+
for w in w_slices:
|
875 |
+
img_mask[:, h, w, :] = cnt
|
876 |
+
cnt += 1
|
877 |
+
|
878 |
+
mask_windows = window_partition(
|
879 |
+
img_mask, self.window_size
|
880 |
+
) # nW, window_size, window_size, 1
|
881 |
+
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
|
882 |
+
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
|
883 |
+
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
|
884 |
+
attn_mask == 0, float(0.0)
|
885 |
+
)
|
886 |
+
|
887 |
+
return attn_mask
|
888 |
+
|
889 |
+
def forward(self, x, x_size, mask=None):
|
890 |
+
# H, W = self.input_resolution
|
891 |
+
H, W = x_size
|
892 |
+
B, L, C = x.shape
|
893 |
+
# assert L == H * W, "input feature has wrong size"
|
894 |
+
|
895 |
+
shortcut = x
|
896 |
+
x = x.view(B, H, W, C)
|
897 |
+
if mask is not None:
|
898 |
+
mask = mask.view(B, H, W, 1)
|
899 |
+
|
900 |
+
# cyclic shift
|
901 |
+
if self.shift_size > 0:
|
902 |
+
shifted_x = torch.roll(
|
903 |
+
x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)
|
904 |
+
)
|
905 |
+
if mask is not None:
|
906 |
+
shifted_mask = torch.roll(
|
907 |
+
mask, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)
|
908 |
+
)
|
909 |
+
else:
|
910 |
+
shifted_x = x
|
911 |
+
if mask is not None:
|
912 |
+
shifted_mask = mask
|
913 |
+
|
914 |
+
# partition windows
|
915 |
+
x_windows = window_partition(
|
916 |
+
shifted_x, self.window_size
|
917 |
+
) # nW*B, window_size, window_size, C
|
918 |
+
x_windows = x_windows.view(
|
919 |
+
-1, self.window_size * self.window_size, C
|
920 |
+
) # nW*B, window_size*window_size, C
|
921 |
+
if mask is not None:
|
922 |
+
mask_windows = window_partition(shifted_mask, self.window_size)
|
923 |
+
mask_windows = mask_windows.view(-1, self.window_size * self.window_size, 1)
|
924 |
+
else:
|
925 |
+
mask_windows = None
|
926 |
+
|
927 |
+
# W-MSA/SW-MSA (to be compatible for testing on images whose shapes are the multiple of window size
|
928 |
+
if self.input_resolution == x_size:
|
929 |
+
attn_windows, mask_windows = self.attn(
|
930 |
+
x_windows, mask_windows, mask=self.attn_mask
|
931 |
+
) # nW*B, window_size*window_size, C
|
932 |
+
else:
|
933 |
+
attn_windows, mask_windows = self.attn(
|
934 |
+
x_windows,
|
935 |
+
mask_windows,
|
936 |
+
mask=self.calculate_mask(x_size).to(x.dtype).to(x.device),
|
937 |
+
) # nW*B, window_size*window_size, C
|
938 |
+
|
939 |
+
# merge windows
|
940 |
+
attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
|
941 |
+
shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C
|
942 |
+
if mask is not None:
|
943 |
+
mask_windows = mask_windows.view(-1, self.window_size, self.window_size, 1)
|
944 |
+
shifted_mask = window_reverse(mask_windows, self.window_size, H, W)
|
945 |
+
|
946 |
+
# reverse cyclic shift
|
947 |
+
if self.shift_size > 0:
|
948 |
+
x = torch.roll(
|
949 |
+
shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)
|
950 |
+
)
|
951 |
+
if mask is not None:
|
952 |
+
mask = torch.roll(
|
953 |
+
shifted_mask, shifts=(self.shift_size, self.shift_size), dims=(1, 2)
|
954 |
+
)
|
955 |
+
else:
|
956 |
+
x = shifted_x
|
957 |
+
if mask is not None:
|
958 |
+
mask = shifted_mask
|
959 |
+
x = x.view(B, H * W, C)
|
960 |
+
if mask is not None:
|
961 |
+
mask = mask.view(B, H * W, 1)
|
962 |
+
|
963 |
+
# FFN
|
964 |
+
x = self.fuse(torch.cat([shortcut, x], dim=-1))
|
965 |
+
x = self.mlp(x)
|
966 |
+
|
967 |
+
return x, mask
|
968 |
+
|
969 |
+
|
970 |
+
class PatchMerging(nn.Module):
|
971 |
+
def __init__(self, in_channels, out_channels, down=2):
|
972 |
+
super().__init__()
|
973 |
+
self.conv = Conv2dLayerPartial(
|
974 |
+
in_channels=in_channels,
|
975 |
+
out_channels=out_channels,
|
976 |
+
kernel_size=3,
|
977 |
+
activation="lrelu",
|
978 |
+
down=down,
|
979 |
+
)
|
980 |
+
self.down = down
|
981 |
+
|
982 |
+
def forward(self, x, x_size, mask=None):
|
983 |
+
x = token2feature(x, x_size)
|
984 |
+
if mask is not None:
|
985 |
+
mask = token2feature(mask, x_size)
|
986 |
+
x, mask = self.conv(x, mask)
|
987 |
+
if self.down != 1:
|
988 |
+
ratio = 1 / self.down
|
989 |
+
x_size = (int(x_size[0] * ratio), int(x_size[1] * ratio))
|
990 |
+
x = feature2token(x)
|
991 |
+
if mask is not None:
|
992 |
+
mask = feature2token(mask)
|
993 |
+
return x, x_size, mask
|
994 |
+
|
995 |
+
|
996 |
+
class PatchUpsampling(nn.Module):
|
997 |
+
def __init__(self, in_channels, out_channels, up=2):
|
998 |
+
super().__init__()
|
999 |
+
self.conv = Conv2dLayerPartial(
|
1000 |
+
in_channels=in_channels,
|
1001 |
+
out_channels=out_channels,
|
1002 |
+
kernel_size=3,
|
1003 |
+
activation="lrelu",
|
1004 |
+
up=up,
|
1005 |
+
)
|
1006 |
+
self.up = up
|
1007 |
+
|
1008 |
+
def forward(self, x, x_size, mask=None):
|
1009 |
+
x = token2feature(x, x_size)
|
1010 |
+
if mask is not None:
|
1011 |
+
mask = token2feature(mask, x_size)
|
1012 |
+
x, mask = self.conv(x, mask)
|
1013 |
+
if self.up != 1:
|
1014 |
+
x_size = (int(x_size[0] * self.up), int(x_size[1] * self.up))
|
1015 |
+
x = feature2token(x)
|
1016 |
+
if mask is not None:
|
1017 |
+
mask = feature2token(mask)
|
1018 |
+
return x, x_size, mask
|
1019 |
+
|
1020 |
+
|
1021 |
+
class BasicLayer(nn.Module):
|
1022 |
+
"""A basic Swin Transformer layer for one stage.
|
1023 |
+
Args:
|
1024 |
+
dim (int): Number of input channels.
|
1025 |
+
input_resolution (tuple[int]): Input resolution.
|
1026 |
+
depth (int): Number of blocks.
|
1027 |
+
num_heads (int): Number of attention heads.
|
1028 |
+
window_size (int): Local window size.
|
1029 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
1030 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
1031 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
|
1032 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
1033 |
+
attn_drop (float, optional): Attention dropout rate. Default: 0.0
|
1034 |
+
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
|
1035 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
1036 |
+
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
|
1037 |
+
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
|
1038 |
+
"""
|
1039 |
+
|
1040 |
+
def __init__(
|
1041 |
+
self,
|
1042 |
+
dim,
|
1043 |
+
input_resolution,
|
1044 |
+
depth,
|
1045 |
+
num_heads,
|
1046 |
+
window_size,
|
1047 |
+
down_ratio=1,
|
1048 |
+
mlp_ratio=2.0,
|
1049 |
+
qkv_bias=True,
|
1050 |
+
qk_scale=None,
|
1051 |
+
drop=0.0,
|
1052 |
+
attn_drop=0.0,
|
1053 |
+
drop_path=0.0,
|
1054 |
+
norm_layer=nn.LayerNorm,
|
1055 |
+
downsample=None,
|
1056 |
+
use_checkpoint=False,
|
1057 |
+
):
|
1058 |
+
super().__init__()
|
1059 |
+
self.dim = dim
|
1060 |
+
self.input_resolution = input_resolution
|
1061 |
+
self.depth = depth
|
1062 |
+
self.use_checkpoint = use_checkpoint
|
1063 |
+
|
1064 |
+
# patch merging layer
|
1065 |
+
if downsample is not None:
|
1066 |
+
# self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
|
1067 |
+
self.downsample = downsample
|
1068 |
+
else:
|
1069 |
+
self.downsample = None
|
1070 |
+
|
1071 |
+
# build blocks
|
1072 |
+
self.blocks = nn.ModuleList(
|
1073 |
+
[
|
1074 |
+
SwinTransformerBlock(
|
1075 |
+
dim=dim,
|
1076 |
+
input_resolution=input_resolution,
|
1077 |
+
num_heads=num_heads,
|
1078 |
+
down_ratio=down_ratio,
|
1079 |
+
window_size=window_size,
|
1080 |
+
shift_size=0 if (i % 2 == 0) else window_size // 2,
|
1081 |
+
mlp_ratio=mlp_ratio,
|
1082 |
+
qkv_bias=qkv_bias,
|
1083 |
+
qk_scale=qk_scale,
|
1084 |
+
drop=drop,
|
1085 |
+
attn_drop=attn_drop,
|
1086 |
+
drop_path=drop_path[i]
|
1087 |
+
if isinstance(drop_path, list)
|
1088 |
+
else drop_path,
|
1089 |
+
norm_layer=norm_layer,
|
1090 |
+
)
|
1091 |
+
for i in range(depth)
|
1092 |
+
]
|
1093 |
+
)
|
1094 |
+
|
1095 |
+
self.conv = Conv2dLayerPartial(
|
1096 |
+
in_channels=dim, out_channels=dim, kernel_size=3, activation="lrelu"
|
1097 |
+
)
|
1098 |
+
|
1099 |
+
def forward(self, x, x_size, mask=None):
|
1100 |
+
if self.downsample is not None:
|
1101 |
+
x, x_size, mask = self.downsample(x, x_size, mask)
|
1102 |
+
identity = x
|
1103 |
+
for blk in self.blocks:
|
1104 |
+
if self.use_checkpoint:
|
1105 |
+
x, mask = checkpoint.checkpoint(blk, x, x_size, mask)
|
1106 |
+
else:
|
1107 |
+
x, mask = blk(x, x_size, mask)
|
1108 |
+
if mask is not None:
|
1109 |
+
mask = token2feature(mask, x_size)
|
1110 |
+
x, mask = self.conv(token2feature(x, x_size), mask)
|
1111 |
+
x = feature2token(x) + identity
|
1112 |
+
if mask is not None:
|
1113 |
+
mask = feature2token(mask)
|
1114 |
+
return x, x_size, mask
|
1115 |
+
|
1116 |
+
|
1117 |
+
class ToToken(nn.Module):
|
1118 |
+
def __init__(self, in_channels=3, dim=128, kernel_size=5, stride=1):
|
1119 |
+
super().__init__()
|
1120 |
+
|
1121 |
+
self.proj = Conv2dLayerPartial(
|
1122 |
+
in_channels=in_channels,
|
1123 |
+
out_channels=dim,
|
1124 |
+
kernel_size=kernel_size,
|
1125 |
+
activation="lrelu",
|
1126 |
+
)
|
1127 |
+
|
1128 |
+
def forward(self, x, mask):
|
1129 |
+
x, mask = self.proj(x, mask)
|
1130 |
+
|
1131 |
+
return x, mask
|
1132 |
+
|
1133 |
+
|
1134 |
+
class EncFromRGB(nn.Module):
|
1135 |
+
def __init__(
|
1136 |
+
self, in_channels, out_channels, activation
|
1137 |
+
): # res = 2, ..., resolution_log2
|
1138 |
+
super().__init__()
|
1139 |
+
self.conv0 = Conv2dLayer(
|
1140 |
+
in_channels=in_channels,
|
1141 |
+
out_channels=out_channels,
|
1142 |
+
kernel_size=1,
|
1143 |
+
activation=activation,
|
1144 |
+
)
|
1145 |
+
self.conv1 = Conv2dLayer(
|
1146 |
+
in_channels=out_channels,
|
1147 |
+
out_channels=out_channels,
|
1148 |
+
kernel_size=3,
|
1149 |
+
activation=activation,
|
1150 |
+
)
|
1151 |
+
|
1152 |
+
def forward(self, x):
|
1153 |
+
x = self.conv0(x)
|
1154 |
+
x = self.conv1(x)
|
1155 |
+
|
1156 |
+
return x
|
1157 |
+
|
1158 |
+
|
1159 |
+
class ConvBlockDown(nn.Module):
|
1160 |
+
def __init__(
|
1161 |
+
self, in_channels, out_channels, activation
|
1162 |
+
): # res = 2, ..., resolution_log
|
1163 |
+
super().__init__()
|
1164 |
+
|
1165 |
+
self.conv0 = Conv2dLayer(
|
1166 |
+
in_channels=in_channels,
|
1167 |
+
out_channels=out_channels,
|
1168 |
+
kernel_size=3,
|
1169 |
+
activation=activation,
|
1170 |
+
down=2,
|
1171 |
+
)
|
1172 |
+
self.conv1 = Conv2dLayer(
|
1173 |
+
in_channels=out_channels,
|
1174 |
+
out_channels=out_channels,
|
1175 |
+
kernel_size=3,
|
1176 |
+
activation=activation,
|
1177 |
+
)
|
1178 |
+
|
1179 |
+
def forward(self, x):
|
1180 |
+
x = self.conv0(x)
|
1181 |
+
x = self.conv1(x)
|
1182 |
+
|
1183 |
+
return x
|
1184 |
+
|
1185 |
+
|
1186 |
+
def token2feature(x, x_size):
|
1187 |
+
B, N, C = x.shape
|
1188 |
+
h, w = x_size
|
1189 |
+
x = x.permute(0, 2, 1).reshape(B, C, h, w)
|
1190 |
+
return x
|
1191 |
+
|
1192 |
+
|
1193 |
+
def feature2token(x):
|
1194 |
+
B, C, H, W = x.shape
|
1195 |
+
x = x.view(B, C, -1).transpose(1, 2)
|
1196 |
+
return x
|
1197 |
+
|
1198 |
+
|
1199 |
+
class Encoder(nn.Module):
|
1200 |
+
def __init__(
|
1201 |
+
self,
|
1202 |
+
res_log2,
|
1203 |
+
img_channels,
|
1204 |
+
activation,
|
1205 |
+
patch_size=5,
|
1206 |
+
channels=16,
|
1207 |
+
drop_path_rate=0.1,
|
1208 |
+
):
|
1209 |
+
super().__init__()
|
1210 |
+
|
1211 |
+
self.resolution = []
|
1212 |
+
|
1213 |
+
for idx, i in enumerate(range(res_log2, 3, -1)): # from input size to 16x16
|
1214 |
+
res = 2 ** i
|
1215 |
+
self.resolution.append(res)
|
1216 |
+
if i == res_log2:
|
1217 |
+
block = EncFromRGB(img_channels * 2 + 1, nf(i), activation)
|
1218 |
+
else:
|
1219 |
+
block = ConvBlockDown(nf(i + 1), nf(i), activation)
|
1220 |
+
setattr(self, "EncConv_Block_%dx%d" % (res, res), block)
|
1221 |
+
|
1222 |
+
def forward(self, x):
|
1223 |
+
out = {}
|
1224 |
+
for res in self.resolution:
|
1225 |
+
res_log2 = int(np.log2(res))
|
1226 |
+
x = getattr(self, "EncConv_Block_%dx%d" % (res, res))(x)
|
1227 |
+
out[res_log2] = x
|
1228 |
+
|
1229 |
+
return out
|
1230 |
+
|
1231 |
+
|
1232 |
+
class ToStyle(nn.Module):
|
1233 |
+
def __init__(self, in_channels, out_channels, activation, drop_rate):
|
1234 |
+
super().__init__()
|
1235 |
+
self.conv = nn.Sequential(
|
1236 |
+
Conv2dLayer(
|
1237 |
+
in_channels=in_channels,
|
1238 |
+
out_channels=in_channels,
|
1239 |
+
kernel_size=3,
|
1240 |
+
activation=activation,
|
1241 |
+
down=2,
|
1242 |
+
),
|
1243 |
+
Conv2dLayer(
|
1244 |
+
in_channels=in_channels,
|
1245 |
+
out_channels=in_channels,
|
1246 |
+
kernel_size=3,
|
1247 |
+
activation=activation,
|
1248 |
+
down=2,
|
1249 |
+
),
|
1250 |
+
Conv2dLayer(
|
1251 |
+
in_channels=in_channels,
|
1252 |
+
out_channels=in_channels,
|
1253 |
+
kernel_size=3,
|
1254 |
+
activation=activation,
|
1255 |
+
down=2,
|
1256 |
+
),
|
1257 |
+
)
|
1258 |
+
|
1259 |
+
self.pool = nn.AdaptiveAvgPool2d(1)
|
1260 |
+
self.fc = FullyConnectedLayer(
|
1261 |
+
in_features=in_channels, out_features=out_channels, activation=activation
|
1262 |
+
)
|
1263 |
+
# self.dropout = nn.Dropout(drop_rate)
|
1264 |
+
|
1265 |
+
def forward(self, x):
|
1266 |
+
x = self.conv(x)
|
1267 |
+
x = self.pool(x)
|
1268 |
+
x = self.fc(x.flatten(start_dim=1))
|
1269 |
+
# x = self.dropout(x)
|
1270 |
+
|
1271 |
+
return x
|
1272 |
+
|
1273 |
+
|
1274 |
+
class DecBlockFirstV2(nn.Module):
|
1275 |
+
def __init__(
|
1276 |
+
self,
|
1277 |
+
res,
|
1278 |
+
in_channels,
|
1279 |
+
out_channels,
|
1280 |
+
activation,
|
1281 |
+
style_dim,
|
1282 |
+
use_noise,
|
1283 |
+
demodulate,
|
1284 |
+
img_channels,
|
1285 |
+
):
|
1286 |
+
super().__init__()
|
1287 |
+
self.res = res
|
1288 |
+
|
1289 |
+
self.conv0 = Conv2dLayer(
|
1290 |
+
in_channels=in_channels,
|
1291 |
+
out_channels=in_channels,
|
1292 |
+
kernel_size=3,
|
1293 |
+
activation=activation,
|
1294 |
+
)
|
1295 |
+
self.conv1 = StyleConv(
|
1296 |
+
in_channels=in_channels,
|
1297 |
+
out_channels=out_channels,
|
1298 |
+
style_dim=style_dim,
|
1299 |
+
resolution=2 ** res,
|
1300 |
+
kernel_size=3,
|
1301 |
+
use_noise=use_noise,
|
1302 |
+
activation=activation,
|
1303 |
+
demodulate=demodulate,
|
1304 |
+
)
|
1305 |
+
self.toRGB = ToRGB(
|
1306 |
+
in_channels=out_channels,
|
1307 |
+
out_channels=img_channels,
|
1308 |
+
style_dim=style_dim,
|
1309 |
+
kernel_size=1,
|
1310 |
+
demodulate=False,
|
1311 |
+
)
|
1312 |
+
|
1313 |
+
def forward(self, x, ws, gs, E_features, noise_mode="random"):
|
1314 |
+
# x = self.fc(x).view(x.shape[0], -1, 4, 4)
|
1315 |
+
x = self.conv0(x)
|
1316 |
+
x = x + E_features[self.res]
|
1317 |
+
style = get_style_code(ws[:, 0], gs)
|
1318 |
+
x = self.conv1(x, style, noise_mode=noise_mode)
|
1319 |
+
style = get_style_code(ws[:, 1], gs)
|
1320 |
+
img = self.toRGB(x, style, skip=None)
|
1321 |
+
|
1322 |
+
return x, img
|
1323 |
+
|
1324 |
+
|
1325 |
+
class DecBlock(nn.Module):
|
1326 |
+
def __init__(
|
1327 |
+
self,
|
1328 |
+
res,
|
1329 |
+
in_channels,
|
1330 |
+
out_channels,
|
1331 |
+
activation,
|
1332 |
+
style_dim,
|
1333 |
+
use_noise,
|
1334 |
+
demodulate,
|
1335 |
+
img_channels,
|
1336 |
+
): # res = 4, ..., resolution_log2
|
1337 |
+
super().__init__()
|
1338 |
+
self.res = res
|
1339 |
+
|
1340 |
+
self.conv0 = StyleConv(
|
1341 |
+
in_channels=in_channels,
|
1342 |
+
out_channels=out_channels,
|
1343 |
+
style_dim=style_dim,
|
1344 |
+
resolution=2 ** res,
|
1345 |
+
kernel_size=3,
|
1346 |
+
up=2,
|
1347 |
+
use_noise=use_noise,
|
1348 |
+
activation=activation,
|
1349 |
+
demodulate=demodulate,
|
1350 |
+
)
|
1351 |
+
self.conv1 = StyleConv(
|
1352 |
+
in_channels=out_channels,
|
1353 |
+
out_channels=out_channels,
|
1354 |
+
style_dim=style_dim,
|
1355 |
+
resolution=2 ** res,
|
1356 |
+
kernel_size=3,
|
1357 |
+
use_noise=use_noise,
|
1358 |
+
activation=activation,
|
1359 |
+
demodulate=demodulate,
|
1360 |
+
)
|
1361 |
+
self.toRGB = ToRGB(
|
1362 |
+
in_channels=out_channels,
|
1363 |
+
out_channels=img_channels,
|
1364 |
+
style_dim=style_dim,
|
1365 |
+
kernel_size=1,
|
1366 |
+
demodulate=False,
|
1367 |
+
)
|
1368 |
+
|
1369 |
+
def forward(self, x, img, ws, gs, E_features, noise_mode="random"):
|
1370 |
+
style = get_style_code(ws[:, self.res * 2 - 9], gs)
|
1371 |
+
x = self.conv0(x, style, noise_mode=noise_mode)
|
1372 |
+
x = x + E_features[self.res]
|
1373 |
+
style = get_style_code(ws[:, self.res * 2 - 8], gs)
|
1374 |
+
x = self.conv1(x, style, noise_mode=noise_mode)
|
1375 |
+
style = get_style_code(ws[:, self.res * 2 - 7], gs)
|
1376 |
+
img = self.toRGB(x, style, skip=img)
|
1377 |
+
|
1378 |
+
return x, img
|
1379 |
+
|
1380 |
+
|
1381 |
+
class Decoder(nn.Module):
|
1382 |
+
def __init__(
|
1383 |
+
self, res_log2, activation, style_dim, use_noise, demodulate, img_channels
|
1384 |
+
):
|
1385 |
+
super().__init__()
|
1386 |
+
self.Dec_16x16 = DecBlockFirstV2(
|
1387 |
+
4, nf(4), nf(4), activation, style_dim, use_noise, demodulate, img_channels
|
1388 |
+
)
|
1389 |
+
for res in range(5, res_log2 + 1):
|
1390 |
+
setattr(
|
1391 |
+
self,
|
1392 |
+
"Dec_%dx%d" % (2 ** res, 2 ** res),
|
1393 |
+
DecBlock(
|
1394 |
+
res,
|
1395 |
+
nf(res - 1),
|
1396 |
+
nf(res),
|
1397 |
+
activation,
|
1398 |
+
style_dim,
|
1399 |
+
use_noise,
|
1400 |
+
demodulate,
|
1401 |
+
img_channels,
|
1402 |
+
),
|
1403 |
+
)
|
1404 |
+
self.res_log2 = res_log2
|
1405 |
+
|
1406 |
+
def forward(self, x, ws, gs, E_features, noise_mode="random"):
|
1407 |
+
x, img = self.Dec_16x16(x, ws, gs, E_features, noise_mode=noise_mode)
|
1408 |
+
for res in range(5, self.res_log2 + 1):
|
1409 |
+
block = getattr(self, "Dec_%dx%d" % (2 ** res, 2 ** res))
|
1410 |
+
x, img = block(x, img, ws, gs, E_features, noise_mode=noise_mode)
|
1411 |
+
|
1412 |
+
return img
|
1413 |
+
|
1414 |
+
|
1415 |
+
class DecStyleBlock(nn.Module):
|
1416 |
+
def __init__(
|
1417 |
+
self,
|
1418 |
+
res,
|
1419 |
+
in_channels,
|
1420 |
+
out_channels,
|
1421 |
+
activation,
|
1422 |
+
style_dim,
|
1423 |
+
use_noise,
|
1424 |
+
demodulate,
|
1425 |
+
img_channels,
|
1426 |
+
):
|
1427 |
+
super().__init__()
|
1428 |
+
self.res = res
|
1429 |
+
|
1430 |
+
self.conv0 = StyleConv(
|
1431 |
+
in_channels=in_channels,
|
1432 |
+
out_channels=out_channels,
|
1433 |
+
style_dim=style_dim,
|
1434 |
+
resolution=2 ** res,
|
1435 |
+
kernel_size=3,
|
1436 |
+
up=2,
|
1437 |
+
use_noise=use_noise,
|
1438 |
+
activation=activation,
|
1439 |
+
demodulate=demodulate,
|
1440 |
+
)
|
1441 |
+
self.conv1 = StyleConv(
|
1442 |
+
in_channels=out_channels,
|
1443 |
+
out_channels=out_channels,
|
1444 |
+
style_dim=style_dim,
|
1445 |
+
resolution=2 ** res,
|
1446 |
+
kernel_size=3,
|
1447 |
+
use_noise=use_noise,
|
1448 |
+
activation=activation,
|
1449 |
+
demodulate=demodulate,
|
1450 |
+
)
|
1451 |
+
self.toRGB = ToRGB(
|
1452 |
+
in_channels=out_channels,
|
1453 |
+
out_channels=img_channels,
|
1454 |
+
style_dim=style_dim,
|
1455 |
+
kernel_size=1,
|
1456 |
+
demodulate=False,
|
1457 |
+
)
|
1458 |
+
|
1459 |
+
def forward(self, x, img, style, skip, noise_mode="random"):
|
1460 |
+
x = self.conv0(x, style, noise_mode=noise_mode)
|
1461 |
+
x = x + skip
|
1462 |
+
x = self.conv1(x, style, noise_mode=noise_mode)
|
1463 |
+
img = self.toRGB(x, style, skip=img)
|
1464 |
+
|
1465 |
+
return x, img
|
1466 |
+
|
1467 |
+
|
1468 |
+
class FirstStage(nn.Module):
|
1469 |
+
def __init__(
|
1470 |
+
self,
|
1471 |
+
img_channels,
|
1472 |
+
img_resolution=256,
|
1473 |
+
dim=180,
|
1474 |
+
w_dim=512,
|
1475 |
+
use_noise=False,
|
1476 |
+
demodulate=True,
|
1477 |
+
activation="lrelu",
|
1478 |
+
):
|
1479 |
+
super().__init__()
|
1480 |
+
res = 64
|
1481 |
+
|
1482 |
+
self.conv_first = Conv2dLayerPartial(
|
1483 |
+
in_channels=img_channels + 1,
|
1484 |
+
out_channels=dim,
|
1485 |
+
kernel_size=3,
|
1486 |
+
activation=activation,
|
1487 |
+
)
|
1488 |
+
self.enc_conv = nn.ModuleList()
|
1489 |
+
down_time = int(np.log2(img_resolution // res))
|
1490 |
+
# build the number of Swin Transformer stages according to the image size
|
1491 |
+
for i in range(down_time): # from input size to 64
|
1492 |
+
self.enc_conv.append(
|
1493 |
+
Conv2dLayerPartial(
|
1494 |
+
in_channels=dim,
|
1495 |
+
out_channels=dim,
|
1496 |
+
kernel_size=3,
|
1497 |
+
down=2,
|
1498 |
+
activation=activation,
|
1499 |
+
)
|
1500 |
+
)
|
1501 |
+
|
1502 |
+
# from 64 -> 16 -> 64
|
1503 |
+
depths = [2, 3, 4, 3, 2]
|
1504 |
+
ratios = [1, 1 / 2, 1 / 2, 2, 2]
|
1505 |
+
num_heads = 6
|
1506 |
+
window_sizes = [8, 16, 16, 16, 8]
|
1507 |
+
drop_path_rate = 0.1
|
1508 |
+
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
|
1509 |
+
|
1510 |
+
self.tran = nn.ModuleList()
|
1511 |
+
for i, depth in enumerate(depths):
|
1512 |
+
res = int(res * ratios[i])
|
1513 |
+
if ratios[i] < 1:
|
1514 |
+
merge = PatchMerging(dim, dim, down=int(1 / ratios[i]))
|
1515 |
+
elif ratios[i] > 1:
|
1516 |
+
merge = PatchUpsampling(dim, dim, up=ratios[i])
|
1517 |
+
else:
|
1518 |
+
merge = None
|
1519 |
+
self.tran.append(
|
1520 |
+
BasicLayer(
|
1521 |
+
dim=dim,
|
1522 |
+
input_resolution=[res, res],
|
1523 |
+
depth=depth,
|
1524 |
+
num_heads=num_heads,
|
1525 |
+
window_size=window_sizes[i],
|
1526 |
+
drop_path=dpr[sum(depths[:i]) : sum(depths[: i + 1])],
|
1527 |
+
downsample=merge,
|
1528 |
+
)
|
1529 |
+
)
|
1530 |
+
|
1531 |
+
# global style
|
1532 |
+
down_conv = []
|
1533 |
+
for i in range(int(np.log2(16))):
|
1534 |
+
down_conv.append(
|
1535 |
+
Conv2dLayer(
|
1536 |
+
in_channels=dim,
|
1537 |
+
out_channels=dim,
|
1538 |
+
kernel_size=3,
|
1539 |
+
down=2,
|
1540 |
+
activation=activation,
|
1541 |
+
)
|
1542 |
+
)
|
1543 |
+
down_conv.append(nn.AdaptiveAvgPool2d((1, 1)))
|
1544 |
+
self.down_conv = nn.Sequential(*down_conv)
|
1545 |
+
self.to_style = FullyConnectedLayer(
|
1546 |
+
in_features=dim, out_features=dim * 2, activation=activation
|
1547 |
+
)
|
1548 |
+
self.ws_style = FullyConnectedLayer(
|
1549 |
+
in_features=w_dim, out_features=dim, activation=activation
|
1550 |
+
)
|
1551 |
+
self.to_square = FullyConnectedLayer(
|
1552 |
+
in_features=dim, out_features=16 * 16, activation=activation
|
1553 |
+
)
|
1554 |
+
|
1555 |
+
style_dim = dim * 3
|
1556 |
+
self.dec_conv = nn.ModuleList()
|
1557 |
+
for i in range(down_time): # from 64 to input size
|
1558 |
+
res = res * 2
|
1559 |
+
self.dec_conv.append(
|
1560 |
+
DecStyleBlock(
|
1561 |
+
res,
|
1562 |
+
dim,
|
1563 |
+
dim,
|
1564 |
+
activation,
|
1565 |
+
style_dim,
|
1566 |
+
use_noise,
|
1567 |
+
demodulate,
|
1568 |
+
img_channels,
|
1569 |
+
)
|
1570 |
+
)
|
1571 |
+
|
1572 |
+
def forward(self, images_in, masks_in, ws, noise_mode="random"):
|
1573 |
+
x = torch.cat([masks_in - 0.5, images_in * masks_in], dim=1)
|
1574 |
+
|
1575 |
+
skips = []
|
1576 |
+
x, mask = self.conv_first(x, masks_in) # input size
|
1577 |
+
skips.append(x)
|
1578 |
+
for i, block in enumerate(self.enc_conv): # input size to 64
|
1579 |
+
x, mask = block(x, mask)
|
1580 |
+
if i != len(self.enc_conv) - 1:
|
1581 |
+
skips.append(x)
|
1582 |
+
|
1583 |
+
x_size = x.size()[-2:]
|
1584 |
+
x = feature2token(x)
|
1585 |
+
mask = feature2token(mask)
|
1586 |
+
mid = len(self.tran) // 2
|
1587 |
+
for i, block in enumerate(self.tran): # 64 to 16
|
1588 |
+
if i < mid:
|
1589 |
+
x, x_size, mask = block(x, x_size, mask)
|
1590 |
+
skips.append(x)
|
1591 |
+
elif i > mid:
|
1592 |
+
x, x_size, mask = block(x, x_size, None)
|
1593 |
+
x = x + skips[mid - i]
|
1594 |
+
else:
|
1595 |
+
x, x_size, mask = block(x, x_size, None)
|
1596 |
+
|
1597 |
+
mul_map = torch.ones_like(x) * 0.5
|
1598 |
+
mul_map = F.dropout(mul_map, training=True)
|
1599 |
+
ws = self.ws_style(ws[:, -1])
|
1600 |
+
add_n = self.to_square(ws).unsqueeze(1)
|
1601 |
+
add_n = (
|
1602 |
+
F.interpolate(
|
1603 |
+
add_n, size=x.size(1), mode="linear", align_corners=False
|
1604 |
+
)
|
1605 |
+
.squeeze(1)
|
1606 |
+
.unsqueeze(-1)
|
1607 |
+
)
|
1608 |
+
x = x * mul_map + add_n * (1 - mul_map)
|
1609 |
+
gs = self.to_style(
|
1610 |
+
self.down_conv(token2feature(x, x_size)).flatten(start_dim=1)
|
1611 |
+
)
|
1612 |
+
style = torch.cat([gs, ws], dim=1)
|
1613 |
+
|
1614 |
+
x = token2feature(x, x_size).contiguous()
|
1615 |
+
img = None
|
1616 |
+
for i, block in enumerate(self.dec_conv):
|
1617 |
+
x, img = block(
|
1618 |
+
x, img, style, skips[len(self.dec_conv) - i - 1], noise_mode=noise_mode
|
1619 |
+
)
|
1620 |
+
|
1621 |
+
# ensemble
|
1622 |
+
img = img * (1 - masks_in) + images_in * masks_in
|
1623 |
+
|
1624 |
+
return img
|
1625 |
+
|
1626 |
+
|
1627 |
+
class SynthesisNet(nn.Module):
|
1628 |
+
def __init__(
|
1629 |
+
self,
|
1630 |
+
w_dim, # Intermediate latent (W) dimensionality.
|
1631 |
+
img_resolution, # Output image resolution.
|
1632 |
+
img_channels=3, # Number of color channels.
|
1633 |
+
channel_base=32768, # Overall multiplier for the number of channels.
|
1634 |
+
channel_decay=1.0,
|
1635 |
+
channel_max=512, # Maximum number of channels in any layer.
|
1636 |
+
activation="lrelu", # Activation function: 'relu', 'lrelu', etc.
|
1637 |
+
drop_rate=0.5,
|
1638 |
+
use_noise=False,
|
1639 |
+
demodulate=True,
|
1640 |
+
):
|
1641 |
+
super().__init__()
|
1642 |
+
resolution_log2 = int(np.log2(img_resolution))
|
1643 |
+
assert img_resolution == 2 ** resolution_log2 and img_resolution >= 4
|
1644 |
+
|
1645 |
+
self.num_layers = resolution_log2 * 2 - 3 * 2
|
1646 |
+
self.img_resolution = img_resolution
|
1647 |
+
self.resolution_log2 = resolution_log2
|
1648 |
+
|
1649 |
+
# first stage
|
1650 |
+
self.first_stage = FirstStage(
|
1651 |
+
img_channels,
|
1652 |
+
img_resolution=img_resolution,
|
1653 |
+
w_dim=w_dim,
|
1654 |
+
use_noise=False,
|
1655 |
+
demodulate=demodulate,
|
1656 |
+
)
|
1657 |
+
|
1658 |
+
# second stage
|
1659 |
+
self.enc = Encoder(
|
1660 |
+
resolution_log2, img_channels, activation, patch_size=5, channels=16
|
1661 |
+
)
|
1662 |
+
self.to_square = FullyConnectedLayer(
|
1663 |
+
in_features=w_dim, out_features=16 * 16, activation=activation
|
1664 |
+
)
|
1665 |
+
self.to_style = ToStyle(
|
1666 |
+
in_channels=nf(4),
|
1667 |
+
out_channels=nf(2) * 2,
|
1668 |
+
activation=activation,
|
1669 |
+
drop_rate=drop_rate,
|
1670 |
+
)
|
1671 |
+
style_dim = w_dim + nf(2) * 2
|
1672 |
+
self.dec = Decoder(
|
1673 |
+
resolution_log2, activation, style_dim, use_noise, demodulate, img_channels
|
1674 |
+
)
|
1675 |
+
|
1676 |
+
def forward(self, images_in, masks_in, ws, noise_mode="random", return_stg1=False):
|
1677 |
+
out_stg1 = self.first_stage(images_in, masks_in, ws, noise_mode=noise_mode)
|
1678 |
+
|
1679 |
+
# encoder
|
1680 |
+
x = images_in * masks_in + out_stg1 * (1 - masks_in)
|
1681 |
+
x = torch.cat([masks_in - 0.5, x, images_in * masks_in], dim=1)
|
1682 |
+
E_features = self.enc(x)
|
1683 |
+
|
1684 |
+
fea_16 = E_features[4]
|
1685 |
+
mul_map = torch.ones_like(fea_16) * 0.5
|
1686 |
+
mul_map = F.dropout(mul_map, training=True)
|
1687 |
+
add_n = self.to_square(ws[:, 0]).view(-1, 16, 16).unsqueeze(1)
|
1688 |
+
add_n = F.interpolate(
|
1689 |
+
add_n, size=fea_16.size()[-2:], mode="bilinear", align_corners=False
|
1690 |
+
)
|
1691 |
+
fea_16 = fea_16 * mul_map + add_n * (1 - mul_map)
|
1692 |
+
E_features[4] = fea_16
|
1693 |
+
|
1694 |
+
# style
|
1695 |
+
gs = self.to_style(fea_16)
|
1696 |
+
|
1697 |
+
# decoder
|
1698 |
+
img = self.dec(fea_16, ws, gs, E_features, noise_mode=noise_mode)
|
1699 |
+
|
1700 |
+
# ensemble
|
1701 |
+
img = img * (1 - masks_in) + images_in * masks_in
|
1702 |
+
|
1703 |
+
if not return_stg1:
|
1704 |
+
return img
|
1705 |
+
else:
|
1706 |
+
return img, out_stg1
|
1707 |
+
|
1708 |
+
|
1709 |
+
class Generator(nn.Module):
|
1710 |
+
def __init__(
|
1711 |
+
self,
|
1712 |
+
z_dim, # Input latent (Z) dimensionality, 0 = no latent.
|
1713 |
+
c_dim, # Conditioning label (C) dimensionality, 0 = no label.
|
1714 |
+
w_dim, # Intermediate latent (W) dimensionality.
|
1715 |
+
img_resolution, # resolution of generated image
|
1716 |
+
img_channels, # Number of input color channels.
|
1717 |
+
synthesis_kwargs={}, # Arguments for SynthesisNetwork.
|
1718 |
+
mapping_kwargs={}, # Arguments for MappingNetwork.
|
1719 |
+
):
|
1720 |
+
super().__init__()
|
1721 |
+
self.z_dim = z_dim
|
1722 |
+
self.c_dim = c_dim
|
1723 |
+
self.w_dim = w_dim
|
1724 |
+
self.img_resolution = img_resolution
|
1725 |
+
self.img_channels = img_channels
|
1726 |
+
|
1727 |
+
self.synthesis = SynthesisNet(
|
1728 |
+
w_dim=w_dim,
|
1729 |
+
img_resolution=img_resolution,
|
1730 |
+
img_channels=img_channels,
|
1731 |
+
**synthesis_kwargs,
|
1732 |
+
)
|
1733 |
+
self.mapping = MappingNet(
|
1734 |
+
z_dim=z_dim,
|
1735 |
+
c_dim=c_dim,
|
1736 |
+
w_dim=w_dim,
|
1737 |
+
num_ws=self.synthesis.num_layers,
|
1738 |
+
**mapping_kwargs,
|
1739 |
+
)
|
1740 |
+
|
1741 |
+
def forward(
|
1742 |
+
self,
|
1743 |
+
images_in,
|
1744 |
+
masks_in,
|
1745 |
+
z,
|
1746 |
+
c,
|
1747 |
+
truncation_psi=1,
|
1748 |
+
truncation_cutoff=None,
|
1749 |
+
skip_w_avg_update=False,
|
1750 |
+
noise_mode="none",
|
1751 |
+
return_stg1=False,
|
1752 |
+
):
|
1753 |
+
ws = self.mapping(
|
1754 |
+
z,
|
1755 |
+
c,
|
1756 |
+
truncation_psi=truncation_psi,
|
1757 |
+
truncation_cutoff=truncation_cutoff,
|
1758 |
+
skip_w_avg_update=skip_w_avg_update,
|
1759 |
+
)
|
1760 |
+
img = self.synthesis(images_in, masks_in, ws, noise_mode=noise_mode)
|
1761 |
+
return img
|
1762 |
+
|
1763 |
+
|
1764 |
+
class Discriminator(torch.nn.Module):
|
1765 |
+
def __init__(
|
1766 |
+
self,
|
1767 |
+
c_dim, # Conditioning label (C) dimensionality.
|
1768 |
+
img_resolution, # Input resolution.
|
1769 |
+
img_channels, # Number of input color channels.
|
1770 |
+
channel_base=32768, # Overall multiplier for the number of channels.
|
1771 |
+
channel_max=512, # Maximum number of channels in any layer.
|
1772 |
+
channel_decay=1,
|
1773 |
+
cmap_dim=None, # Dimensionality of mapped conditioning label, None = default.
|
1774 |
+
activation="lrelu",
|
1775 |
+
mbstd_group_size=4, # Group size for the minibatch standard deviation layer, None = entire minibatch.
|
1776 |
+
mbstd_num_channels=1, # Number of features for the minibatch standard deviation layer, 0 = disable.
|
1777 |
+
):
|
1778 |
+
super().__init__()
|
1779 |
+
self.c_dim = c_dim
|
1780 |
+
self.img_resolution = img_resolution
|
1781 |
+
self.img_channels = img_channels
|
1782 |
+
|
1783 |
+
resolution_log2 = int(np.log2(img_resolution))
|
1784 |
+
assert img_resolution == 2 ** resolution_log2 and img_resolution >= 4
|
1785 |
+
self.resolution_log2 = resolution_log2
|
1786 |
+
|
1787 |
+
if cmap_dim == None:
|
1788 |
+
cmap_dim = nf(2)
|
1789 |
+
if c_dim == 0:
|
1790 |
+
cmap_dim = 0
|
1791 |
+
self.cmap_dim = cmap_dim
|
1792 |
+
|
1793 |
+
if c_dim > 0:
|
1794 |
+
self.mapping = MappingNet(
|
1795 |
+
z_dim=0, c_dim=c_dim, w_dim=cmap_dim, num_ws=None, w_avg_beta=None
|
1796 |
+
)
|
1797 |
+
|
1798 |
+
Dis = [DisFromRGB(img_channels + 1, nf(resolution_log2), activation)]
|
1799 |
+
for res in range(resolution_log2, 2, -1):
|
1800 |
+
Dis.append(DisBlock(nf(res), nf(res - 1), activation))
|
1801 |
+
|
1802 |
+
if mbstd_num_channels > 0:
|
1803 |
+
Dis.append(
|
1804 |
+
MinibatchStdLayer(
|
1805 |
+
group_size=mbstd_group_size, num_channels=mbstd_num_channels
|
1806 |
+
)
|
1807 |
+
)
|
1808 |
+
Dis.append(
|
1809 |
+
Conv2dLayer(
|
1810 |
+
nf(2) + mbstd_num_channels, nf(2), kernel_size=3, activation=activation
|
1811 |
+
)
|
1812 |
+
)
|
1813 |
+
self.Dis = nn.Sequential(*Dis)
|
1814 |
+
|
1815 |
+
self.fc0 = FullyConnectedLayer(nf(2) * 4 ** 2, nf(2), activation=activation)
|
1816 |
+
self.fc1 = FullyConnectedLayer(nf(2), 1 if cmap_dim == 0 else cmap_dim)
|
1817 |
+
|
1818 |
+
# for 64x64
|
1819 |
+
Dis_stg1 = [DisFromRGB(img_channels + 1, nf(resolution_log2) // 2, activation)]
|
1820 |
+
for res in range(resolution_log2, 2, -1):
|
1821 |
+
Dis_stg1.append(DisBlock(nf(res) // 2, nf(res - 1) // 2, activation))
|
1822 |
+
|
1823 |
+
if mbstd_num_channels > 0:
|
1824 |
+
Dis_stg1.append(
|
1825 |
+
MinibatchStdLayer(
|
1826 |
+
group_size=mbstd_group_size, num_channels=mbstd_num_channels
|
1827 |
+
)
|
1828 |
+
)
|
1829 |
+
Dis_stg1.append(
|
1830 |
+
Conv2dLayer(
|
1831 |
+
nf(2) // 2 + mbstd_num_channels,
|
1832 |
+
nf(2) // 2,
|
1833 |
+
kernel_size=3,
|
1834 |
+
activation=activation,
|
1835 |
+
)
|
1836 |
+
)
|
1837 |
+
self.Dis_stg1 = nn.Sequential(*Dis_stg1)
|
1838 |
+
|
1839 |
+
self.fc0_stg1 = FullyConnectedLayer(
|
1840 |
+
nf(2) // 2 * 4 ** 2, nf(2) // 2, activation=activation
|
1841 |
+
)
|
1842 |
+
self.fc1_stg1 = FullyConnectedLayer(
|
1843 |
+
nf(2) // 2, 1 if cmap_dim == 0 else cmap_dim
|
1844 |
+
)
|
1845 |
+
|
1846 |
+
def forward(self, images_in, masks_in, images_stg1, c):
|
1847 |
+
x = self.Dis(torch.cat([masks_in - 0.5, images_in], dim=1))
|
1848 |
+
x = self.fc1(self.fc0(x.flatten(start_dim=1)))
|
1849 |
+
|
1850 |
+
x_stg1 = self.Dis_stg1(torch.cat([masks_in - 0.5, images_stg1], dim=1))
|
1851 |
+
x_stg1 = self.fc1_stg1(self.fc0_stg1(x_stg1.flatten(start_dim=1)))
|
1852 |
+
|
1853 |
+
if self.c_dim > 0:
|
1854 |
+
cmap = self.mapping(None, c)
|
1855 |
+
|
1856 |
+
if self.cmap_dim > 0:
|
1857 |
+
x = (x * cmap).sum(dim=1, keepdim=True) * (1 / np.sqrt(self.cmap_dim))
|
1858 |
+
x_stg1 = (x_stg1 * cmap).sum(dim=1, keepdim=True) * (
|
1859 |
+
1 / np.sqrt(self.cmap_dim)
|
1860 |
+
)
|
1861 |
+
|
1862 |
+
return x, x_stg1
|
1863 |
+
|
1864 |
+
|
1865 |
+
MAT_MODEL_URL = os.environ.get(
    "MAT_MODEL_URL",
    "https://github.com/Sanster/models/releases/download/add_mat/Places_512_FullData_G.pth",
)

MAT_MODEL_MD5 = os.environ.get("MAT_MODEL_MD5", "8ca927835fa3f5e21d65ffcb165377ed")


class MAT(InpaintModel):
    name = "mat"
    min_size = 512
    pad_mod = 512
    pad_to_square = True

    def init_model(self, device, **kwargs):
        seed = 240  # pick up a random number
        set_seed(seed)

        fp16 = not kwargs.get("no_half", False)
        use_gpu = "cuda" in str(device) and torch.cuda.is_available()
        self.torch_dtype = torch.float16 if use_gpu and fp16 else torch.float32

        G = Generator(
            z_dim=512,
            c_dim=0,
            w_dim=512,
            img_resolution=512,
            img_channels=3,
            mapping_kwargs={"torch_dtype": self.torch_dtype},
        ).to(self.torch_dtype)
        # fmt: off
        self.model = load_model(G, MAT_MODEL_URL, device, MAT_MODEL_MD5)
        self.z = torch.from_numpy(np.random.randn(1, G.z_dim)).to(self.torch_dtype).to(device)
        self.label = torch.zeros([1, self.model.c_dim], device=device).to(self.torch_dtype)
        # fmt: on

    @staticmethod
    def is_downloaded() -> bool:
        return os.path.exists(get_cache_path_by_url(MAT_MODEL_URL))

    def forward(self, image, mask, config: Config):
        """Input images and output images have same size
        images: [H, W, C] RGB
        masks: [H, W] mask area == 255
        return: BGR IMAGE
        """

        image = norm_img(image)  # [0, 1]
        image = image * 2 - 1  # [0, 1] -> [-1, 1]

        mask = (mask > 127) * 255
        mask = 255 - mask
        mask = norm_img(mask)

        image = (
            torch.from_numpy(image).unsqueeze(0).to(self.torch_dtype).to(self.device)
        )
        mask = torch.from_numpy(mask).unsqueeze(0).to(self.torch_dtype).to(self.device)

        output = self.model(
            image, mask, self.z, self.label, truncation_psi=1, noise_mode="none"
        )
        output = (
            (output.permute(0, 2, 3, 1) * 127.5 + 127.5)
            .round()
            .clamp(0, 255)
            .to(torch.uint8)
        )
        output = output[0].cpu().numpy()
        cur_res = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
        return cur_res
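For orientation, `MAT.forward` above expects an RGB image of shape [H, W, C] and a mask of shape [H, W] in which 255 marks the region to repaint; it rescales the image to [-1, 1] and hands the generator an inverted, binarized mask (1 = keep, 0 = fill). The following standalone sketch reproduces that preprocessing with plain NumPy/PyTorch; it is illustrative only, is not part of the uploaded files, and the sizes and the manual [0, 1]/channel-first conversion are assumptions standing in for the repo's norm_img helper.

import numpy as np
import torch

h, w = 512, 512                                  # assumed input size (pad_mod is 512)
image = np.zeros((h, w, 3), dtype=np.uint8)      # placeholder RGB image
mask = np.zeros((h, w), dtype=np.uint8)
mask[100:200, 100:200] = 255                     # 255 == area to repaint

# Image path: scale to [0, 1], move channels first, then map to [-1, 1].
img_t = torch.from_numpy(image.astype(np.float32) / 255.0)
img_t = img_t.permute(2, 0, 1).unsqueeze(0) * 2 - 1          # [1, 3, H, W]

# Mask path: binarize at 127, then invert so 1 means "keep this pixel".
keep = 255 - ((mask > 127).astype(np.uint8) * 255)
mask_t = torch.from_numpy(keep.astype(np.float32) / 255.0).unsqueeze(0).unsqueeze(0)
# img_t and mask_t approximate the tensors MAT.forward builds before calling self.model(...).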
lama_cleaner/model/opencv2.py
ADDED
@@ -0,0 +1,28 @@
import cv2
from lama_cleaner.model.base import InpaintModel
from lama_cleaner.schema import Config

flag_map = {"INPAINT_NS": cv2.INPAINT_NS, "INPAINT_TELEA": cv2.INPAINT_TELEA}


class OpenCV2(InpaintModel):
    name = "cv2"
    pad_mod = 1

    @staticmethod
    def is_downloaded() -> bool:
        return True

    def forward(self, image, mask, config: Config):
        """Input image and output image have same size
        image: [H, W, C] RGB
        mask: [H, W, 1]
        return: BGR IMAGE
        """
        cur_res = cv2.inpaint(
            image[:, :, ::-1],
            mask,
            inpaintRadius=config.cv2_radius,
            flags=flag_map[config.cv2_flag],
        )
        return cur_res
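The `OpenCV2` class above is a thin wrapper around `cv2.inpaint`, which takes an 8-bit BGR image and a single-channel 8-bit mask whose non-zero pixels mark the area to fill. A minimal standalone sketch of the same call (file names are placeholders, not part of this repo):

import cv2
import numpy as np

image = cv2.imread("input.png")                   # BGR uint8 (placeholder path)
mask = np.zeros(image.shape[:2], dtype=np.uint8)
mask[50:120, 80:200] = 255                        # non-zero pixels get inpainted

# Roughly what OpenCV2.forward does with cv2_radius=5 and cv2_flag="INPAINT_TELEA";
# the wrapper additionally flips its RGB input to BGR via image[:, :, ::-1].
result = cv2.inpaint(image, mask, inpaintRadius=5, flags=cv2.INPAINT_TELEA)
cv2.imwrite("output.png", result)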
lama_cleaner/model/paint_by_example.py
ADDED
@@ -0,0 +1,79 @@
import PIL
import PIL.Image
import cv2
import torch
from diffusers import DiffusionPipeline
from loguru import logger

from lama_cleaner.model.base import DiffusionInpaintModel
from lama_cleaner.model.utils import set_seed
from lama_cleaner.schema import Config


class PaintByExample(DiffusionInpaintModel):
    name = "paint_by_example"
    pad_mod = 8
    min_size = 512

    def init_model(self, device: torch.device, **kwargs):
        fp16 = not kwargs.get('no_half', False)
        use_gpu = device == torch.device('cuda') and torch.cuda.is_available()
        torch_dtype = torch.float16 if use_gpu and fp16 else torch.float32
        model_kwargs = {"local_files_only": kwargs.get('local_files_only', False)}

        if kwargs['disable_nsfw'] or kwargs.get('cpu_offload', False):
            logger.info("Disable Paint By Example Model NSFW checker")
            model_kwargs.update(dict(
                safety_checker=None,
                requires_safety_checker=False
            ))

        self.model = DiffusionPipeline.from_pretrained(
            "Fantasy-Studio/Paint-by-Example",
            torch_dtype=torch_dtype,
            **model_kwargs
        )

        self.model.enable_attention_slicing()
        if kwargs.get('enable_xformers', False):
            self.model.enable_xformers_memory_efficient_attention()

        # TODO: gpu_id
        if kwargs.get('cpu_offload', False) and use_gpu:
            self.model.image_encoder = self.model.image_encoder.to(device)
            self.model.enable_sequential_cpu_offload(gpu_id=0)
        else:
            self.model = self.model.to(device)

    def forward(self, image, mask, config: Config):
        """Input image and output image have same size
        image: [H, W, C] RGB
        mask: [H, W, 1] 255 means area to repaint
        return: BGR IMAGE
        """
        output = self.model(
            image=PIL.Image.fromarray(image),
            mask_image=PIL.Image.fromarray(mask[:, :, -1], mode="L"),
            example_image=config.paint_by_example_example_image,
            num_inference_steps=config.paint_by_example_steps,
            output_type='np.array',
            generator=torch.manual_seed(config.paint_by_example_seed)
        ).images[0]

        output = (output * 255).round().astype("uint8")
        output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
        return output

    def forward_post_process(self, result, image, mask, config):
        if config.paint_by_example_match_histograms:
            result = self._match_histograms(result, image[:, :, ::-1], mask)

        if config.paint_by_example_mask_blur != 0:
            k = 2 * config.paint_by_example_mask_blur + 1
            mask = cv2.GaussianBlur(mask, (k, k), 0)
        return result, image, mask

    @staticmethod
    def is_downloaded() -> bool:
        # model will be downloaded when app start, and can't switch in frontend settings
        return True
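`PaintByExample.forward` above ultimately just calls the `Fantasy-Studio/Paint-by-Example` diffusers pipeline with an image, a mask, and a reference image. A hedged standalone sketch of that call with PIL inputs (paths are placeholders; the wrapper additionally handles fp16, the NSFW checker, xformers, and CPU offload):

import PIL.Image
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("Fantasy-Studio/Paint-by-Example")

image = PIL.Image.open("scene.png").convert("RGB")        # image to edit (placeholder)
mask = PIL.Image.open("mask.png").convert("L")            # white = area to repaint
example = PIL.Image.open("reference.png").convert("RGB")  # object to paint in

result = pipe(
    image=image,
    mask_image=mask,
    example_image=example,
    num_inference_steps=50,
    generator=torch.manual_seed(0),
).images[0]
result.save("out.png")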
lama_cleaner/model/pipeline/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .pipeline_stable_diffusion_controlnet_inpaint import (
    StableDiffusionControlNetInpaintPipeline,
)
lama_cleaner/model/pipeline/pipeline_stable_diffusion_controlnet_inpaint.py
ADDED
@@ -0,0 +1,585 @@
1 |
+
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
# Copy from https://github.com/mikonvergence/ControlNetInpaint/blob/main/src/pipeline_stable_diffusion_controlnet_inpaint.py
|
16 |
+
|
17 |
+
import torch
|
18 |
+
import PIL.Image
|
19 |
+
import numpy as np
|
20 |
+
|
21 |
+
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import *
|
22 |
+
|
23 |
+
EXAMPLE_DOC_STRING = """
|
24 |
+
Examples:
|
25 |
+
```py
|
26 |
+
>>> # !pip install opencv-python transformers accelerate
|
27 |
+
>>> from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel, UniPCMultistepScheduler
|
28 |
+
>>> from diffusers.utils import load_image
|
29 |
+
>>> import numpy as np
|
30 |
+
>>> import torch
|
31 |
+
|
32 |
+
>>> import cv2
|
33 |
+
>>> from PIL import Image
|
34 |
+
>>> # download an image
|
35 |
+
>>> image = load_image(
|
36 |
+
... "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
|
37 |
+
... )
|
38 |
+
>>> image = np.array(image)
|
39 |
+
>>> mask_image = load_image(
|
40 |
+
... "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
|
41 |
+
... )
|
42 |
+
>>> mask_image = np.array(mask_image)
|
43 |
+
>>> # get canny image
|
44 |
+
>>> canny_image = cv2.Canny(image, 100, 200)
|
45 |
+
>>> canny_image = canny_image[:, :, None]
|
46 |
+
>>> canny_image = np.concatenate([canny_image, canny_image, canny_image], axis=2)
|
47 |
+
>>> canny_image = Image.fromarray(canny_image)
|
48 |
+
|
49 |
+
>>> # load control net and stable diffusion v1-5
|
50 |
+
>>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
|
51 |
+
>>> pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
|
52 |
+
... "runwayml/stable-diffusion-inpainting", controlnet=controlnet, torch_dtype=torch.float16
|
53 |
+
... )
|
54 |
+
|
55 |
+
>>> # speed up diffusion process with faster scheduler and memory optimization
|
56 |
+
>>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
|
57 |
+
>>> # remove following line if xformers is not installed
|
58 |
+
>>> pipe.enable_xformers_memory_efficient_attention()
|
59 |
+
|
60 |
+
>>> pipe.enable_model_cpu_offload()
|
61 |
+
|
62 |
+
>>> # generate image
|
63 |
+
>>> generator = torch.manual_seed(0)
|
64 |
+
>>> image = pipe(
|
65 |
+
... "futuristic-looking doggo",
|
66 |
+
... num_inference_steps=20,
|
67 |
+
... generator=generator,
|
68 |
+
... image=image,
|
69 |
+
... control_image=canny_image,
|
70 |
+
... mask_image=mask_image
|
71 |
+
... ).images[0]
|
72 |
+
```
|
73 |
+
"""
|
74 |
+
|
75 |
+
|
76 |
+
def prepare_mask_and_masked_image(image, mask):
|
77 |
+
"""
|
78 |
+
Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
|
79 |
+
converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
|
80 |
+
``image`` and ``1`` for the ``mask``.
|
81 |
+
The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
|
82 |
+
binarized (``mask > 0.5``) and cast to ``torch.float32`` too.
|
83 |
+
Args:
|
84 |
+
image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
|
85 |
+
It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
|
86 |
+
``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
|
87 |
+
mask (_type_): The mask to apply to the image, i.e. regions to inpaint.
|
88 |
+
It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
|
89 |
+
``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
|
90 |
+
Raises:
|
91 |
+
ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask
|
92 |
+
should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
|
93 |
+
TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
|
94 |
+
(ot the other way around).
|
95 |
+
Returns:
|
96 |
+
tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
|
97 |
+
dimensions: ``batch x channels x height x width``.
|
98 |
+
"""
|
99 |
+
if isinstance(image, torch.Tensor):
|
100 |
+
if not isinstance(mask, torch.Tensor):
|
101 |
+
raise TypeError(
|
102 |
+
f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not"
|
103 |
+
)
|
104 |
+
|
105 |
+
# Batch single image
|
106 |
+
if image.ndim == 3:
|
107 |
+
assert (
|
108 |
+
image.shape[0] == 3
|
109 |
+
), "Image outside a batch should be of shape (3, H, W)"
|
110 |
+
image = image.unsqueeze(0)
|
111 |
+
|
112 |
+
# Batch and add channel dim for single mask
|
113 |
+
if mask.ndim == 2:
|
114 |
+
mask = mask.unsqueeze(0).unsqueeze(0)
|
115 |
+
|
116 |
+
# Batch single mask or add channel dim
|
117 |
+
if mask.ndim == 3:
|
118 |
+
# Single batched mask, no channel dim or single mask not batched but channel dim
|
119 |
+
if mask.shape[0] == 1:
|
120 |
+
mask = mask.unsqueeze(0)
|
121 |
+
|
122 |
+
# Batched masks no channel dim
|
123 |
+
else:
|
124 |
+
mask = mask.unsqueeze(1)
|
125 |
+
|
126 |
+
assert (
|
127 |
+
image.ndim == 4 and mask.ndim == 4
|
128 |
+
), "Image and Mask must have 4 dimensions"
|
129 |
+
assert (
|
130 |
+
image.shape[-2:] == mask.shape[-2:]
|
131 |
+
), "Image and Mask must have the same spatial dimensions"
|
132 |
+
assert (
|
133 |
+
image.shape[0] == mask.shape[0]
|
134 |
+
), "Image and Mask must have the same batch size"
|
135 |
+
|
136 |
+
# Check image is in [-1, 1]
|
137 |
+
if image.min() < -1 or image.max() > 1:
|
138 |
+
raise ValueError("Image should be in [-1, 1] range")
|
139 |
+
|
140 |
+
# Check mask is in [0, 1]
|
141 |
+
if mask.min() < 0 or mask.max() > 1:
|
142 |
+
raise ValueError("Mask should be in [0, 1] range")
|
143 |
+
|
144 |
+
# Binarize mask
|
145 |
+
mask[mask < 0.5] = 0
|
146 |
+
mask[mask >= 0.5] = 1
|
147 |
+
|
148 |
+
# Image as float32
|
149 |
+
image = image.to(dtype=torch.float32)
|
150 |
+
elif isinstance(mask, torch.Tensor):
|
151 |
+
raise TypeError(
|
152 |
+
f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not"
|
153 |
+
)
|
154 |
+
else:
|
155 |
+
# preprocess image
|
156 |
+
if isinstance(image, (PIL.Image.Image, np.ndarray)):
|
157 |
+
image = [image]
|
158 |
+
|
159 |
+
if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
|
160 |
+
image = [np.array(i.convert("RGB"))[None, :] for i in image]
|
161 |
+
image = np.concatenate(image, axis=0)
|
162 |
+
elif isinstance(image, list) and isinstance(image[0], np.ndarray):
|
163 |
+
image = np.concatenate([i[None, :] for i in image], axis=0)
|
164 |
+
|
165 |
+
image = image.transpose(0, 3, 1, 2)
|
166 |
+
image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
|
167 |
+
|
168 |
+
# preprocess mask
|
169 |
+
if isinstance(mask, (PIL.Image.Image, np.ndarray)):
|
170 |
+
mask = [mask]
|
171 |
+
|
172 |
+
if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
|
173 |
+
mask = np.concatenate(
|
174 |
+
[np.array(m.convert("L"))[None, None, :] for m in mask], axis=0
|
175 |
+
)
|
176 |
+
mask = mask.astype(np.float32) / 255.0
|
177 |
+
elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
|
178 |
+
mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
|
179 |
+
|
180 |
+
mask[mask < 0.5] = 0
|
181 |
+
mask[mask >= 0.5] = 1
|
182 |
+
mask = torch.from_numpy(mask)
|
183 |
+
|
184 |
+
masked_image = image * (mask < 0.5)
|
185 |
+
|
186 |
+
return mask, masked_image
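# Illustrative shape check (a sketch, not part of the upstream diffusers code),
# assuming a 512x512 RGB image and an all-white 512x512 "L"-mode mask:
#
#   >>> from PIL import Image
#   >>> img = Image.new("RGB", (512, 512))
#   >>> msk = Image.new("L", (512, 512), 255)
#   >>> mask, masked_image = prepare_mask_and_masked_image(img, msk)
#   >>> mask.shape, masked_image.shape
#   (torch.Size([1, 1, 512, 512]), torch.Size([1, 3, 512, 512]))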
|
187 |
+
|
188 |
+
|
189 |
+
class StableDiffusionControlNetInpaintPipeline(StableDiffusionControlNetPipeline):
|
190 |
+
r"""
|
191 |
+
Pipeline for text-guided image inpainting using Stable Diffusion with ControlNet guidance.
|
192 |
+
|
193 |
+
This model inherits from [`StableDiffusionControlNetPipeline`]. Check the superclass documentation for the generic methods the
|
194 |
+
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
|
195 |
+
|
196 |
+
Args:
|
197 |
+
vae ([`AutoencoderKL`]):
|
198 |
+
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
|
199 |
+
text_encoder ([`CLIPTextModel`]):
|
200 |
+
Frozen text-encoder. Stable Diffusion uses the text portion of
|
201 |
+
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
|
202 |
+
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
|
203 |
+
tokenizer (`CLIPTokenizer`):
|
204 |
+
Tokenizer of class
|
205 |
+
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
|
206 |
+
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
|
207 |
+
controlnet ([`ControlNetModel`]):
|
208 |
+
Provides additional conditioning to the unet during the denoising process
|
209 |
+
scheduler ([`SchedulerMixin`]):
|
210 |
+
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
|
211 |
+
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
|
212 |
+
safety_checker ([`StableDiffusionSafetyChecker`]):
|
213 |
+
Classification module that estimates whether generated images could be considered offensive or harmful.
|
214 |
+
Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
|
215 |
+
feature_extractor ([`CLIPFeatureExtractor`]):
|
216 |
+
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
|
217 |
+
"""
|
218 |
+
|
219 |
+
def prepare_mask_latents(
|
220 |
+
self,
|
221 |
+
mask,
|
222 |
+
masked_image,
|
223 |
+
batch_size,
|
224 |
+
height,
|
225 |
+
width,
|
226 |
+
dtype,
|
227 |
+
device,
|
228 |
+
generator,
|
229 |
+
do_classifier_free_guidance,
|
230 |
+
):
|
231 |
+
# resize the mask to latents shape as we concatenate the mask to the latents
|
232 |
+
# we do that before converting to dtype to avoid breaking in case we're using cpu_offload
|
233 |
+
# and half precision
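# with the default Stable Diffusion VAE, vae_scale_factor is 8, so e.g. a 512x512
# mask is downsampled to 64x64 to match the latent grid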
|
234 |
+
mask = torch.nn.functional.interpolate(
|
235 |
+
mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
|
236 |
+
)
|
237 |
+
mask = mask.to(device=device, dtype=dtype)
|
238 |
+
|
239 |
+
masked_image = masked_image.to(device=device, dtype=dtype)
|
240 |
+
|
241 |
+
# encode the mask image into latents space so we can concatenate it to the latents
|
242 |
+
if isinstance(generator, list):
|
243 |
+
masked_image_latents = [
|
244 |
+
self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(
|
245 |
+
generator=generator[i]
|
246 |
+
)
|
247 |
+
for i in range(batch_size)
|
248 |
+
]
|
249 |
+
masked_image_latents = torch.cat(masked_image_latents, dim=0)
|
250 |
+
else:
|
251 |
+
masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(
|
252 |
+
generator=generator
|
253 |
+
)
|
254 |
+
masked_image_latents = self.vae.config.scaling_factor * masked_image_latents
|
255 |
+
|
256 |
+
# duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
|
257 |
+
if mask.shape[0] < batch_size:
|
258 |
+
if not batch_size % mask.shape[0] == 0:
|
259 |
+
raise ValueError(
|
260 |
+
"The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
|
261 |
+
f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
|
262 |
+
" of masks that you pass is divisible by the total requested batch size."
|
263 |
+
)
|
264 |
+
mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
|
265 |
+
if masked_image_latents.shape[0] < batch_size:
|
266 |
+
if not batch_size % masked_image_latents.shape[0] == 0:
|
267 |
+
raise ValueError(
|
268 |
+
"The passed images and the required batch size don't match. Images are supposed to be duplicated"
|
269 |
+
f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
|
270 |
+
" Make sure the number of images that you pass is divisible by the total requested batch size."
|
271 |
+
)
|
272 |
+
masked_image_latents = masked_image_latents.repeat(
|
273 |
+
batch_size // masked_image_latents.shape[0], 1, 1, 1
|
274 |
+
)
|
275 |
+
|
276 |
+
mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
|
277 |
+
masked_image_latents = (
|
278 |
+
torch.cat([masked_image_latents] * 2)
|
279 |
+
if do_classifier_free_guidance
|
280 |
+
else masked_image_latents
|
281 |
+
)
|
282 |
+
|
283 |
+
# align device to prevent device errors when concatenating it with the latent model input
|
284 |
+
masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
|
285 |
+
return mask, masked_image_latents
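# Shape sketch (illustrative): for a single 512x512 prompt with classifier-free
# guidance enabled and the default vae_scale_factor of 8, `mask` comes out as
# [2, 1, 64, 64] and `masked_image_latents` as [2, 4, 64, 64].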
|
286 |
+
|
287 |
+
@torch.no_grad()
|
288 |
+
@replace_example_docstring(EXAMPLE_DOC_STRING)
|
289 |
+
def __call__(
|
290 |
+
self,
|
291 |
+
prompt: Union[str, List[str]] = None,
|
292 |
+
image: Union[torch.FloatTensor, PIL.Image.Image] = None,
|
293 |
+
control_image: Union[
|
294 |
+
torch.FloatTensor,
|
295 |
+
PIL.Image.Image,
|
296 |
+
List[torch.FloatTensor],
|
297 |
+
List[PIL.Image.Image],
|
298 |
+
] = None,
|
299 |
+
mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
|
300 |
+
height: Optional[int] = None,
|
301 |
+
width: Optional[int] = None,
|
302 |
+
num_inference_steps: int = 50,
|
303 |
+
guidance_scale: float = 7.5,
|
304 |
+
negative_prompt: Optional[Union[str, List[str]]] = None,
|
305 |
+
num_images_per_prompt: Optional[int] = 1,
|
306 |
+
eta: float = 0.0,
|
307 |
+
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
308 |
+
latents: Optional[torch.FloatTensor] = None,
|
309 |
+
prompt_embeds: Optional[torch.FloatTensor] = None,
|
310 |
+
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
311 |
+
output_type: Optional[str] = "pil",
|
312 |
+
return_dict: bool = True,
|
313 |
+
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
314 |
+
callback_steps: int = 1,
|
315 |
+
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
|
316 |
+
controlnet_conditioning_scale: float = 1.0,
|
317 |
+
):
|
318 |
+
r"""
|
319 |
+
Function invoked when calling the pipeline for generation.
|
320 |
+
Args:
|
321 |
+
prompt (`str` or `List[str]`, *optional*):
|
322 |
+
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
|
323 |
+
instead.
|
324 |
+
image (`PIL.Image.Image`):
|
325 |
+
`Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
|
326 |
+
be masked out with `mask_image` and repainted according to `prompt`.
|
327 |
+
control_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]`):
|
328 |
+
The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
|
329 |
+
the type is specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
|
330 |
+
also be accepted as an image. The control image is automatically resized to fit the output image.
|
331 |
+
mask_image (`PIL.Image.Image`):
|
332 |
+
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
|
333 |
+
repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
|
334 |
+
to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
|
335 |
+
instead of 3, so the expected shape would be `(B, H, W, 1)`.
|
336 |
+
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
|
337 |
+
The height in pixels of the generated image.
|
338 |
+
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
|
339 |
+
The width in pixels of the generated image.
|
340 |
+
num_inference_steps (`int`, *optional*, defaults to 50):
|
341 |
+
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
342 |
+
expense of slower inference.
|
343 |
+
guidance_scale (`float`, *optional*, defaults to 7.5):
|
344 |
+
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
345 |
+
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
346 |
+
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
347 |
+
1`. A higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
|
348 |
+
usually at the expense of lower image quality.
|
349 |
+
negative_prompt (`str` or `List[str]`, *optional*):
|
350 |
+
The prompt or prompts not to guide the image generation. If not defined, one has to pass
|
351 |
+
`negative_prompt_embeds` instead.
|
352 |
+
Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
|
353 |
+
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
354 |
+
The number of images to generate per prompt.
|
355 |
+
eta (`float`, *optional*, defaults to 0.0):
|
356 |
+
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
357 |
+
[`schedulers.DDIMScheduler`], will be ignored for others.
|
358 |
+
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
359 |
+
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
|
360 |
+
to make generation deterministic.
|
361 |
+
latents (`torch.FloatTensor`, *optional*):
|
362 |
+
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
363 |
+
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
364 |
+
tensor will be generated by sampling using the supplied random `generator`.
|
365 |
+
prompt_embeds (`torch.FloatTensor`, *optional*):
|
366 |
+
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
367 |
+
provided, text embeddings will be generated from `prompt` input argument.
|
368 |
+
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
|
369 |
+
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
370 |
+
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
|
371 |
+
argument.
|
372 |
+
output_type (`str`, *optional*, defaults to `"pil"`):
|
373 |
+
The output format of the generated image. Choose between
|
374 |
+
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
375 |
+
return_dict (`bool`, *optional*, defaults to `True`):
|
376 |
+
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
|
377 |
+
plain tuple.
|
378 |
+
callback (`Callable`, *optional*):
|
379 |
+
A function that will be called every `callback_steps` steps during inference. The function will be
|
380 |
+
called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
|
381 |
+
callback_steps (`int`, *optional*, defaults to 1):
|
382 |
+
The frequency at which the `callback` function will be called. If not specified, the callback will be
|
383 |
+
called at every step.
|
384 |
+
cross_attention_kwargs (`dict`, *optional*):
|
385 |
+
A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
|
386 |
+
`self.processor` in
|
387 |
+
[diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
|
388 |
+
controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
|
389 |
+
The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
|
390 |
+
to the residual in the original unet.
|
391 |
+
Examples:
|
392 |
+
Returns:
|
393 |
+
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
394 |
+
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
|
395 |
+
When returning a tuple, the first element is a list with the generated images, and the second element is a
|
396 |
+
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
397 |
+
(nsfw) content, according to the `safety_checker`.
|
398 |
+
"""
|
399 |
+
# 0. Default height and width to unet
|
400 |
+
height, width = self._default_height_width(height, width, control_image)
|
401 |
+
|
402 |
+
# 1. Check inputs. Raise error if not correct
|
403 |
+
self.check_inputs(
|
404 |
+
prompt,
|
405 |
+
control_image,
|
406 |
+
height,
|
407 |
+
width,
|
408 |
+
callback_steps,
|
409 |
+
negative_prompt,
|
410 |
+
prompt_embeds,
|
411 |
+
negative_prompt_embeds,
|
412 |
+
)
|
413 |
+
|
414 |
+
# 2. Define call parameters
|
415 |
+
if prompt is not None and isinstance(prompt, str):
|
416 |
+
batch_size = 1
|
417 |
+
elif prompt is not None and isinstance(prompt, list):
|
418 |
+
batch_size = len(prompt)
|
419 |
+
else:
|
420 |
+
batch_size = prompt_embeds.shape[0]
|
421 |
+
|
422 |
+
device = self._execution_device
|
423 |
+
# here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
|
424 |
+
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
425 |
+
# corresponds to doing no classifier free guidance.
|
426 |
+
do_classifier_free_guidance = guidance_scale > 1.0
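# When this flag is set, prompts, control images, masks and latents are duplicated
# below into an (unconditional, conditional) pair and the predicted noise is combined
# as eps = eps_uncond + guidance_scale * (eps_text - eps_uncond).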
|
427 |
+
|
428 |
+
# 3. Encode input prompt
|
429 |
+
prompt_embeds = self._encode_prompt(
|
430 |
+
prompt,
|
431 |
+
device,
|
432 |
+
num_images_per_prompt,
|
433 |
+
do_classifier_free_guidance,
|
434 |
+
negative_prompt,
|
435 |
+
prompt_embeds=prompt_embeds,
|
436 |
+
negative_prompt_embeds=negative_prompt_embeds,
|
437 |
+
)
|
438 |
+
|
439 |
+
# 4. Prepare image
|
440 |
+
control_image = self.prepare_image(
|
441 |
+
control_image,
|
442 |
+
width,
|
443 |
+
height,
|
444 |
+
batch_size * num_images_per_prompt,
|
445 |
+
num_images_per_prompt,
|
446 |
+
device,
|
447 |
+
self.controlnet.dtype,
|
448 |
+
)
|
449 |
+
|
450 |
+
if do_classifier_free_guidance:
|
451 |
+
control_image = torch.cat([control_image] * 2)
|
452 |
+
|
453 |
+
# 5. Prepare timesteps
|
454 |
+
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
455 |
+
timesteps = self.scheduler.timesteps
|
456 |
+
|
457 |
+
# 6. Prepare latent variables
|
458 |
+
num_channels_latents = self.controlnet.config.in_channels
|
459 |
+
latents = self.prepare_latents(
|
460 |
+
batch_size * num_images_per_prompt,
|
461 |
+
num_channels_latents,
|
462 |
+
height,
|
463 |
+
width,
|
464 |
+
prompt_embeds.dtype,
|
465 |
+
device,
|
466 |
+
generator,
|
467 |
+
latents,
|
468 |
+
)
|
469 |
+
|
470 |
+
# EXTRA: prepare mask latents
|
471 |
+
mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
|
472 |
+
mask, masked_image_latents = self.prepare_mask_latents(
|
473 |
+
mask,
|
474 |
+
masked_image,
|
475 |
+
batch_size * num_images_per_prompt,
|
476 |
+
height,
|
477 |
+
width,
|
478 |
+
prompt_embeds.dtype,
|
479 |
+
device,
|
480 |
+
generator,
|
481 |
+
do_classifier_free_guidance,
|
482 |
+
)
|
483 |
+
|
484 |
+
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
485 |
+
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
486 |
+
|
487 |
+
# 8. Denoising loop
|
488 |
+
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
489 |
+
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
490 |
+
for i, t in enumerate(timesteps):
|
491 |
+
# expand the latents if we are doing classifier free guidance
|
492 |
+
latent_model_input = (
|
493 |
+
torch.cat([latents] * 2) if do_classifier_free_guidance else latents
|
494 |
+
)
|
495 |
+
latent_model_input = self.scheduler.scale_model_input(
|
496 |
+
latent_model_input, t
|
497 |
+
)
|
498 |
+
|
499 |
+
down_block_res_samples, mid_block_res_sample = self.controlnet(
|
500 |
+
latent_model_input,
|
501 |
+
t,
|
502 |
+
encoder_hidden_states=prompt_embeds,
|
503 |
+
controlnet_cond=control_image,
|
504 |
+
return_dict=False,
|
505 |
+
)
|
506 |
+
|
507 |
+
down_block_res_samples = [
|
508 |
+
down_block_res_sample * controlnet_conditioning_scale
|
509 |
+
for down_block_res_sample in down_block_res_samples
|
510 |
+
]
|
511 |
+
mid_block_res_sample *= controlnet_conditioning_scale
|
512 |
+
|
513 |
+
# predict the noise residual
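# The inpainting UNet expects extra conditioning channels alongside the 4 latent
# channels: 1 mask channel plus 4 masked-image latent channels (9 in total for the
# runwayml/stable-diffusion-inpainting checkpoint used in the example docstring),
# so the three tensors are concatenated along the channel dimension.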
|
514 |
+
latent_model_input = torch.cat(
|
515 |
+
[latent_model_input, mask, masked_image_latents], dim=1
|
516 |
+
)
|
517 |
+
noise_pred = self.unet(
|
518 |
+
latent_model_input,
|
519 |
+
t,
|
520 |
+
encoder_hidden_states=prompt_embeds,
|
521 |
+
cross_attention_kwargs=cross_attention_kwargs,
|
522 |
+
down_block_additional_residuals=down_block_res_samples,
|
523 |
+
mid_block_additional_residual=mid_block_res_sample,
|
524 |
+
).sample
|
525 |
+
|
526 |
+
# perform guidance
|
527 |
+
if do_classifier_free_guidance:
|
528 |
+
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
529 |
+
noise_pred = noise_pred_uncond + guidance_scale * (
|
530 |
+
noise_pred_text - noise_pred_uncond
|
531 |
+
)
|
532 |
+
|
533 |
+
# compute the previous noisy sample x_t -> x_t-1
|
534 |
+
latents = self.scheduler.step(
|
535 |
+
noise_pred, t, latents, **extra_step_kwargs
|
536 |
+
).prev_sample
|
537 |
+
|
538 |
+
# call the callback, if provided
|
539 |
+
if i == len(timesteps) - 1 or (
|
540 |
+
(i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
|
541 |
+
):
|
542 |
+
progress_bar.update()
|
543 |
+
if callback is not None and i % callback_steps == 0:
|
544 |
+
callback(i, t, latents)
|
545 |
+
|
546 |
+
# If we do sequential model offloading, let's offload unet and controlnet
|
547 |
+
# manually for max memory savings
|
548 |
+
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
|
549 |
+
self.unet.to("cpu")
|
550 |
+
self.controlnet.to("cpu")
|
551 |
+
torch.cuda.empty_cache()
|
552 |
+
|
553 |
+
if output_type == "latent":
|
554 |
+
image = latents
|
555 |
+
has_nsfw_concept = None
|
556 |
+
elif output_type == "pil":
|
557 |
+
# 8. Post-processing
|
558 |
+
image = self.decode_latents(latents)
|
559 |
+
|
560 |
+
# 9. Run safety checker
|
561 |
+
image, has_nsfw_concept = self.run_safety_checker(
|
562 |
+
image, device, prompt_embeds.dtype
|
563 |
+
)
|
564 |
+
|
565 |
+
# 10. Convert to PIL
|
566 |
+
image = self.numpy_to_pil(image)
|
567 |
+
else:
|
568 |
+
# 8. Post-processing
|
569 |
+
image = self.decode_latents(latents)
|
570 |
+
|
571 |
+
# 9. Run safety checker
|
572 |
+
image, has_nsfw_concept = self.run_safety_checker(
|
573 |
+
image, device, prompt_embeds.dtype
|
574 |
+
)
|
575 |
+
|
576 |
+
# Offload last model to CPU
|
577 |
+
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
|
578 |
+
self.final_offload_hook.offload()
|
579 |
+
|
580 |
+
if not return_dict:
|
581 |
+
return (image, has_nsfw_concept)
|
582 |
+
|
583 |
+
return StableDiffusionPipelineOutput(
|
584 |
+
images=image, nsfw_content_detected=has_nsfw_concept
|
585 |
+
)
|
lama_cleaner/model/plms_sampler.py
ADDED
@@ -0,0 +1,225 @@
1 |
+
# From: https://github.com/CompVis/latent-diffusion/blob/main/ldm/models/diffusion/plms.py
|
2 |
+
import torch
|
3 |
+
import numpy as np
|
4 |
+
from lama_cleaner.model.utils import make_ddim_timesteps, make_ddim_sampling_parameters, noise_like
|
5 |
+
from tqdm import tqdm
|
6 |
+
|
7 |
+
|
8 |
+
class PLMSSampler(object):
|
9 |
+
def __init__(self, model, schedule="linear", **kwargs):
|
10 |
+
super().__init__()
|
11 |
+
self.model = model
|
12 |
+
self.ddpm_num_timesteps = model.num_timesteps
|
13 |
+
self.schedule = schedule
|
14 |
+
|
15 |
+
def register_buffer(self, name, attr):
|
16 |
+
setattr(self, name, attr)
|
17 |
+
|
18 |
+
def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
|
19 |
+
if ddim_eta != 0:
|
20 |
+
raise ValueError('ddim_eta must be 0 for PLMS')
|
21 |
+
self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
|
22 |
+
num_ddpm_timesteps=self.ddpm_num_timesteps, verbose=verbose)
|
23 |
+
alphas_cumprod = self.model.alphas_cumprod
|
24 |
+
assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
|
25 |
+
to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
|
26 |
+
|
27 |
+
self.register_buffer('betas', to_torch(self.model.betas))
|
28 |
+
self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
|
29 |
+
self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
|
30 |
+
|
31 |
+
# calculations for diffusion q(x_t | x_{t-1}) and others
|
32 |
+
self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
|
33 |
+
self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
|
34 |
+
self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
|
35 |
+
self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
|
36 |
+
self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
|
37 |
+
|
38 |
+
# ddim sampling parameters
|
39 |
+
ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
|
40 |
+
ddim_timesteps=self.ddim_timesteps,
|
41 |
+
eta=ddim_eta, verbose=verbose)
|
42 |
+
self.register_buffer('ddim_sigmas', ddim_sigmas)
|
43 |
+
self.register_buffer('ddim_alphas', ddim_alphas)
|
44 |
+
self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
|
45 |
+
self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
|
46 |
+
sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
|
47 |
+
(1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
|
48 |
+
1 - self.alphas_cumprod / self.alphas_cumprod_prev))
|
49 |
+
self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
|
50 |
+
|
51 |
+
@torch.no_grad()
|
52 |
+
def sample(self,
|
53 |
+
steps,
|
54 |
+
batch_size,
|
55 |
+
shape,
|
56 |
+
conditioning=None,
|
57 |
+
callback=None,
|
58 |
+
normals_sequence=None,
|
59 |
+
img_callback=None,
|
60 |
+
quantize_x0=False,
|
61 |
+
eta=0.,
|
62 |
+
mask=None,
|
63 |
+
x0=None,
|
64 |
+
temperature=1.,
|
65 |
+
noise_dropout=0.,
|
66 |
+
score_corrector=None,
|
67 |
+
corrector_kwargs=None,
|
68 |
+
verbose=False,
|
69 |
+
x_T=None,
|
70 |
+
log_every_t=100,
|
71 |
+
unconditional_guidance_scale=1.,
|
72 |
+
unconditional_conditioning=None,
|
73 |
+
# this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
|
74 |
+
**kwargs
|
75 |
+
):
|
76 |
+
if conditioning is not None:
|
77 |
+
if isinstance(conditioning, dict):
|
78 |
+
cbs = conditioning[list(conditioning.keys())[0]].shape[0]
|
79 |
+
if cbs != batch_size:
|
80 |
+
print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
|
81 |
+
else:
|
82 |
+
if conditioning.shape[0] != batch_size:
|
83 |
+
print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
|
84 |
+
|
85 |
+
self.make_schedule(ddim_num_steps=steps, ddim_eta=eta, verbose=verbose)
|
86 |
+
# sampling
|
87 |
+
C, H, W = shape
|
88 |
+
size = (batch_size, C, H, W)
|
89 |
+
print(f'Data shape for PLMS sampling is {size}')
|
90 |
+
|
91 |
+
samples = self.plms_sampling(conditioning, size,
|
92 |
+
callback=callback,
|
93 |
+
img_callback=img_callback,
|
94 |
+
quantize_denoised=quantize_x0,
|
95 |
+
mask=mask, x0=x0,
|
96 |
+
ddim_use_original_steps=False,
|
97 |
+
noise_dropout=noise_dropout,
|
98 |
+
temperature=temperature,
|
99 |
+
score_corrector=score_corrector,
|
100 |
+
corrector_kwargs=corrector_kwargs,
|
101 |
+
x_T=x_T,
|
102 |
+
log_every_t=log_every_t,
|
103 |
+
unconditional_guidance_scale=unconditional_guidance_scale,
|
104 |
+
unconditional_conditioning=unconditional_conditioning,
|
105 |
+
)
|
106 |
+
return samples
|
107 |
+
|
108 |
+
@torch.no_grad()
|
109 |
+
def plms_sampling(self, cond, shape,
|
110 |
+
x_T=None, ddim_use_original_steps=False,
|
111 |
+
callback=None, timesteps=None, quantize_denoised=False,
|
112 |
+
mask=None, x0=None, img_callback=None, log_every_t=100,
|
113 |
+
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
|
114 |
+
unconditional_guidance_scale=1., unconditional_conditioning=None, ):
|
115 |
+
device = self.model.betas.device
|
116 |
+
b = shape[0]
|
117 |
+
if x_T is None:
|
118 |
+
img = torch.randn(shape, device=device)
|
119 |
+
else:
|
120 |
+
img = x_T
|
121 |
+
|
122 |
+
if timesteps is None:
|
123 |
+
timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
|
124 |
+
elif timesteps is not None and not ddim_use_original_steps:
|
125 |
+
subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
|
126 |
+
timesteps = self.ddim_timesteps[:subset_end]
|
127 |
+
|
128 |
+
time_range = list(reversed(range(0, timesteps))) if ddim_use_original_steps else np.flip(timesteps)
|
129 |
+
total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
|
130 |
+
print(f"Running PLMS Sampling with {total_steps} timesteps")
|
131 |
+
|
132 |
+
iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps)
|
133 |
+
old_eps = []
|
134 |
+
|
135 |
+
for i, step in enumerate(iterator):
|
136 |
+
index = total_steps - i - 1
|
137 |
+
ts = torch.full((b,), step, device=device, dtype=torch.long)
|
138 |
+
ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)
|
139 |
+
|
140 |
+
if mask is not None:
|
141 |
+
assert x0 is not None
|
142 |
+
img_orig = self.model.q_sample(x0, ts) # TODO: deterministic forward pass?
|
143 |
+
img = img_orig * mask + (1. - mask) * img
|
144 |
+
|
145 |
+
outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
|
146 |
+
quantize_denoised=quantize_denoised, temperature=temperature,
|
147 |
+
noise_dropout=noise_dropout, score_corrector=score_corrector,
|
148 |
+
corrector_kwargs=corrector_kwargs,
|
149 |
+
unconditional_guidance_scale=unconditional_guidance_scale,
|
150 |
+
unconditional_conditioning=unconditional_conditioning,
|
151 |
+
old_eps=old_eps, t_next=ts_next)
|
152 |
+
img, pred_x0, e_t = outs
|
153 |
+
old_eps.append(e_t)
|
154 |
+
if len(old_eps) >= 4:
|
155 |
+
old_eps.pop(0)
|
156 |
+
if callback: callback(i)
|
157 |
+
if img_callback: img_callback(pred_x0, i)
|
158 |
+
|
159 |
+
return img
|
160 |
+
|
161 |
+
@torch.no_grad()
|
162 |
+
def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
|
163 |
+
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
|
164 |
+
unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None):
|
165 |
+
b, *_, device = *x.shape, x.device
|
166 |
+
|
167 |
+
def get_model_output(x, t):
|
168 |
+
if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
|
169 |
+
e_t = self.model.apply_model(x, t, c)
|
170 |
+
else:
|
171 |
+
x_in = torch.cat([x] * 2)
|
172 |
+
t_in = torch.cat([t] * 2)
|
173 |
+
c_in = torch.cat([unconditional_conditioning, c])
|
174 |
+
e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
|
175 |
+
e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
|
176 |
+
|
177 |
+
if score_corrector is not None:
|
178 |
+
assert self.model.parameterization == "eps"
|
179 |
+
e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
|
180 |
+
|
181 |
+
return e_t
|
182 |
+
|
183 |
+
alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
|
184 |
+
alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
|
185 |
+
sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
|
186 |
+
sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
|
187 |
+
|
188 |
+
def get_x_prev_and_pred_x0(e_t, index):
|
189 |
+
# select parameters corresponding to the currently considered timestep
|
190 |
+
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
|
191 |
+
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
|
192 |
+
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
|
193 |
+
sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device)
|
194 |
+
|
195 |
+
# current prediction for x_0
|
196 |
+
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
|
197 |
+
if quantize_denoised:
|
198 |
+
pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
|
199 |
+
# direction pointing to x_t
|
200 |
+
dir_xt = (1. - a_prev - sigma_t ** 2).sqrt() * e_t
|
201 |
+
noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
|
202 |
+
if noise_dropout > 0.:
|
203 |
+
noise = torch.nn.functional.dropout(noise, p=noise_dropout)
|
204 |
+
x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
|
205 |
+
return x_prev, pred_x0
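# PLMS warm-up and multistep update, matching the branches below:
#   step 0:  Euler prediction plus one corrector pass, e' = (e_t + e_next) / 2,
#            where e_next is the model output re-evaluated at the next timestep
#   step 1:  2nd-order Adams-Bashforth, e' = (3*e_t - e_{t-1}) / 2
#   step 2:  3rd-order Adams-Bashforth, e' = (23*e_t - 16*e_{t-1} + 5*e_{t-2}) / 12
#   step 3+: 4th-order Adams-Bashforth, e' = (55*e_t - 59*e_{t-1} + 37*e_{t-2} - 9*e_{t-3}) / 24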
|
206 |
+
|
207 |
+
e_t = get_model_output(x, t)
|
208 |
+
if len(old_eps) == 0:
|
209 |
+
# Pseudo Improved Euler (2nd order)
|
210 |
+
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
|
211 |
+
e_t_next = get_model_output(x_prev, t_next)
|
212 |
+
e_t_prime = (e_t + e_t_next) / 2
|
213 |
+
elif len(old_eps) == 1:
|
214 |
+
# 2nd order Pseudo Linear Multistep (Adams-Bashforth)
|
215 |
+
e_t_prime = (3 * e_t - old_eps[-1]) / 2
|
216 |
+
elif len(old_eps) == 2:
|
217 |
+
# 3rd order Pseudo Linear Multistep (Adams-Bashforth)
|
218 |
+
e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
|
219 |
+
elif len(old_eps) >= 3:
|
220 |
+
# 4th order Pseudo Linear Multistep (Adams-Bashforth)
|
221 |
+
e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24
|
222 |
+
|
223 |
+
x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)
|
224 |
+
|
225 |
+
return x_prev, pred_x0, e_t
|