Upload 49 files
- .gitattributes +6 -0
- TripoSR +0 -1
- TripoSR/.gitignore +164 -0
- TripoSR/LICENSE +21 -0
- TripoSR/README.md +80 -0
- TripoSR/__pycache__/obj_gen.cpython-310.pyc +0 -0
- TripoSR/examples/captured.jpeg +3 -0
- TripoSR/examples/captured_p.png +3 -0
- TripoSR/examples/chair.png +0 -0
- TripoSR/examples/flamingo.png +0 -0
- TripoSR/examples/hamburger.png +0 -0
- TripoSR/examples/horse.png +0 -0
- TripoSR/examples/iso_house.png +3 -0
- TripoSR/examples/marble.png +0 -0
- TripoSR/examples/police_woman.png +0 -0
- TripoSR/examples/poly_fox.png +0 -0
- TripoSR/examples/robot.png +0 -0
- TripoSR/examples/stripes.png +0 -0
- TripoSR/examples/teapot.png +0 -0
- TripoSR/examples/tiger_girl.png +0 -0
- TripoSR/examples/unicorn.png +0 -0
- TripoSR/figures/comparison800.gif +3 -0
- TripoSR/figures/scatter-comparison.png +0 -0
- TripoSR/figures/teaser800.gif +3 -0
- TripoSR/figures/visual_comparisons.jpg +3 -0
- TripoSR/gradio_app.py +187 -0
- TripoSR/obj_gen.py +92 -0
- TripoSR/output/0/input.png +0 -0
- TripoSR/output/0/mesh.obj +0 -0
- TripoSR/requirements.txt +9 -0
- TripoSR/run.py +162 -0
- TripoSR/tsr/__pycache__/system.cpython-310.pyc +0 -0
- TripoSR/tsr/__pycache__/utils.cpython-310.pyc +0 -0
- TripoSR/tsr/models/__pycache__/isosurface.cpython-310.pyc +0 -0
- TripoSR/tsr/models/__pycache__/nerf_renderer.cpython-310.pyc +0 -0
- TripoSR/tsr/models/__pycache__/network_utils.cpython-310.pyc +0 -0
- TripoSR/tsr/models/isosurface.py +52 -0
- TripoSR/tsr/models/nerf_renderer.py +180 -0
- TripoSR/tsr/models/network_utils.py +124 -0
- TripoSR/tsr/models/tokenizers/__pycache__/image.cpython-310.pyc +0 -0
- TripoSR/tsr/models/tokenizers/__pycache__/triplane.cpython-310.pyc +0 -0
- TripoSR/tsr/models/tokenizers/image.py +66 -0
- TripoSR/tsr/models/tokenizers/triplane.py +45 -0
- TripoSR/tsr/models/transformer/__pycache__/attention.cpython-310.pyc +0 -0
- TripoSR/tsr/models/transformer/__pycache__/basic_transformer_block.cpython-310.pyc +0 -0
- TripoSR/tsr/models/transformer/__pycache__/transformer_1d.cpython-310.pyc +0 -0
- TripoSR/tsr/models/transformer/attention.py +653 -0
- TripoSR/tsr/models/transformer/basic_transformer_block.py +334 -0
- TripoSR/tsr/models/transformer/transformer_1d.py +219 -0
- TripoSR/tsr/system.py +203 -0
.gitattributes
CHANGED
@@ -34,3 +34,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 wheel/torchmcubes-0.1.0-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
+TripoSR/examples/captured_p.png filter=lfs diff=lfs merge=lfs -text
+TripoSR/examples/captured.jpeg filter=lfs diff=lfs merge=lfs -text
+TripoSR/examples/iso_house.png filter=lfs diff=lfs merge=lfs -text
+TripoSR/figures/comparison800.gif filter=lfs diff=lfs merge=lfs -text
+TripoSR/figures/teaser800.gif filter=lfs diff=lfs merge=lfs -text
+TripoSR/figures/visual_comparisons.jpg filter=lfs diff=lfs merge=lfs -text
TripoSR
DELETED
@@ -1 +0,0 @@
-Subproject commit 8e51fec8095c9eae20e6ea7c9aef6368c5631a21
TripoSR/.gitignore
ADDED
@@ -0,0 +1,164 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+#   .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# default output directory
+output/
+outputs/
TripoSR/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Tripo AI & Stability AI
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
TripoSR/README.md
ADDED
@@ -0,0 +1,80 @@
+# TripoSR <a href="https://huggingface.co/stabilityai/TripoSR"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Model_Card-Huggingface-orange"></a> <a href="https://huggingface.co/spaces/stabilityai/TripoSR"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Gradio%20Demo-Huggingface-orange"></a> <a href="https://arxiv.org/abs/2403.02151"><img src="https://img.shields.io/badge/Arxiv-2403.02151-B31B1B.svg"></a>
+
+<div align="center">
+  <img src="figures/teaser800.gif" alt="Teaser Video">
+</div>
+
+This is the official codebase for **TripoSR**, a state-of-the-art open-source model for **fast** feedforward 3D reconstruction from a single image, collaboratively developed by [Tripo AI](https://www.tripo3d.ai/) and [Stability AI](https://stability.ai/).
+<br><br>
+Leveraging the principles of the [Large Reconstruction Model (LRM)](https://yiconghong.me/LRM/), TripoSR brings to the table key advancements that significantly boost both the speed and quality of 3D reconstruction. Our model is distinguished by its ability to rapidly process inputs, generating high-quality 3D models in less than 0.5 seconds on an NVIDIA A100 GPU. TripoSR has exhibited superior performance in both qualitative and quantitative evaluations, outperforming other open-source alternatives across multiple public datasets. The figures below illustrate visual comparisons and metrics showcasing TripoSR's performance relative to other leading models. Details about the model architecture, training process, and comparisons can be found in this [technical report](https://arxiv.org/abs/2403.02151).
+
+<!--
+<div align="center">
+  <img src="figures/comparison800.gif" alt="Teaser Video">
+</div>
+-->
+<p align="center">
+  <img width="800" src="figures/visual_comparisons.jpg"/>
+</p>
+
+<p align="center">
+  <img width="450" src="figures/scatter-comparison.png"/>
+</p>
+
+
+The model is released under the MIT license, which includes the source code, pretrained models, and an interactive online demo. Our goal is to empower researchers, developers, and creatives to push the boundaries of what's possible in 3D generative AI and 3D content creation.
+
+## Getting Started
+### Installation
+- Python >= 3.8
+- Install CUDA if available
+- Install PyTorch according to your platform: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/) **[Please make sure that the locally installed CUDA major version matches the PyTorch-shipped CUDA major version. For example, if you have CUDA 11.x installed, make sure to install PyTorch compiled with CUDA 11.x.]**
+- Update setuptools with `pip install --upgrade setuptools`
+- Install the other dependencies with `pip install -r requirements.txt`
+
+### Manual Inference
+```sh
+python run.py examples/chair.png --output-dir output/
+```
+This will save the reconstructed 3D model to `output/`. You can also specify more than one image path, separated by spaces. The default options take about **6GB of VRAM** for a single image input.
+
+For detailed usage of this script, use `python run.py --help`.
+
+### Local Gradio App
+Install Gradio:
+```sh
+pip install gradio
+```
+Start the Gradio app:
+```sh
+python gradio_app.py
+```
+
+## Troubleshooting
+> AttributeError: module 'torchmcubes_module' has no attribute 'mcubes_cuda'
+
+or
+
+> torchmcubes was not compiled with CUDA support, use CPU version instead.
+
+This means `torchmcubes` was compiled without CUDA support. Please make sure that
+
+- the locally installed CUDA major version matches the PyTorch-shipped CUDA major version (for example, if you have CUDA 11.x installed, make sure to install PyTorch compiled with CUDA 11.x), and
+- `setuptools>=49.6.0` (if not, upgrade with `pip install --upgrade setuptools`).
+
+Then re-install `torchmcubes`:
+
+```sh
+pip uninstall torchmcubes
+pip install git+https://github.com/tatsy/torchmcubes.git
+```
+
+## Citation
+```BibTeX
+@article{TripoSR2024,
+  title={TripoSR: Fast 3D Object Reconstruction from a Single Image},
+  author={Tochilkin, Dmitry and Pankratz, David and Liu, Zexiang and Huang, Zixuan and Letts, Adam and Li, Yangguang and Liang, Ding and Laforte, Christian and Jampani, Varun and Cao, Yan-Pei},
+  journal={arXiv preprint arXiv:2403.02151},
+  year={2024}
+}
+```
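For reference, the manual-inference command in the README above boils down to roughly the following Python, mirroring run.py later in this diff (a sketch, not part of the commit; it assumes a pre-processed RGB input so the rembg step can be skipped, and that the checkpoint downloads from the Hugging Face Hub):

```python
import torch
from PIL import Image

from tsr.system import TSR

device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = TSR.from_pretrained(
    "stabilityai/TripoSR", config_name="config.yaml", weight_name="model.ckpt"
)
model.renderer.set_chunk_size(8192)  # 0 disables chunking; smaller values save VRAM
model.to(device)

# the bundled examples are already background-removed and centered
image = Image.open("examples/chair.png").convert("RGB")
with torch.no_grad():
    scene_codes = model([image], device=device)
meshes = model.extract_mesh(scene_codes, resolution=256)
meshes[0].export("output/mesh.obj")  # the output/ directory must already exist
```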
TripoSR/__pycache__/obj_gen.cpython-310.pyc
ADDED
Binary file (2.39 kB)
TripoSR/examples/captured.jpeg
ADDED
Git LFS Details
TripoSR/examples/captured_p.png
ADDED
Git LFS Details
TripoSR/examples/chair.png
ADDED
TripoSR/examples/flamingo.png
ADDED
TripoSR/examples/hamburger.png
ADDED
TripoSR/examples/horse.png
ADDED
TripoSR/examples/iso_house.png
ADDED
Git LFS Details
TripoSR/examples/marble.png
ADDED
TripoSR/examples/police_woman.png
ADDED
TripoSR/examples/poly_fox.png
ADDED
TripoSR/examples/robot.png
ADDED
TripoSR/examples/stripes.png
ADDED
TripoSR/examples/teapot.png
ADDED
TripoSR/examples/tiger_girl.png
ADDED
TripoSR/examples/unicorn.png
ADDED
TripoSR/figures/comparison800.gif
ADDED
Git LFS Details
TripoSR/figures/scatter-comparison.png
ADDED
TripoSR/figures/teaser800.gif
ADDED
Git LFS Details
TripoSR/figures/visual_comparisons.jpg
ADDED
Git LFS Details
TripoSR/gradio_app.py
ADDED
@@ -0,0 +1,187 @@
+import logging
+import os
+import tempfile
+import time
+
+import gradio as gr
+import numpy as np
+import rembg
+import torch
+from PIL import Image
+from functools import partial
+
+from tsr.system import TSR
+from tsr.utils import remove_background, resize_foreground, to_gradio_3d_orientation
+
+import argparse
+
+
+if torch.cuda.is_available():
+    device = "cuda:0"
+else:
+    device = "cpu"
+
+model = TSR.from_pretrained(
+    "stabilityai/TripoSR",
+    config_name="config.yaml",
+    weight_name="model.ckpt",
+)
+
+# adjust the chunk size to balance between speed and memory usage
+model.renderer.set_chunk_size(8192)
+model.to(device)
+
+rembg_session = rembg.new_session()
+
+
+def check_input_image(input_image):
+    if input_image is None:
+        raise gr.Error("No image uploaded!")
+
+
+def preprocess(input_image, do_remove_background, foreground_ratio):
+    def fill_background(image):
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5
+        image = Image.fromarray((image * 255.0).astype(np.uint8))
+        return image
+
+    if do_remove_background:
+        image = input_image.convert("RGB")
+        image = remove_background(image, rembg_session)
+        image = resize_foreground(image, foreground_ratio)
+        image = fill_background(image)
+    else:
+        image = input_image
+        if image.mode == "RGBA":
+            image = fill_background(image)
+    return image
+
+
+def generate(image, mc_resolution, formats=["obj", "glb"]):
+    scene_codes = model(image, device=device)
+    mesh = model.extract_mesh(scene_codes, resolution=mc_resolution)[0]
+    mesh = to_gradio_3d_orientation(mesh)
+    rv = []
+    for format in formats:
+        mesh_path = tempfile.NamedTemporaryFile(suffix=f".{format}", delete=False)
+        mesh.export(mesh_path.name)
+        rv.append(mesh_path.name)
+    return rv
+
+
+def run_example(image_pil):
+    preprocessed = preprocess(image_pil, False, 0.9)
+    mesh_name_obj, mesh_name_glb = generate(preprocessed, 256, ["obj", "glb"])
+    return preprocessed, mesh_name_obj, mesh_name_glb
+
+
+with gr.Blocks(title="TripoSR") as interface:
+    gr.Markdown(
+        """
+    # TripoSR Demo
+    [TripoSR](https://github.com/VAST-AI-Research/TripoSR) is a state-of-the-art open-source model for **fast** feedforward 3D reconstruction from a single image, collaboratively developed by [Tripo AI](https://www.tripo3d.ai/) and [Stability AI](https://stability.ai/).
+
+    **Tips:**
+    1. If you find the result unsatisfactory, try changing the foreground ratio; it might improve the result.
+    2. You can disable "Remove Background" for the provided examples since they have already been preprocessed.
+    3. Otherwise, disable the "Remove Background" option only if your input image is RGBA with a transparent background, and its contents are centered and occupy more than 70% of the image width or height.
+    """
+    )
+    with gr.Row(variant="panel"):
+        with gr.Column():
+            with gr.Row():
+                input_image = gr.Image(
+                    label="Input Image",
+                    image_mode="RGBA",
+                    sources="upload",
+                    type="pil",
+                    elem_id="content_image",
+                )
+                processed_image = gr.Image(label="Processed Image", interactive=False)
+            with gr.Row():
+                with gr.Group():
+                    do_remove_background = gr.Checkbox(
+                        label="Remove Background", value=True
+                    )
+                    foreground_ratio = gr.Slider(
+                        label="Foreground Ratio",
+                        minimum=0.5,
+                        maximum=1.0,
+                        value=0.85,
+                        step=0.05,
+                    )
+                    mc_resolution = gr.Slider(
+                        label="Marching Cubes Resolution",
+                        minimum=32,
+                        maximum=320,
+                        value=256,
+                        step=32,
+                    )
+            with gr.Row():
+                submit = gr.Button("Generate", elem_id="generate", variant="primary")
+        with gr.Column():
+            with gr.Tab("OBJ"):
+                output_model_obj = gr.Model3D(
+                    label="Output Model (OBJ Format)",
+                    interactive=False,
+                )
+                gr.Markdown("Note: The model shown here is flipped. Download to get correct results.")
+            with gr.Tab("GLB"):
+                output_model_glb = gr.Model3D(
+                    label="Output Model (GLB Format)",
+                    interactive=False,
+                )
+                gr.Markdown("Note: The model shown here has a darker appearance. Download to get correct results.")
+    with gr.Row(variant="panel"):
+        gr.Examples(
+            examples=[
+                "examples/hamburger.png",
+                "examples/poly_fox.png",
+                "examples/robot.png",
+                "examples/teapot.png",
+                "examples/tiger_girl.png",
+                "examples/horse.png",
+                "examples/flamingo.png",
+                "examples/unicorn.png",
+                "examples/chair.png",
+                "examples/iso_house.png",
+                "examples/marble.png",
+                "examples/police_woman.png",
+                "examples/captured_p.png",
+            ],
+            inputs=[input_image],
+            outputs=[processed_image, output_model_obj, output_model_glb],
+            cache_examples=False,
+            fn=partial(run_example),
+            label="Examples",
+            examples_per_page=20,
+        )
+    submit.click(fn=check_input_image, inputs=[input_image]).success(
+        fn=preprocess,
+        inputs=[input_image, do_remove_background, foreground_ratio],
+        outputs=[processed_image],
+    ).success(
+        fn=generate,
+        inputs=[processed_image, mc_resolution],
+        outputs=[output_model_obj, output_model_glb],
+    )
+
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--username', type=str, default=None, help='Username for authentication')
+    parser.add_argument('--password', type=str, default=None, help='Password for authentication')
+    parser.add_argument('--port', type=int, default=7860, help='Port to run the server listener on')
+    parser.add_argument("--listen", action='store_true', help="launch gradio with 0.0.0.0 as server name, allowing to respond to network requests")
+    parser.add_argument("--share", action='store_true', help="use share=True for gradio and make the UI accessible through their site")
+    parser.add_argument("--queuesize", type=int, default=1, help="launch gradio queue max_size")
+    args = parser.parse_args()
+    interface.queue(max_size=args.queuesize)
+    interface.launch(
+        auth=(args.username, args.password) if (args.username and args.password) else None,
+        share=args.share,
+        server_name="0.0.0.0" if args.listen else None,
+        server_port=args.port
+    )
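The submit chain above runs check_input_image → preprocess → generate. For reference, a hypothetical headless equivalent of that chain (a sketch, not part of the commit; note that merely importing gradio_app executes the module-level TSR.from_pretrained load):

```python
from PIL import Image

from gradio_app import generate, preprocess  # import triggers the model load

img = Image.open("examples/chair.png").convert("RGBA")
processed = preprocess(img, do_remove_background=True, foreground_ratio=0.85)
# generate() writes temp files and returns their paths, one per requested format
obj_path, glb_path = generate(processed, mc_resolution=256, formats=["obj", "glb"])
print(obj_path, glb_path)
```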
TripoSR/obj_gen.py
ADDED
@@ -0,0 +1,92 @@
+import logging
+import os
+import tempfile
+import time
+
+import numpy as np
+import rembg
+import torch
+from PIL import Image
+from functools import partial
+
+from tsr.system import TSR
+from tsr.utils import remove_background, resize_foreground, to_gradio_3d_orientation
+
+import argparse
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+
+device = "cpu"
+
+model = TSR.from_pretrained(
+    "stabilityai/TripoSR",
+    config_name="config.yaml",
+    weight_name="model.ckpt",
+)
+
+# adjust the chunk size to balance between speed and memory usage
+model.renderer.set_chunk_size(8192)
+model.to(device)
+
+rembg_session = rembg.new_session()
+
+
+
+
+def preprocess(input_image, do_remove_background, foreground_ratio):
+    def fill_background(image):
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5
+        image = Image.fromarray((image * 255.0).astype(np.uint8))
+        return image
+
+    if do_remove_background:
+        image = input_image.convert("RGB")
+        image = remove_background(image, rembg_session)
+        image = resize_foreground(image, foreground_ratio)
+        image = fill_background(image)
+    else:
+        image = input_image
+        if image.mode == "RGBA":
+            image = fill_background(image)
+    return image
+
+
+def generate(image, mc_resolution, formats=["obj", "glb"], path="output.obj"):
+    scene_codes = model(image, device=device)
+    mesh = model.extract_mesh(scene_codes, resolution=mc_resolution)[0]
+    mesh = to_gradio_3d_orientation(mesh)
+    rv = []
+    for format in formats:
+        mesh_path = path.replace(".obj", f".{format}")
+        mesh.export(mesh_path)
+        rv.append(mesh_path)
+    return rv
+
+
+def run_example(image_pil):
+    preprocessed = preprocess(image_pil, False, 0.9)
+    mesh_name_obj, mesh_name_glb = generate(preprocessed, 256, ["obj", "glb"])
+    return preprocessed, mesh_name_obj, mesh_name_glb
+
+def generate_obj_from_image(image_pil, path="output.obj"):
+    # Preprocess the image, removing the background, with a foreground ratio of 0.9
+    preprocessed = preprocess(image_pil, True, 0.9)
+
+    # Generate the mesh and get the path to the .obj file
+    mesh_paths = generate(preprocessed, 256, ["obj"], path)
+
+    # Return the path to the .obj file
+    return mesh_paths[0]
+
+if __name__ == "__main__":
+    # run a test
+    image_path = "output.png"
+    image = Image.open(image_path)
+    generate_obj_from_image(image, "output.obj")
+    # move the .obj file to the output directory
+
+
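obj_gen.py is the programmatic entry point added alongside the Gradio app; a minimal call looks like this (a sketch, not part of the commit; the module loads the model on CPU at import time, so the first import is slow):

```python
from PIL import Image

from obj_gen import generate_obj_from_image  # loads TSR on CPU at import

image = Image.open("examples/chair.png")
# the target directory must already exist; output/0/mesh.obj below is
# plausibly the artifact of exactly such a run
obj_path = generate_obj_from_image(image, path="output/0/mesh.obj")
print(obj_path)
```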
TripoSR/output/0/input.png
ADDED
TripoSR/output/0/mesh.obj
ADDED
The diff for this file is too large to render.
TripoSR/requirements.txt
ADDED
@@ -0,0 +1,9 @@
+omegaconf==2.3.0
+Pillow==10.1.0
+einops==0.7.0
+git+https://github.com/tatsy/torchmcubes.git
+transformers==4.35.0
+trimesh==4.0.5
+rembg
+huggingface-hub
+imageio[ffmpeg]
TripoSR/run.py
ADDED
@@ -0,0 +1,162 @@
+import argparse
+import logging
+import os
+import time
+
+import numpy as np
+import rembg
+import torch
+from PIL import Image
+
+from tsr.system import TSR
+from tsr.utils import remove_background, resize_foreground, save_video
+
+
+class Timer:
+    def __init__(self):
+        self.items = {}
+        self.time_scale = 1000.0  # ms
+        self.time_unit = "ms"
+
+    def start(self, name: str) -> None:
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        self.items[name] = time.time()
+        logging.info(f"{name} ...")
+
+    def end(self, name: str) -> None:
+        if name not in self.items:
+            return
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        start_time = self.items.pop(name)
+        delta = time.time() - start_time
+        t = delta * self.time_scale
+        logging.info(f"{name} finished in {t:.2f}{self.time_unit}.")
+
+
+timer = Timer()
+
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO
+)
+parser = argparse.ArgumentParser()
+parser.add_argument("image", type=str, nargs="+", help="Path to input image(s).")
+parser.add_argument(
+    "--device",
+    default="cuda:0",
+    type=str,
+    help="Device to use. If no CUDA-compatible device is found, will fall back to 'cpu'. Default: 'cuda:0'",
+)
+parser.add_argument(
+    "--pretrained-model-name-or-path",
+    default="stabilityai/TripoSR",
+    type=str,
+    help="Path to the pretrained model. Can be either a Hugging Face model id or a local path. Default: 'stabilityai/TripoSR'",
+)
+parser.add_argument(
+    "--chunk-size",
+    default=8192,
+    type=int,
+    help="Evaluation chunk size for surface extraction and rendering. Smaller chunk size reduces VRAM usage but increases computation time. 0 for no chunking. Default: 8192",
+)
+parser.add_argument(
+    "--mc-resolution",
+    default=256,
+    type=int,
+    help="Marching cubes grid resolution. Default: 256"
+)
+parser.add_argument(
+    "--no-remove-bg",
+    action="store_true",
+    help="If specified, the background will NOT be automatically removed from the input image, and the input image should be an RGB image with a gray background and a properly sized foreground. Default: false",
+)
+parser.add_argument(
+    "--foreground-ratio",
+    default=0.85,
+    type=float,
+    help="Ratio of the foreground size to the image size. Only used when --no-remove-bg is not specified. Default: 0.85",
+)
+parser.add_argument(
+    "--output-dir",
+    default="output/",
+    type=str,
+    help="Output directory to save the results. Default: 'output/'",
+)
+parser.add_argument(
+    "--model-save-format",
+    default="obj",
+    type=str,
+    choices=["obj", "glb"],
+    help="Format to save the extracted mesh. Default: 'obj'",
+)
+parser.add_argument(
+    "--render",
+    action="store_true",
+    help="If specified, save a NeRF-rendered video. Default: false",
+)
+args = parser.parse_args()
+
+output_dir = args.output_dir
+os.makedirs(output_dir, exist_ok=True)
+
+device = args.device
+if not torch.cuda.is_available():
+    device = "cpu"
+
+timer.start("Initializing model")
+model = TSR.from_pretrained(
+    args.pretrained_model_name_or_path,
+    config_name="config.yaml",
+    weight_name="model.ckpt",
+)
+model.renderer.set_chunk_size(args.chunk_size)
+model.to(device)
+timer.end("Initializing model")
+
+timer.start("Processing images")
+images = []
+
+if args.no_remove_bg:
+    rembg_session = None
+else:
+    rembg_session = rembg.new_session()
+
+for i, image_path in enumerate(args.image):
+    if args.no_remove_bg:
+        image = np.array(Image.open(image_path).convert("RGB"))
+    else:
+        image = remove_background(Image.open(image_path), rembg_session)
+        image = resize_foreground(image, args.foreground_ratio)
+        image = np.array(image).astype(np.float32) / 255.0
+        image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5
+        image = Image.fromarray((image * 255.0).astype(np.uint8))
+        if not os.path.exists(os.path.join(output_dir, str(i))):
+            os.makedirs(os.path.join(output_dir, str(i)))
+        image.save(os.path.join(output_dir, str(i), "input.png"))
+    images.append(image)
+timer.end("Processing images")
+
+for i, image in enumerate(images):
+    logging.info(f"Running image {i + 1}/{len(images)} ...")
+
+    timer.start("Running model")
+    with torch.no_grad():
+        scene_codes = model([image], device=device)
+    timer.end("Running model")
+
+    if args.render:
+        timer.start("Rendering")
+        render_images = model.render(scene_codes, n_views=30, return_type="pil")
+        for ri, render_image in enumerate(render_images[0]):
+            render_image.save(os.path.join(output_dir, str(i), f"render_{ri:03d}.png"))
+        save_video(
+            render_images[0], os.path.join(output_dir, str(i), "render.mp4"), fps=30
+        )
+        timer.end("Rendering")
+
+    timer.start("Exporting mesh")
+    meshes = model.extract_mesh(scene_codes, resolution=args.mc_resolution)
+    meshes[0].export(os.path.join(output_dir, str(i), f"mesh.{args.model_save_format}"))
+    timer.end("Exporting mesh")
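The --chunk-size flag above feeds model.renderer.set_chunk_size(), defined in tsr/models/nerf_renderer.py below; a quick sketch of the trade-off it controls (assuming a loaded TSR instance named model):

```python
# 0 = no chunking: the full query batch in one pass; fastest, highest peak VRAM
model.renderer.set_chunk_size(0)

# smaller chunks: lower peak VRAM during surface extraction and rendering,
# at the cost of more sequential passes
model.renderer.set_chunk_size(2048)
```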
TripoSR/tsr/__pycache__/system.cpython-310.pyc
ADDED
Binary file (5.15 kB)
TripoSR/tsr/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (13.5 kB)
TripoSR/tsr/models/__pycache__/isosurface.cpython-310.pyc
ADDED
Binary file (2.23 kB)
TripoSR/tsr/models/__pycache__/nerf_renderer.cpython-310.pyc
ADDED
Binary file (5.28 kB)
TripoSR/tsr/models/__pycache__/network_utils.cpython-310.pyc
ADDED
Binary file (3.41 kB)
TripoSR/tsr/models/isosurface.py
ADDED
@@ -0,0 +1,52 @@
+from typing import Callable, Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torchmcubes import marching_cubes
+
+
+class IsosurfaceHelper(nn.Module):
+    points_range: Tuple[float, float] = (0, 1)
+
+    @property
+    def grid_vertices(self) -> torch.FloatTensor:
+        raise NotImplementedError
+
+
+class MarchingCubeHelper(IsosurfaceHelper):
+    def __init__(self, resolution: int) -> None:
+        super().__init__()
+        self.resolution = resolution
+        self.mc_func: Callable = marching_cubes
+        self._grid_vertices: Optional[torch.FloatTensor] = None
+
+    @property
+    def grid_vertices(self) -> torch.FloatTensor:
+        if self._grid_vertices is None:
+            # keep the vertices on CPU so that we can support very large resolution
+            x, y, z = (
+                torch.linspace(*self.points_range, self.resolution),
+                torch.linspace(*self.points_range, self.resolution),
+                torch.linspace(*self.points_range, self.resolution),
+            )
+            x, y, z = torch.meshgrid(x, y, z, indexing="ij")
+            verts = torch.cat(
+                [x.reshape(-1, 1), y.reshape(-1, 1), z.reshape(-1, 1)], dim=-1
+            ).reshape(-1, 3)
+            self._grid_vertices = verts
+        return self._grid_vertices
+
+    def forward(
+        self,
+        level: torch.FloatTensor,
+    ) -> Tuple[torch.FloatTensor, torch.LongTensor]:
+        level = -level.view(self.resolution, self.resolution, self.resolution)
+        try:
+            v_pos, t_pos_idx = self.mc_func(level.detach(), 0.0)
+        except AttributeError:
+            print("torchmcubes was not compiled with CUDA support, use CPU version instead.")
+            v_pos, t_pos_idx = self.mc_func(level.detach().cpu(), 0.0)
+        v_pos = v_pos[..., [2, 1, 0]]
+        v_pos = v_pos / (self.resolution - 1.0)
+        return v_pos.to(level.device), t_pos_idx.to(level.device)
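For orientation, a minimal standalone use of MarchingCubeHelper on an analytic field (a sketch; it assumes a density-style level set that is positive inside the surface, which the sign flip in forward() maps to the convention torchmcubes expects):

```python
import torch

from tsr.models.isosurface import MarchingCubeHelper

helper = MarchingCubeHelper(resolution=64)
pts = helper.grid_vertices  # (64**3, 3), coordinates in [0, 1]

# positive inside a sphere of radius 0.3 centered in the unit cube
level = 0.3 - (pts - 0.5).norm(dim=-1)

v_pos, t_pos_idx = helper(level)  # vertices in [0, 1] and triangle indices
```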
TripoSR/tsr/models/nerf_renderer.py
ADDED
@@ -0,0 +1,180 @@
+from dataclasses import dataclass
+from typing import Dict
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange, reduce
+
+from ..utils import (
+    BaseModule,
+    chunk_batch,
+    get_activation,
+    rays_intersect_bbox,
+    scale_tensor,
+)
+
+
+class TriplaneNeRFRenderer(BaseModule):
+    @dataclass
+    class Config(BaseModule.Config):
+        radius: float
+
+        feature_reduction: str = "concat"
+        density_activation: str = "trunc_exp"
+        density_bias: float = -1.0
+        color_activation: str = "sigmoid"
+        num_samples_per_ray: int = 128
+        randomized: bool = False
+
+    cfg: Config
+
+    def configure(self) -> None:
+        assert self.cfg.feature_reduction in ["concat", "mean"]
+        self.chunk_size = 0
+
+    def set_chunk_size(self, chunk_size: int):
+        assert (
+            chunk_size >= 0
+        ), "chunk_size must be a non-negative integer (0 for no chunking)."
+        self.chunk_size = chunk_size
+
+    def query_triplane(
+        self,
+        decoder: torch.nn.Module,
+        positions: torch.Tensor,
+        triplane: torch.Tensor,
+    ) -> Dict[str, torch.Tensor]:
+        input_shape = positions.shape[:-1]
+        positions = positions.view(-1, 3)
+
+        # positions in (-radius, radius)
+        # normalized to (-1, 1) for grid sample
+        positions = scale_tensor(
+            positions, (-self.cfg.radius, self.cfg.radius), (-1, 1)
+        )
+
+        def _query_chunk(x):
+            indices2D: torch.Tensor = torch.stack(
+                (x[..., [0, 1]], x[..., [0, 2]], x[..., [1, 2]]),
+                dim=-3,
+            )
+            out: torch.Tensor = F.grid_sample(
+                rearrange(triplane, "Np Cp Hp Wp -> Np Cp Hp Wp", Np=3),
+                rearrange(indices2D, "Np N Nd -> Np () N Nd", Np=3),
+                align_corners=False,
+                mode="bilinear",
+            )
+            if self.cfg.feature_reduction == "concat":
+                out = rearrange(out, "Np Cp () N -> N (Np Cp)", Np=3)
+            elif self.cfg.feature_reduction == "mean":
+                out = reduce(out, "Np Cp () N -> N Cp", Np=3, reduction="mean")
+            else:
+                raise NotImplementedError
+
+            net_out: Dict[str, torch.Tensor] = decoder(out)
+            return net_out
+
+        if self.chunk_size > 0:
+            net_out = chunk_batch(_query_chunk, self.chunk_size, positions)
+        else:
+            net_out = _query_chunk(positions)
+
+        net_out["density_act"] = get_activation(self.cfg.density_activation)(
+            net_out["density"] + self.cfg.density_bias
+        )
+        net_out["color"] = get_activation(self.cfg.color_activation)(
+            net_out["features"]
+        )
+
+        net_out = {k: v.view(*input_shape, -1) for k, v in net_out.items()}
+
+        return net_out
+
+    def _forward(
+        self,
+        decoder: torch.nn.Module,
+        triplane: torch.Tensor,
+        rays_o: torch.Tensor,
+        rays_d: torch.Tensor,
+        **kwargs,
+    ):
+        rays_shape = rays_o.shape[:-1]
+        rays_o = rays_o.view(-1, 3)
+        rays_d = rays_d.view(-1, 3)
+        n_rays = rays_o.shape[0]
+
+        t_near, t_far, rays_valid = rays_intersect_bbox(rays_o, rays_d, self.cfg.radius)
+        t_near, t_far = t_near[rays_valid], t_far[rays_valid]
+
+        t_vals = torch.linspace(
+            0, 1, self.cfg.num_samples_per_ray + 1, device=triplane.device
+        )
+        t_mid = (t_vals[:-1] + t_vals[1:]) / 2.0
+        z_vals = t_near * (1 - t_mid[None]) + t_far * t_mid[None]  # (N_rays, N_samples)
+
+        xyz = (
+            rays_o[:, None, :] + z_vals[..., None] * rays_d[..., None, :]
+        )  # (N_rays, N_sample, 3)
+
+        mlp_out = self.query_triplane(
+            decoder=decoder,
+            positions=xyz,
+            triplane=triplane,
+        )
+
+        eps = 1e-10
+        # deltas = z_vals[:, 1:] - z_vals[:, :-1]  # (N_rays, N_samples)
+        deltas = t_vals[1:] - t_vals[:-1]  # (N_rays, N_samples)
+        alpha = 1 - torch.exp(
+            -deltas * mlp_out["density_act"][..., 0]
+        )  # (N_rays, N_samples)
+        accum_prod = torch.cat(
+            [
+                torch.ones_like(alpha[:, :1]),
+                torch.cumprod(1 - alpha[:, :-1] + eps, dim=-1),
+            ],
+            dim=-1,
+        )
+        weights = alpha * accum_prod  # (N_rays, N_samples)
+        comp_rgb_ = (weights[..., None] * mlp_out["color"]).sum(dim=-2)  # (N_rays, 3)
+        opacity_ = weights.sum(dim=-1)  # (N_rays)
+
+        comp_rgb = torch.zeros(
+            n_rays, 3, dtype=comp_rgb_.dtype, device=comp_rgb_.device
+        )
+        opacity = torch.zeros(n_rays, dtype=opacity_.dtype, device=opacity_.device)
+        comp_rgb[rays_valid] = comp_rgb_
+        opacity[rays_valid] = opacity_
+
+        comp_rgb += 1 - opacity[..., None]
+        comp_rgb = comp_rgb.view(*rays_shape, 3)
+
+        return comp_rgb
+
+    def forward(
+        self,
+        decoder: torch.nn.Module,
+        triplane: torch.Tensor,
+        rays_o: torch.Tensor,
+        rays_d: torch.Tensor,
+    ) -> Dict[str, torch.Tensor]:
+        if triplane.ndim == 4:
+            comp_rgb = self._forward(decoder, triplane, rays_o, rays_d)
+        else:
+            comp_rgb = torch.stack(
+                [
+                    self._forward(decoder, triplane[i], rays_o[i], rays_d[i])
+                    for i in range(triplane.shape[0])
+                ],
+                dim=0,
+            )
+
+        return comp_rgb
+
+    def train(self, mode=True):
+        self.randomized = mode and self.cfg.randomized
+        return super().train(mode=mode)
+
+    def eval(self):
+        self.randomized = False
+        return super().eval()
TripoSR/tsr/models/network_utils.py
ADDED
@@ -0,0 +1,124 @@
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from einops import rearrange
+
+from ..utils import BaseModule
+
+
+class TriplaneUpsampleNetwork(BaseModule):
+    @dataclass
+    class Config(BaseModule.Config):
+        in_channels: int
+        out_channels: int
+
+    cfg: Config
+
+    def configure(self) -> None:
+        self.upsample = nn.ConvTranspose2d(
+            self.cfg.in_channels, self.cfg.out_channels, kernel_size=2, stride=2
+        )
+
+    def forward(self, triplanes: torch.Tensor) -> torch.Tensor:
+        triplanes_up = rearrange(
+            self.upsample(
+                rearrange(triplanes, "B Np Ci Hp Wp -> (B Np) Ci Hp Wp", Np=3)
+            ),
+            "(B Np) Co Hp Wp -> B Np Co Hp Wp",
+            Np=3,
+        )
+        return triplanes_up
+
+
+class NeRFMLP(BaseModule):
+    @dataclass
+    class Config(BaseModule.Config):
+        in_channels: int
+        n_neurons: int
+        n_hidden_layers: int
+        activation: str = "relu"
+        bias: bool = True
+        weight_init: Optional[str] = "kaiming_uniform"
+        bias_init: Optional[str] = None
+
+    cfg: Config
+
+    def configure(self) -> None:
+        layers = [
+            self.make_linear(
+                self.cfg.in_channels,
+                self.cfg.n_neurons,
+                bias=self.cfg.bias,
+                weight_init=self.cfg.weight_init,
+                bias_init=self.cfg.bias_init,
+            ),
+            self.make_activation(self.cfg.activation),
+        ]
+        for i in range(self.cfg.n_hidden_layers - 1):
+            layers += [
+                self.make_linear(
+                    self.cfg.n_neurons,
+                    self.cfg.n_neurons,
+                    bias=self.cfg.bias,
+                    weight_init=self.cfg.weight_init,
+                    bias_init=self.cfg.bias_init,
+                ),
+                self.make_activation(self.cfg.activation),
+            ]
+        layers += [
+            self.make_linear(
+                self.cfg.n_neurons,
+                4,  # density 1 + features 3
+                bias=self.cfg.bias,
+                weight_init=self.cfg.weight_init,
+                bias_init=self.cfg.bias_init,
+            )
+        ]
+        self.layers = nn.Sequential(*layers)
+
+    def make_linear(
+        self,
+        dim_in,
+        dim_out,
+        bias=True,
+        weight_init=None,
+        bias_init=None,
+    ):
+        layer = nn.Linear(dim_in, dim_out, bias=bias)
+
+        if weight_init is None:
+            pass
+        elif weight_init == "kaiming_uniform":
+            torch.nn.init.kaiming_uniform_(layer.weight, nonlinearity="relu")
+        else:
+            raise NotImplementedError
+
+        if bias:
+            if bias_init is None:
+                pass
+            elif bias_init == "zero":
+                torch.nn.init.zeros_(layer.bias)
+            else:
+                raise NotImplementedError
+
+        return layer
+
+    def make_activation(self, activation):
+        if activation == "relu":
+            return nn.ReLU(inplace=True)
+        elif activation == "silu":
+            return nn.SiLU(inplace=True)
+        else:
+            raise NotImplementedError
+
+    def forward(self, x):
+        inp_shape = x.shape[:-1]
+        x = x.reshape(-1, x.shape[-1])
+
+        features = self.layers(x)
+        features = features.reshape(*inp_shape, -1)
+        out = {"density": features[..., 0:1], "features": features[..., 1:4]}
+
+        return out
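A shape-level sketch of NeRFMLP, the triplane feature decoder (it assumes BaseModule, defined in tsr/utils.py whose source is not part of this diff, accepts a plain dict config as in the upstream TripoSR repository; the config values here are hypothetical):

```python
import torch

from tsr.models.network_utils import NeRFMLP

# hypothetical dimensions; the real values come from the checkpoint's config.yaml
mlp = NeRFMLP({"in_channels": 120, "n_neurons": 64, "n_hidden_layers": 9})

x = torch.randn(2, 4096, 120)  # e.g. concatenated per-plane triplane features
out = mlp(x)
print(out["density"].shape, out["features"].shape)  # (2, 4096, 1) and (2, 4096, 3)
```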
TripoSR/tsr/models/tokenizers/__pycache__/image.cpython-310.pyc
ADDED
Binary file (2.38 kB)
TripoSR/tsr/models/tokenizers/__pycache__/triplane.cpython-310.pyc
ADDED
Binary file (1.76 kB)
TripoSR/tsr/models/tokenizers/image.py
ADDED
@@ -0,0 +1,66 @@
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+from einops import rearrange
+from huggingface_hub import hf_hub_download
+from transformers.models.vit.modeling_vit import ViTModel
+
+from ...utils import BaseModule
+
+
+class DINOSingleImageTokenizer(BaseModule):
+    @dataclass
+    class Config(BaseModule.Config):
+        pretrained_model_name_or_path: str = "facebook/dino-vitb16"
+        enable_gradient_checkpointing: bool = False
+
+    cfg: Config
+
+    def configure(self) -> None:
+        self.model: ViTModel = ViTModel(
+            ViTModel.config_class.from_pretrained(
+                hf_hub_download(
+                    repo_id=self.cfg.pretrained_model_name_or_path,
+                    filename="config.json",
+                )
+            )
+        )
+
+        if self.cfg.enable_gradient_checkpointing:
+            self.model.encoder.gradient_checkpointing = True
+
+        self.register_buffer(
+            "image_mean",
+            torch.as_tensor([0.485, 0.456, 0.406]).reshape(1, 1, 3, 1, 1),
+            persistent=False,
+        )
+        self.register_buffer(
+            "image_std",
+            torch.as_tensor([0.229, 0.224, 0.225]).reshape(1, 1, 3, 1, 1),
+            persistent=False,
+        )
+
+    def forward(self, images: torch.FloatTensor, **kwargs) -> torch.FloatTensor:
+        packed = False
+        if images.ndim == 4:
+            packed = True
+            images = images.unsqueeze(1)
+
+        batch_size, n_input_views = images.shape[:2]
+        images = (images - self.image_mean) / self.image_std
+        out = self.model(
+            rearrange(images, "B N C H W -> (B N) C H W"), interpolate_pos_encoding=True
+        )
+        local_features, global_features = out.last_hidden_state, out.pooler_output
+        local_features = local_features.permute(0, 2, 1)
+        local_features = rearrange(
+            local_features, "(B N) Ct Nt -> B N Ct Nt", B=batch_size
+        )
+        if packed:
+            local_features = local_features.squeeze(1)
+
+        return local_features
+
+    def detokenize(self, *args, **kwargs):
+        raise NotImplementedError
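A quick sketch of DINOSingleImageTokenizer under the same BaseModule dict-config assumption; note that configure() only downloads config.json and builds a randomly initialized ViT, so meaningful features require weights loaded from the TSR checkpoint:

```python
import torch

from tsr.models.tokenizers.image import DINOSingleImageTokenizer

tokenizer = DINOSingleImageTokenizer({})  # defaults to the facebook/dino-vitb16 config
images = torch.rand(1, 3, 224, 224)       # one RGB image, values in [0, 1]
features = tokenizer(images)              # (1, 768, 197): per-token DINO features
```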
TripoSR/tsr/models/tokenizers/triplane.py
ADDED
@@ -0,0 +1,45 @@
+import math
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+from einops import rearrange, repeat
+
+from ...utils import BaseModule
+
+
+class Triplane1DTokenizer(BaseModule):
+    @dataclass
+    class Config(BaseModule.Config):
+        plane_size: int
+        num_channels: int
+
+    cfg: Config
+
+    def configure(self) -> None:
+        self.embeddings = nn.Parameter(
+            torch.randn(
+                (3, self.cfg.num_channels, self.cfg.plane_size, self.cfg.plane_size),
+                dtype=torch.float32,
+            )
+            * 1
+            / math.sqrt(self.cfg.num_channels)
+        )
+
+    def forward(self, batch_size: int) -> torch.Tensor:
+        return rearrange(
+            repeat(self.embeddings, "Np Ct Hp Wp -> B Np Ct Hp Wp", B=batch_size),
+            "B Np Ct Hp Wp -> B Ct (Np Hp Wp)",
+        )
+
+    def detokenize(self, tokens: torch.Tensor) -> torch.Tensor:
+        batch_size, Ct, Nt = tokens.shape
+        assert Nt == self.cfg.plane_size**2 * 3
+        assert Ct == self.cfg.num_channels
+        return rearrange(
+            tokens,
+            "B Ct (Np Hp Wp) -> B Np Ct Hp Wp",
+            Np=3,
+            Hp=self.cfg.plane_size,
+            Wp=self.cfg.plane_size,
+        )
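And a tokenize/detokenize round-trip for Triplane1DTokenizer (same dict-config assumption; sizes are hypothetical):

```python
import torch

from tsr.models.tokenizers.triplane import Triplane1DTokenizer

tok = Triplane1DTokenizer({"plane_size": 32, "num_channels": 1024})
tokens = tok(batch_size=2)       # (2, 1024, 3 * 32 * 32): learned query tokens
planes = tok.detokenize(tokens)  # (2, 3, 1024, 32, 32): back to triplane layout
```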
TripoSR/tsr/models/transformer/__pycache__/attention.cpython-310.pyc
ADDED
Binary file (15.3 kB)
TripoSR/tsr/models/transformer/__pycache__/basic_transformer_block.cpython-310.pyc
ADDED
Binary file (9.61 kB)
TripoSR/tsr/models/transformer/__pycache__/transformer_1d.cpython-310.pyc
ADDED
Binary file (4.87 kB)
TripoSR/tsr/models/transformer/attention.py
ADDED
@@ -0,0 +1,653 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# --------
+#
+# Modified 2024 by the Tripo AI and Stability AI Team.
+#
+# Copyright (c) 2024 Tripo AI & Stability AI
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+class Attention(nn.Module):
+    r"""
+    A cross attention layer.
+
+    Parameters:
+        query_dim (`int`):
+            The number of channels in the query.
+        cross_attention_dim (`int`, *optional*):
+            The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
+        heads (`int`, *optional*, defaults to 8):
+            The number of heads to use for multi-head attention.
+        dim_head (`int`, *optional*, defaults to 64):
+            The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability to use.
+        bias (`bool`, *optional*, defaults to False):
+            Set to `True` for the query, key, and value linear layers to contain a bias parameter.
+        upcast_attention (`bool`, *optional*, defaults to False):
+            Set to `True` to upcast the attention computation to `float32`.
+        upcast_softmax (`bool`, *optional*, defaults to False):
+            Set to `True` to upcast the softmax computation to `float32`.
+        cross_attention_norm (`str`, *optional*, defaults to `None`):
+            The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
+        cross_attention_norm_num_groups (`int`, *optional*, defaults to 32):
+            The number of groups to use for the group norm in the cross attention.
+        added_kv_proj_dim (`int`, *optional*, defaults to `None`):
+            The number of channels to use for the added key and value projections. If `None`, no projection is used.
+        norm_num_groups (`int`, *optional*, defaults to `None`):
+            The number of groups to use for the group norm in the attention.
+        spatial_norm_dim (`int`, *optional*, defaults to `None`):
+            The number of channels to use for the spatial normalization.
+        out_bias (`bool`, *optional*, defaults to `True`):
+            Set to `True` to use a bias in the output linear layer.
+        scale_qk (`bool`, *optional*, defaults to `True`):
+            Set to `True` to scale the query and key by `1 / sqrt(dim_head)`.
+        only_cross_attention (`bool`, *optional*, defaults to `False`):
+            Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if
+            `added_kv_proj_dim` is not `None`.
+        eps (`float`, *optional*, defaults to 1e-5):
+            An additional value added to the denominator in group normalization that is used for numerical stability.
+        rescale_output_factor (`float`, *optional*, defaults to 1.0):
+            A factor to rescale the output by dividing it with this value.
+        residual_connection (`bool`, *optional*, defaults to `False`):
+            Set to `True` to add the residual connection to the output.
+        _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`):
+            Set to `True` if the attention block is loaded from a deprecated state dict.
+        processor (`AttnProcessor`, *optional*, defaults to `None`):
+            The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and
+            `AttnProcessor` otherwise.
+    """
+
+    def __init__(
+        self,
+        query_dim: int,
+        cross_attention_dim: Optional[int] = None,
+        heads: int = 8,
+        dim_head: int = 64,
+        dropout: float = 0.0,
+        bias: bool = False,
+        upcast_attention: bool = False,
+        upcast_softmax: bool = False,
+        cross_attention_norm: Optional[str] = None,
+        cross_attention_norm_num_groups: int = 32,
+        added_kv_proj_dim: Optional[int] = None,
+        norm_num_groups: Optional[int] = None,
+        out_bias: bool = True,
+        scale_qk: bool = True,
+        only_cross_attention: bool = False,
+        eps: float = 1e-5,
+        rescale_output_factor: float = 1.0,
+        residual_connection: bool = False,
+        _from_deprecated_attn_block: bool = False,
+        processor: Optional["AttnProcessor"] = None,
+        out_dim: int = None,
+    ):
+        super().__init__()
+        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
+        self.query_dim = query_dim
+        self.cross_attention_dim = (
+            cross_attention_dim if cross_attention_dim is not None else query_dim
+        )
+        self.upcast_attention = upcast_attention
+        self.upcast_softmax = upcast_softmax
+        self.rescale_output_factor = rescale_output_factor
+        self.residual_connection = residual_connection
+        self.dropout = dropout
+        self.fused_projections = False
+        self.out_dim = out_dim if out_dim is not None else query_dim
+
+        # we make use of this private variable to know whether this class is loaded
+        # with an deprecated state dict so that we can convert it on the fly
+        self._from_deprecated_attn_block = _from_deprecated_attn_block
+
+        self.scale_qk = scale_qk
+        self.scale = dim_head**-0.5 if self.scale_qk else 1.0
+
+        self.heads = out_dim // dim_head if out_dim is not None else heads
+        # for slice_size > 0 the attention score computation
+        # is split across the batch axis to save memory
+        # You can set slice_size with `set_attention_slice`
+        self.sliceable_head_dim = heads
+
+        self.added_kv_proj_dim = added_kv_proj_dim
+        self.only_cross_attention = only_cross_attention
+
+        if self.added_kv_proj_dim is None and self.only_cross_attention:
+            raise ValueError(
+                "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
+            )
+
+        if norm_num_groups is not None:
+            self.group_norm = nn.GroupNorm(
+                num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True
+            )
+        else:
+            self.group_norm = None
+
+        self.spatial_norm = None
+
+        if cross_attention_norm is None:
+            self.norm_cross = None
+        elif cross_attention_norm == "layer_norm":
+            self.norm_cross = nn.LayerNorm(self.cross_attention_dim)
+        elif cross_attention_norm == "group_norm":
+            if self.added_kv_proj_dim is not None:
+                # The given `encoder_hidden_states` are initially of shape
+                # (batch_size, seq_len, added_kv_proj_dim) before being projected
+                # to (batch_size, seq_len, cross_attention_dim). The norm is applied
+                # before the projection, so we need to use `added_kv_proj_dim` as
+                # the number of channels for the group norm.
+                norm_cross_num_channels = added_kv_proj_dim
+            else:
+                norm_cross_num_channels = self.cross_attention_dim
+
+            self.norm_cross = nn.GroupNorm(
+                num_channels=norm_cross_num_channels,
+                num_groups=cross_attention_norm_num_groups,
+                eps=1e-5,
+                affine=True,
+            )
+        else:
+            raise ValueError(
+                f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
+            )
+
+        linear_cls = nn.Linear
+
+        self.linear_cls = linear_cls
+        self.to_q = linear_cls(query_dim, self.inner_dim, bias=bias)
+
+        if not self.only_cross_attention:
+            # only relevant for the `AddedKVProcessor` classes
+            self.to_k = linear_cls(self.cross_attention_dim, self.inner_dim, bias=bias)
+            self.to_v = linear_cls(self.cross_attention_dim, self.inner_dim, bias=bias)
+        else:
+            self.to_k = None
+            self.to_v = None
+
+        if self.added_kv_proj_dim is not None:
+            self.add_k_proj = linear_cls(added_kv_proj_dim, self.inner_dim)
+            self.add_v_proj = linear_cls(added_kv_proj_dim, self.inner_dim)
+
+        self.to_out = nn.ModuleList([])
+        self.to_out.append(linear_cls(self.inner_dim, self.out_dim, bias=out_bias))
+        self.to_out.append(nn.Dropout(dropout))
+
+        # set attention processor
+        # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+        # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+        # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+        if processor is None:
+            processor = (
+                AttnProcessor2_0()
+                if hasattr(F, "scaled_dot_product_attention") and self.scale_qk
+                else AttnProcessor()
+            )
+        self.set_processor(processor)
+
+    def set_processor(self, processor: "AttnProcessor") -> None:
+        self.processor = processor
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        **cross_attention_kwargs,
+    ) -> torch.Tensor:
+        r"""
+        The forward method of the `Attention` class.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                The hidden states of the query.
+            encoder_hidden_states (`torch.Tensor`, *optional*):
+                The hidden states of the encoder.
+            attention_mask (`torch.Tensor`, *optional*):
+                The attention mask to use. If `None`, no mask is applied.
+            **cross_attention_kwargs:
+                Additional keyword arguments to pass along to the cross attention.
+
+        Returns:
+            `torch.Tensor`: The output of the attention layer.
+        """
+        # The `Attention` class can call different attention processors / attention functions
+        # here we simply pass along all tensors to the selected processor class
+        # For standard processors that are defined here, `**cross_attention_kwargs` is empty
+        return self.processor(
+            self,
+            hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=attention_mask,
+            **cross_attention_kwargs,
+        )
+
+    def batch_to_head_dim(self, tensor: torch.Tensor) -> torch.Tensor:
+        r"""
+        Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size // heads, seq_len, dim * heads]`. `heads`
+        is the number of heads initialized while constructing the `Attention` class.
+
+        Args:
+            tensor (`torch.Tensor`): The tensor to reshape.
+
+        Returns:
+            `torch.Tensor`: The reshaped tensor.
+        """
+        head_size = self.heads
+        batch_size, seq_len, dim = tensor.shape
+        tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
+        tensor = tensor.permute(0, 2, 1, 3).reshape(
+            batch_size // head_size, seq_len, dim * head_size
+        )
+        return tensor
+
+    def head_to_batch_dim(self, tensor: torch.Tensor, out_dim: int = 3) -> torch.Tensor:
+        r"""
+        Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size, seq_len, heads, dim // heads]` `heads` is
+        the number of heads initialized while constructing the `Attention` class.
+
+        Args:
+            tensor (`torch.Tensor`): The tensor to reshape.
+            out_dim (`int`, *optional*, defaults to `3`): The output dimension of the tensor. If `3`, the tensor is
+                reshaped to `[batch_size * heads, seq_len, dim // heads]`.
+
+        Returns:
+            `torch.Tensor`: The reshaped tensor.
+        """
+        head_size = self.heads
+        batch_size, seq_len, dim = tensor.shape
+        tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
+        tensor = tensor.permute(0, 2, 1, 3)
+
+        if out_dim == 3:
+            tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)
+
+        return tensor
+
+    def get_attention_scores(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        attention_mask: torch.Tensor = None,
+    ) -> torch.Tensor:
+        r"""
+        Compute the attention scores.
+
+        Args:
+            query (`torch.Tensor`): The query tensor.
+            key (`torch.Tensor`): The key tensor.
+            attention_mask (`torch.Tensor`, *optional*): The attention mask to use. If `None`, no mask is applied.
+
+        Returns:
+            `torch.Tensor`: The attention probabilities/scores.
+        """
+        dtype = query.dtype
+        if self.upcast_attention:
+            query = query.float()
+            key = key.float()
+
+        if attention_mask is None:
+            baddbmm_input = torch.empty(
+                query.shape[0],
+                query.shape[1],
+                key.shape[1],
+                dtype=query.dtype,
+                device=query.device,
+            )
+            beta = 0
+        else:
+            baddbmm_input = attention_mask
+            beta = 1
+
+        attention_scores = torch.baddbmm(
+            baddbmm_input,
+            query,
+            key.transpose(-1, -2),
+            beta=beta,
+            alpha=self.scale,
+        )
+        del baddbmm_input
+
+        if self.upcast_softmax:
+            attention_scores = attention_scores.float()
+
+        attention_probs = attention_scores.softmax(dim=-1)
+        del attention_scores
+
+        attention_probs = attention_probs.to(dtype)
+
+        return attention_probs
+
+    def prepare_attention_mask(
+        self,
+        attention_mask: torch.Tensor,
+        target_length: int,
+        batch_size: int,
+        out_dim: int = 3,
+    ) -> torch.Tensor:
+        r"""
+        Prepare the attention mask for the attention computation.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                The attention mask to prepare.
+            target_length (`int`):
+                The target length of the attention mask. This is the length of the attention mask after padding.
+            batch_size (`int`):
+                The batch size, which is used to repeat the attention mask.
+            out_dim (`int`, *optional*, defaults to `3`):
+                The output dimension of the attention mask. Can be either `3` or `4`.
+
+        Returns:
+            `torch.Tensor`: The prepared attention mask.
+        """
+        head_size = self.heads
+        if attention_mask is None:
+            return attention_mask
+
+        current_length: int = attention_mask.shape[-1]
+        if current_length != target_length:
+            if attention_mask.device.type == "mps":
+                # HACK: MPS: Does not support padding by greater than dimension of input tensor.
+                # Instead, we can manually construct the padding tensor.
+                padding_shape = (
+                    attention_mask.shape[0],
+                    attention_mask.shape[1],
+                    target_length,
+                )
+                padding = torch.zeros(
+                    padding_shape,
+                    dtype=attention_mask.dtype,
+                    device=attention_mask.device,
+                )
+                attention_mask = torch.cat([attention_mask, padding], dim=2)
+            else:
+                # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
+                # we want to instead pad by (0, remaining_length), where remaining_length is:
+                # remaining_length: int = target_length - current_length
+                # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
+                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
+
+        if out_dim == 3:
+            if attention_mask.shape[0] < batch_size * head_size:
+                attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
+        elif out_dim == 4:
+            attention_mask = attention_mask.unsqueeze(1)
+            attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
+
+        return attention_mask
+
+    def norm_encoder_hidden_states(
+        self, encoder_hidden_states: torch.Tensor
+    ) -> torch.Tensor:
+        r"""
+        Normalize the encoder hidden states. Requires `self.norm_cross` to be specified when constructing the
+        `Attention` class.
+
+        Args:
+            encoder_hidden_states (`torch.Tensor`): Hidden states of the encoder.
+
+        Returns:
+            `torch.Tensor`: The normalized encoder hidden states.
+        """
+        assert (
+            self.norm_cross is not None
+        ), "self.norm_cross must be defined to call self.norm_encoder_hidden_states"
+
+        if isinstance(self.norm_cross, nn.LayerNorm):
+            encoder_hidden_states = self.norm_cross(encoder_hidden_states)
+        elif isinstance(self.norm_cross, nn.GroupNorm):
+            # Group norm norms along the channels dimension and expects
+            # input to be in the shape of (N, C, *). In this case, we want
+            # to norm along the hidden dimension, so we need to move
+            # (batch_size, sequence_length, hidden_size) ->
+            # (batch_size, hidden_size, sequence_length)
+            encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
+            encoder_hidden_states = self.norm_cross(encoder_hidden_states)
+            encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
+        else:
+            assert False
+
+        return encoder_hidden_states
+
+    @torch.no_grad()
+    def fuse_projections(self, fuse=True):
+        is_cross_attention = self.cross_attention_dim != self.query_dim
+        device = self.to_q.weight.data.device
+        dtype = self.to_q.weight.data.dtype
+
+        if not is_cross_attention:
+            # fetch weight matrices.
+            concatenated_weights = torch.cat(
+                [self.to_q.weight.data, self.to_k.weight.data, self.to_v.weight.data]
+            )
+            in_features = concatenated_weights.shape[1]
+            out_features = concatenated_weights.shape[0]
+
+            # create a new single projection layer and copy over the weights.
+            self.to_qkv = self.linear_cls(
+                in_features, out_features, bias=False, device=device, dtype=dtype
+            )
+            self.to_qkv.weight.copy_(concatenated_weights)
+
+        else:
+            concatenated_weights = torch.cat(
+                [self.to_k.weight.data, self.to_v.weight.data]
+            )
+            in_features = concatenated_weights.shape[1]
+            out_features = concatenated_weights.shape[0]
+
+            self.to_kv = self.linear_cls(
+                in_features, out_features, bias=False, device=device, dtype=dtype
+            )
+            self.to_kv.weight.copy_(concatenated_weights)
+
+        self.fused_projections = fuse
+
+
+class AttnProcessor:
+    r"""
+    Default processor for performing attention-related computations.
+    """
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.Tensor:
+        residual = hidden_states
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(
+                batch_size, channel, height * width
+            ).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape
+            if encoder_hidden_states is None
+            else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(
+            attention_mask, sequence_length, batch_size
+        )
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
+                1, 2
+            )
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(
+                encoder_hidden_states
+            )
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(
+                batch_size, channel, height, width
+            )
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class AttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        residual = hidden_states
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(
+                batch_size, channel, height * width
+            ).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape
+            if encoder_hidden_states is None
+            else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(
+                attention_mask, sequence_length, batch_size
+            )
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(
+                batch_size, attn.heads, -1, attention_mask.shape[-1]
+            )
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
+                1, 2
+            )
+
+        query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(
+                encoder_hidden_states
+            )
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(
+            batch_size, -1, attn.heads * head_dim
+        )
+        hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(
+                batch_size, channel, height, width
+            )
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
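
The `Attention` class above routes every forward call through a pluggable processor object, so the same module can use either the classic baddbmm/softmax path (`AttnProcessor`) or the PyTorch 2.x fused kernel (`AttnProcessor2_0`). A minimal standalone sketch, assuming the `tsr` package is on the import path; the dimensions (320, 768, 8, 64) are illustrative assumptions, not values from TripoSR's config:

import torch
from tsr.models.transformer.attention import Attention, AttnProcessor2_0

# Cross-attention layer: queries carry 320 channels, the conditioning
# sequence carries 768; 8 heads of 64 channels give an inner dim of 512.
attn = Attention(query_dim=320, cross_attention_dim=768, heads=8, dim_head=64)
attn.set_processor(AttnProcessor2_0())  # fused scaled_dot_product_attention path

hidden = torch.randn(2, 100, 320)    # (batch, query tokens, query_dim)
context = torch.randn(2, 77, 768)    # (batch, key/value tokens, cross_attention_dim)
out = attn(hidden, encoder_hidden_states=context)
print(out.shape)  # torch.Size([2, 100, 320]); out_dim defaults back to query_dim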
TripoSR/tsr/models/transformer/basic_transformer_block.py
ADDED
@@ -0,0 +1,334 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# --------
+#
+# Modified 2024 by the Tripo AI and Stability AI Team.
+#
+# Copyright (c) 2024 Tripo AI & Stability AI
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from .attention import Attention
+
+
+class BasicTransformerBlock(nn.Module):
+    r"""
+    A basic Transformer block.
+
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        attention_bias (:
+            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
+        only_cross_attention (`bool`, *optional*):
+            Whether to use only cross-attention layers. In this case two cross attention layers are used.
+        double_self_attention (`bool`, *optional*):
+            Whether to use two self-attention layers. In this case no cross attention layers are used.
+        upcast_attention (`bool`, *optional*):
+            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
+        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+            Whether to use learnable elementwise affine parameters for normalization.
+        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
+        final_dropout (`bool` *optional*, defaults to False):
+            Whether to apply a final dropout after the last feed-forward layer.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout=0.0,
+        cross_attention_dim: Optional[int] = None,
+        activation_fn: str = "geglu",
+        attention_bias: bool = False,
+        only_cross_attention: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_elementwise_affine: bool = True,
+        norm_type: str = "layer_norm",
+        final_dropout: bool = False,
+    ):
+        super().__init__()
+        self.only_cross_attention = only_cross_attention
+
+        assert norm_type == "layer_norm"
+
+        # Define 3 blocks. Each block has its own normalization layer.
+        # 1. Self-Attn
+        self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+            upcast_attention=upcast_attention,
+        )
+
+        # 2. Cross-Attn
+        if cross_attention_dim is not None or double_self_attention:
+            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+            # the second cross attention block.
+            self.norm2 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+
+            self.attn2 = Attention(
+                query_dim=dim,
+                cross_attention_dim=(
+                    cross_attention_dim if not double_self_attention else None
+                ),
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                upcast_attention=upcast_attention,
+            )  # is self-attn if encoder_hidden_states is none
+        else:
+            self.norm2 = None
+            self.attn2 = None
+
+        # 3. Feed-forward
+        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+        self.ff = FeedForward(
+            dim,
+            dropout=dropout,
+            activation_fn=activation_fn,
+            final_dropout=final_dropout,
+        )
+
+        # let chunk size default to None
+        self._chunk_size = None
+        self._chunk_dim = 0
+
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
+        # Sets chunk feed-forward
+        self._chunk_size = chunk_size
+        self._chunk_dim = dim
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        # Notice that normalization is always applied before the real computation in the following blocks.
+        # 0. Self-Attention
+        norm_hidden_states = self.norm1(hidden_states)
+
+        attn_output = self.attn1(
+            norm_hidden_states,
+            encoder_hidden_states=(
+                encoder_hidden_states if self.only_cross_attention else None
+            ),
+            attention_mask=attention_mask,
+        )
+
+        hidden_states = attn_output + hidden_states
+
+        # 3. Cross-Attention
+        if self.attn2 is not None:
+            norm_hidden_states = self.norm2(hidden_states)
+
+            attn_output = self.attn2(
+                norm_hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+            )
+            hidden_states = attn_output + hidden_states
+
+        # 4. Feed-forward
+        norm_hidden_states = self.norm3(hidden_states)
+
+        if self._chunk_size is not None:
+            # "feed_forward_chunk_size" can be used to save memory
+            if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
+                raise ValueError(
+                    f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
+                )
+
+            num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
+            ff_output = torch.cat(
+                [
+                    self.ff(hid_slice)
+                    for hid_slice in norm_hidden_states.chunk(
+                        num_chunks, dim=self._chunk_dim
+                    )
+                ],
+                dim=self._chunk_dim,
+            )
+        else:
+            ff_output = self.ff(norm_hidden_states)
+
+        hidden_states = ff_output + hidden_states
+
+        return hidden_states
+
+
+class FeedForward(nn.Module):
+    r"""
+    A feed-forward layer.
+
+    Parameters:
+        dim (`int`): The number of channels in the input.
+        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
+        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        dim_out: Optional[int] = None,
+        mult: int = 4,
+        dropout: float = 0.0,
+        activation_fn: str = "geglu",
+        final_dropout: bool = False,
+    ):
+        super().__init__()
+        inner_dim = int(dim * mult)
+        dim_out = dim_out if dim_out is not None else dim
+        linear_cls = nn.Linear
+
+        if activation_fn == "gelu":
+            act_fn = GELU(dim, inner_dim)
+        if activation_fn == "gelu-approximate":
+            act_fn = GELU(dim, inner_dim, approximate="tanh")
+        elif activation_fn == "geglu":
+            act_fn = GEGLU(dim, inner_dim)
+        elif activation_fn == "geglu-approximate":
+            act_fn = ApproximateGELU(dim, inner_dim)
+
+        self.net = nn.ModuleList([])
+        # project in
+        self.net.append(act_fn)
+        # project dropout
+        self.net.append(nn.Dropout(dropout))
+        # project out
+        self.net.append(linear_cls(inner_dim, dim_out))
+        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
+        if final_dropout:
+            self.net.append(nn.Dropout(dropout))
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        for module in self.net:
+            hidden_states = module(hidden_states)
+        return hidden_states
+
+
+class GELU(nn.Module):
+    r"""
+    GELU activation function with tanh approximation support with `approximate="tanh"`.
+
+    Parameters:
+        dim_in (`int`): The number of channels in the input.
+        dim_out (`int`): The number of channels in the output.
+        approximate (`str`, *optional*, defaults to `"none"`): If `"tanh"`, use tanh approximation.
+    """
+
+    def __init__(self, dim_in: int, dim_out: int, approximate: str = "none"):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out)
+        self.approximate = approximate
+
+    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
+        if gate.device.type != "mps":
+            return F.gelu(gate, approximate=self.approximate)
+        # mps: gelu is not implemented for float16
+        return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(
+            dtype=gate.dtype
+        )
+
+    def forward(self, hidden_states):
+        hidden_states = self.proj(hidden_states)
+        hidden_states = self.gelu(hidden_states)
+        return hidden_states
+
+
+class GEGLU(nn.Module):
+    r"""
+    A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
+
+    Parameters:
+        dim_in (`int`): The number of channels in the input.
+        dim_out (`int`): The number of channels in the output.
+    """
+
+    def __init__(self, dim_in: int, dim_out: int):
+        super().__init__()
+        linear_cls = nn.Linear
+
+        self.proj = linear_cls(dim_in, dim_out * 2)
+
+    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
+        if gate.device.type != "mps":
+            return F.gelu(gate)
+        # mps: gelu is not implemented for float16
+        return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
+
+    def forward(self, hidden_states, scale: float = 1.0):
+        args = ()
+        hidden_states, gate = self.proj(hidden_states, *args).chunk(2, dim=-1)
+        return hidden_states * self.gelu(gate)
+
+
+class ApproximateGELU(nn.Module):
+    r"""
+    The approximate form of Gaussian Error Linear Unit (GELU). For more details, see section 2:
+    https://arxiv.org/abs/1606.08415.
+
+    Parameters:
+        dim_in (`int`): The number of channels in the input.
+        dim_out (`int`): The number of channels in the output.
+    """
+
+    def __init__(self, dim_in: int, dim_out: int):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.proj(x)
+        return x * torch.sigmoid(1.702 * x)
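
`BasicTransformerBlock` is a pre-norm block: each sub-layer normalizes first, computes, then adds a residual, so the output shape always matches the input. A small sketch under the same assumptions as above (all sizes are illustrative):

import torch
from tsr.models.transformer.basic_transformer_block import BasicTransformerBlock

# dim=512 with 8 heads of 64 channels; passing cross_attention_dim enables
# the second (cross-attention) branch, conditioned on a 77-token context.
block = BasicTransformerBlock(
    dim=512,
    num_attention_heads=8,
    attention_head_dim=64,
    cross_attention_dim=768,
)

tokens = torch.randn(2, 1024, 512)   # (batch, sequence, dim)
context = torch.randn(2, 77, 768)    # conditioning sequence
out = block(tokens, encoder_hidden_states=context)
print(out.shape)  # torch.Size([2, 1024, 512]); the residual block preserves shape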
TripoSR/tsr/models/transformer/transformer_1d.py
ADDED
@@ -0,0 +1,219 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# --------
+#
+# Modified 2024 by the Tripo AI and Stability AI Team.
+#
+# Copyright (c) 2024 Tripo AI & Stability AI
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...utils import BaseModule
+from .basic_transformer_block import BasicTransformerBlock
+
+
+class Transformer1D(BaseModule):
+    @dataclass
+    class Config(BaseModule.Config):
+        num_attention_heads: int = 16
+        attention_head_dim: int = 88
+        in_channels: Optional[int] = None
+        out_channels: Optional[int] = None
+        num_layers: int = 1
+        dropout: float = 0.0
+        norm_num_groups: int = 32
+        cross_attention_dim: Optional[int] = None
+        attention_bias: bool = False
+        activation_fn: str = "geglu"
+        only_cross_attention: bool = False
+        double_self_attention: bool = False
+        upcast_attention: bool = False
+        norm_type: str = "layer_norm"
+        norm_elementwise_affine: bool = True
+        gradient_checkpointing: bool = False
+
+    cfg: Config
+
+    def configure(self) -> None:
+        self.num_attention_heads = self.cfg.num_attention_heads
+        self.attention_head_dim = self.cfg.attention_head_dim
+        inner_dim = self.num_attention_heads * self.attention_head_dim
+
+        linear_cls = nn.Linear
+
+        # 2. Define input layers
+        self.in_channels = self.cfg.in_channels
+
+        self.norm = torch.nn.GroupNorm(
+            num_groups=self.cfg.norm_num_groups,
+            num_channels=self.cfg.in_channels,
+            eps=1e-6,
+            affine=True,
+        )
+        self.proj_in = linear_cls(self.cfg.in_channels, inner_dim)
+
+        # 3. Define transformers blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    self.num_attention_heads,
+                    self.attention_head_dim,
+                    dropout=self.cfg.dropout,
+                    cross_attention_dim=self.cfg.cross_attention_dim,
+                    activation_fn=self.cfg.activation_fn,
+                    attention_bias=self.cfg.attention_bias,
+                    only_cross_attention=self.cfg.only_cross_attention,
+                    double_self_attention=self.cfg.double_self_attention,
+                    upcast_attention=self.cfg.upcast_attention,
+                    norm_type=self.cfg.norm_type,
+                    norm_elementwise_affine=self.cfg.norm_elementwise_affine,
+                )
+                for d in range(self.cfg.num_layers)
+            ]
+        )
+
+        # 4. Define output layers
+        self.out_channels = (
+            self.cfg.in_channels
+            if self.cfg.out_channels is None
+            else self.cfg.out_channels
+        )
+
+        self.proj_out = linear_cls(inner_dim, self.cfg.in_channels)
+
+        self.gradient_checkpointing = self.cfg.gradient_checkpointing
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+    ):
+        """
+        The [`Transformer1DModel`] forward method.
+
+        Args:
+            hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
+                Input `hidden_states`.
+            encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            attention_mask ( `torch.Tensor`, *optional*):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            encoder_attention_mask ( `torch.Tensor`, *optional*):
+                Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
+
+                    * Mask `(batch, sequence_length)` True = keep, False = discard.
+                    * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
+
+                If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
+                above. This bias will be added to the cross-attention scores.
+
+        Returns:
+            torch.FloatTensor
+        """
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+        # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+        # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch, 1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None and attention_mask.ndim == 2:
+            # assume that mask is expressed as:
+            #   (1 = keep, 0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #   (keep = +0, discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+            encoder_attention_mask = (
+                1 - encoder_attention_mask.to(hidden_states.dtype)
+            ) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        # 1. Input
+        batch, _, seq_len = hidden_states.shape
+        residual = hidden_states
+
+        hidden_states = self.norm(hidden_states)
+        inner_dim = hidden_states.shape[1]
+        hidden_states = hidden_states.permute(0, 2, 1).reshape(
+            batch, seq_len, inner_dim
+        )
+        hidden_states = self.proj_in(hidden_states)
+
+        # 2. Blocks
+        for block in self.transformer_blocks:
+            if self.training and self.gradient_checkpointing:
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    block,
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    use_reentrant=False,
+                )
+            else:
+                hidden_states = block(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                )
+
+        # 3. Output
+        hidden_states = self.proj_out(hidden_states)
+        hidden_states = (
+            hidden_states.reshape(batch, seq_len, inner_dim)
+            .permute(0, 2, 1)
+            .contiguous()
+        )
+
+        output = hidden_states + residual
+
+        return output
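
Before running its blocks, `Transformer1D.forward` turns any 2-D keep/discard mask into an additive bias, since a bias broadcasts cleanly over attention scores. A self-contained sketch of just that conversion (the tensor values are illustrative):

import torch

# (batch, key_tokens) mask, 1 = keep / 0 = discard, as in Transformer1D.forward
mask = torch.tensor([[1.0, 1.0, 0.0, 0.0]])
bias = (1 - mask) * -10000.0      # keep -> 0.0, discard -> -10000.0
bias = bias.unsqueeze(1)          # (batch, 1, key_tokens) for broadcasting

scores = torch.zeros(1, 3, 4)     # (batch, query_tokens, key_tokens)
probs = (scores + bias).softmax(dim=-1)
print(probs)  # kept keys get ~0.5 each, discarded keys get ~0 probability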
TripoSR/tsr/system.py
ADDED
@@ -0,0 +1,203 @@
+import math
+import os
+from dataclasses import dataclass, field
+from typing import List, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn.functional as F
+import trimesh
+from einops import rearrange
+from huggingface_hub import hf_hub_download
+from omegaconf import OmegaConf
+from PIL import Image
+
+from .models.isosurface import MarchingCubeHelper
+from .utils import (
+    BaseModule,
+    ImagePreprocessor,
+    find_class,
+    get_spherical_cameras,
+    scale_tensor,
+)
+
+
+class TSR(BaseModule):
+    @dataclass
+    class Config(BaseModule.Config):
+        cond_image_size: int
+
+        image_tokenizer_cls: str
+        image_tokenizer: dict
+
+        tokenizer_cls: str
+        tokenizer: dict
+
+        backbone_cls: str
+        backbone: dict
+
+        post_processor_cls: str
+        post_processor: dict
+
+        decoder_cls: str
+        decoder: dict
+
+        renderer_cls: str
+        renderer: dict
+
+    cfg: Config
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: str, config_name: str, weight_name: str
+    ):
+        if os.path.isdir(pretrained_model_name_or_path):
+            config_path = os.path.join(pretrained_model_name_or_path, config_name)
+            weight_path = os.path.join(pretrained_model_name_or_path, weight_name)
+        else:
+            config_path = hf_hub_download(
+                repo_id=pretrained_model_name_or_path, filename=config_name
+            )
+            weight_path = hf_hub_download(
+                repo_id=pretrained_model_name_or_path, filename=weight_name
+            )
+
+        cfg = OmegaConf.load(config_path)
+        OmegaConf.resolve(cfg)
+        model = cls(cfg)
+        ckpt = torch.load(weight_path, map_location="cpu")
+        model.load_state_dict(ckpt)
+        return model
+
+    def configure(self):
+        self.image_tokenizer = find_class(self.cfg.image_tokenizer_cls)(
+            self.cfg.image_tokenizer
+        )
+        self.tokenizer = find_class(self.cfg.tokenizer_cls)(self.cfg.tokenizer)
+        self.backbone = find_class(self.cfg.backbone_cls)(self.cfg.backbone)
+        self.post_processor = find_class(self.cfg.post_processor_cls)(
+            self.cfg.post_processor
+        )
+        self.decoder = find_class(self.cfg.decoder_cls)(self.cfg.decoder)
+        self.renderer = find_class(self.cfg.renderer_cls)(self.cfg.renderer)
+        self.image_processor = ImagePreprocessor()
+        self.isosurface_helper = None
+
+    def forward(
+        self,
+        image: Union[
+            PIL.Image.Image,
+            np.ndarray,
+            torch.FloatTensor,
+            List[PIL.Image.Image],
+            List[np.ndarray],
+            List[torch.FloatTensor],
+        ],
+        device: str,
+    ) -> torch.FloatTensor:
+        rgb_cond = self.image_processor(image, self.cfg.cond_image_size)[:, None].to(
+            device
+        )
+        batch_size = rgb_cond.shape[0]
+
+        input_image_tokens: torch.Tensor = self.image_tokenizer(
+            rearrange(rgb_cond, "B Nv H W C -> B Nv C H W", Nv=1),
+        )
+
+        input_image_tokens = rearrange(
+            input_image_tokens, "B Nv C Nt -> B (Nv Nt) C", Nv=1
+        )
+
+        tokens: torch.Tensor = self.tokenizer(batch_size)
+
+        tokens = self.backbone(
+            tokens,
+            encoder_hidden_states=input_image_tokens,
+        )
+
+        scene_codes = self.post_processor(self.tokenizer.detokenize(tokens))
+        return scene_codes
+
+    def render(
+        self,
+        scene_codes,
+        n_views: int,
+        elevation_deg: float = 0.0,
+        camera_distance: float = 1.9,
+        fovy_deg: float = 40.0,
+        height: int = 256,
+        width: int = 256,
+        return_type: str = "pil",
+    ):
+        rays_o, rays_d = get_spherical_cameras(
+            n_views, elevation_deg, camera_distance, fovy_deg, height, width
+        )
+        rays_o, rays_d = rays_o.to(scene_codes.device), rays_d.to(scene_codes.device)
+
+        def process_output(image: torch.FloatTensor):
+            if return_type == "pt":
+                return image
+            elif return_type == "np":
+                return image.detach().cpu().numpy()
+            elif return_type == "pil":
+                return Image.fromarray(
+                    (image.detach().cpu().numpy() * 255.0).astype(np.uint8)
+                )
+            else:
+                raise NotImplementedError
+
+        images = []
+        for scene_code in scene_codes:
+            images_ = []
+            for i in range(n_views):
+                with torch.no_grad():
+                    image = self.renderer(
+                        self.decoder, scene_code, rays_o[i], rays_d[i]
+                    )
+                images_.append(process_output(image))
+            images.append(images_)
+
+        return images
+
+    def set_marching_cubes_resolution(self, resolution: int):
+        if (
+            self.isosurface_helper is not None
+            and self.isosurface_helper.resolution == resolution
+        ):
+            return
+        self.isosurface_helper = MarchingCubeHelper(resolution)
+
+    def extract_mesh(self, scene_codes, resolution: int = 256, threshold: float = 25.0):
+        self.set_marching_cubes_resolution(resolution)
+        meshes = []
+        for scene_code in scene_codes:
+            with torch.no_grad():
+                density = self.renderer.query_triplane(
+                    self.decoder,
+                    scale_tensor(
+                        self.isosurface_helper.grid_vertices.to(scene_codes.device),
+                        self.isosurface_helper.points_range,
+                        (-self.renderer.cfg.radius, self.renderer.cfg.radius),
+                    ),
+                    scene_code,
+                )["density_act"]
+            v_pos, t_pos_idx = self.isosurface_helper(-(density - threshold))
+            v_pos = scale_tensor(
+                v_pos,
+                self.isosurface_helper.points_range,
+                (-self.renderer.cfg.radius, self.renderer.cfg.radius),
+            )
+            with torch.no_grad():
+                color = self.renderer.query_triplane(
+                    self.decoder,
+                    v_pos,
+                    scene_code,
+                )["color"]
mesh = trimesh.Trimesh(
|
198 |
+
vertices=v_pos.cpu().numpy(),
|
199 |
+
faces=t_pos_idx.cpu().numpy(),
|
200 |
+
vertex_colors=color.cpu().numpy(),
|
201 |
+
)
|
202 |
+
meshes.append(mesh)
|
203 |
+
return meshes
|
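The file above is the full TSR pipeline added by this commit: from_pretrained resolves a config and checkpoint (from a local directory or the Hugging Face Hub), forward turns a conditioning image into triplane scene codes, render ray-marches those codes from spherical cameras, and extract_mesh runs marching cubes over the queried density field and bakes per-vertex colors. A minimal usage sketch follows; it is not part of the diff, and it assumes the publicly hosted "stabilityai/TripoSR" weights with the "config.yaml"/"model.ckpt" file names (the same ones this Space's run.py uses) plus an example image path from this repo — adjust any of these if your checkout differs.

import torch
from PIL import Image
from tsr.system import TSR

# Hypothetical driver for the TSR class defined above. Assumed inputs:
# repo id "stabilityai/TripoSR", file names "config.yaml"/"model.ckpt",
# and an example image shipped with this Space.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = TSR.from_pretrained(
    "stabilityai/TripoSR",  # or a local directory holding the same two files
    config_name="config.yaml",
    weight_name="model.ckpt",
)
model.to(device)

# One conditioning image in, one scene code out (forward accepts a list;
# results are usually better if the background is removed first).
image = Image.open("TripoSR/examples/chair.png")
with torch.no_grad():
    scene_codes = model([image], device=device)

# render returns a list of views per scene code; extract_mesh returns
# trimesh.Trimesh objects with vertex colors, ready to export as OBJ.
renders = model.render(scene_codes, n_views=4, return_type="pil")
meshes = model.extract_mesh(scene_codes, resolution=256)
meshes[0].export("mesh.obj")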