akhaliq (HF staff) and ashawkey committed
Commit 363b2a6 (0 parents)

Duplicate from ashawkey/stable-dreamfusion

Co-authored-by: ashawkey <ashawkey@users.noreply.huggingface.co>

This view is limited to 50 files because the commit contains too many changes.
.gitattributes ADDED
@@ -0,0 +1,31 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Stable Dreamfusion
+ emoji: 🍍
+ colorFrom: gray
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 3.5
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ duplicated_from: ashawkey/stable-dreamfusion
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
activation.py ADDED
@@ -0,0 +1,18 @@
+ import torch
+ from torch.autograd import Function
+ from torch.cuda.amp import custom_bwd, custom_fwd
+
+ class _trunc_exp(Function):
+     @staticmethod
+     @custom_fwd(cast_inputs=torch.float)
+     def forward(ctx, x):
+         ctx.save_for_backward(x)
+         return torch.exp(x)
+
+     @staticmethod
+     @custom_bwd
+     def backward(ctx, g):
+         x = ctx.saved_tensors[0]
+         return g * torch.exp(x.clamp(-15, 15))
+
+ trunc_exp = _trunc_exp.apply
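
`trunc_exp` behaves like `torch.exp` in the forward pass, but the backward pass clamps the saved input to [-15, 15] before re-exponentiating, so a very large density logit cannot produce an overflowing gradient under mixed precision. A minimal usage sketch (the tensor here is illustrative):

```python
import torch
from activation import trunc_exp

x = torch.randn(8, requires_grad=True)
y = trunc_exp(x)     # forward: identical to torch.exp(x)
y.sum().backward()
# backward: grad is 1 * exp(clamp(x, -15, 15)), finite even for huge x
print(x.grad)
```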
app.py ADDED
@@ -0,0 +1,227 @@
+ import torch
+ import argparse
+
+ from nerf.provider import NeRFDataset
+ from nerf.utils import *
+
+ import gradio as gr
+ import gc
+
+ print(f'[INFO] loading options..')
+
+ # fake config object; not meant to be used from the command line, options should only be changed from the Gradio UI.
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--text', default=None, help="text prompt")
+ # parser.add_argument('-O', action='store_true', help="equals --fp16 --cuda_ray --dir_text")
+ # parser.add_argument('-O2', action='store_true', help="equals --fp16 --dir_text")
+ parser.add_argument('--test', action='store_true', help="test mode")
+ parser.add_argument('--save_mesh', action='store_true', help="export an obj mesh with texture")
+ parser.add_argument('--eval_interval', type=int, default=10, help="evaluate on the valid set every interval epochs")
+ parser.add_argument('--workspace', type=str, default='trial_gradio')
+ parser.add_argument('--guidance', type=str, default='stable-diffusion', help='choose from [stable-diffusion, clip]')
+ parser.add_argument('--seed', type=int, default=0)
+
+ ### training options
+ parser.add_argument('--iters', type=int, default=10000, help="training iters")
+ parser.add_argument('--lr', type=float, default=1e-3, help="initial learning rate")
+ parser.add_argument('--ckpt', type=str, default='latest')
+ parser.add_argument('--cuda_ray', action='store_true', help="use CUDA raymarching instead of pytorch")
+ parser.add_argument('--max_steps', type=int, default=1024, help="max num steps sampled per ray (only valid when using --cuda_ray)")
+ parser.add_argument('--num_steps', type=int, default=64, help="num steps sampled per ray (only valid when not using --cuda_ray)")
+ parser.add_argument('--upsample_steps', type=int, default=64, help="num steps up-sampled per ray (only valid when not using --cuda_ray)")
+ parser.add_argument('--update_extra_interval', type=int, default=16, help="iter interval to update extra status (only valid when using --cuda_ray)")
+ parser.add_argument('--max_ray_batch', type=int, default=4096, help="batch size of rays at inference to avoid OOM (only valid when not using --cuda_ray)")
+ parser.add_argument('--albedo_iters', type=int, default=1000, help="training iters that only use albedo shading")
+ # model options
+ parser.add_argument('--bg_radius', type=float, default=1.4, help="if positive, use a background model at sphere(bg_radius)")
+ parser.add_argument('--density_thresh', type=float, default=10, help="threshold for density grid to be occupied")
+ # network backbone
+ parser.add_argument('--fp16', action='store_true', help="use amp mixed precision training")
+ parser.add_argument('--backbone', type=str, default='grid', help="nerf backbone, choose from [grid, tcnn, vanilla]")
+ # rendering resolution in training, decrease this if CUDA OOM.
+ parser.add_argument('--w', type=int, default=64, help="render width for NeRF in training")
+ parser.add_argument('--h', type=int, default=64, help="render height for NeRF in training")
+ parser.add_argument('--jitter_pose', action='store_true', help="add jitters to the randomly sampled camera poses")
+
+ ### dataset options
+ parser.add_argument('--bound', type=float, default=1, help="assume the scene is bounded in box(-bound, bound)")
+ parser.add_argument('--dt_gamma', type=float, default=0, help="dt_gamma (>=0) for adaptive ray marching. set to 0 to disable, >0 to accelerate rendering (but usually with worse quality)")
+ parser.add_argument('--min_near', type=float, default=0.1, help="minimum near distance for camera")
+ parser.add_argument('--radius_range', type=float, nargs='*', default=[1.0, 1.5], help="training camera radius range")
+ parser.add_argument('--fovy_range', type=float, nargs='*', default=[40, 70], help="training camera fovy range")
+ parser.add_argument('--dir_text', action='store_true', help="direction-encode the text prompt, by appending front/side/back/overhead view")
+ parser.add_argument('--angle_overhead', type=float, default=30, help="[0, angle_overhead] is the overhead region")
+ parser.add_argument('--angle_front', type=float, default=60, help="[0, angle_front] is the front region, [180, 180+angle_front] the back region, otherwise the side region.")
+
+ parser.add_argument('--lambda_entropy', type=float, default=1e-4, help="loss scale for alpha entropy")
+ parser.add_argument('--lambda_opacity', type=float, default=0, help="loss scale for alpha value")
+ parser.add_argument('--lambda_orient', type=float, default=1e-2, help="loss scale for orientation")
+
+ ### GUI options
+ parser.add_argument('--gui', action='store_true', help="start a GUI")
+ parser.add_argument('--W', type=int, default=800, help="GUI width")
+ parser.add_argument('--H', type=int, default=800, help="GUI height")
+ parser.add_argument('--radius', type=float, default=3, help="default GUI camera radius from center")
+ parser.add_argument('--fovy', type=float, default=60, help="default GUI camera fovy")
+ parser.add_argument('--light_theta', type=float, default=60, help="default GUI light direction in [0, 180], corresponding to elevation [90, -90]")
+ parser.add_argument('--light_phi', type=float, default=0, help="default GUI light direction in [0, 360), azimuth")
+ parser.add_argument('--max_spp', type=int, default=1, help="GUI rendering max sample per pixel")
+
+ opt = parser.parse_args()
+
+ # default to use -O !!!
+ opt.fp16 = True
+ opt.dir_text = True
+ opt.cuda_ray = True
+ # opt.lambda_entropy = 1e-4
+ # opt.lambda_opacity = 0
+
+ if opt.backbone == 'vanilla':
+     from nerf.network import NeRFNetwork
+ elif opt.backbone == 'tcnn':
+     from nerf.network_tcnn import NeRFNetwork
+ elif opt.backbone == 'grid':
+     from nerf.network_grid import NeRFNetwork
+ else:
+     raise NotImplementedError(f'--backbone {opt.backbone} is not implemented!')
+
+ print(opt)
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ print(f'[INFO] loading models..')
+
+ if opt.guidance == 'stable-diffusion':
+     from nerf.sd import StableDiffusion
+     guidance = StableDiffusion(device)
+ elif opt.guidance == 'clip':
+     from nerf.clip import CLIP
+     guidance = CLIP(device)
+ else:
+     raise NotImplementedError(f'--guidance {opt.guidance} is not implemented.')
+
+ train_loader = NeRFDataset(opt, device=device, type='train', H=opt.h, W=opt.w, size=100).dataloader()
+ valid_loader = NeRFDataset(opt, device=device, type='val', H=opt.H, W=opt.W, size=5).dataloader()
+ test_loader = NeRFDataset(opt, device=device, type='test', H=opt.H, W=opt.W, size=100).dataloader()
+
+ print(f'[INFO] everything loaded!')
+
+ trainer = None
+ model = None
+
+ # define UI
+
+ with gr.Blocks(css=".gradio-container {max-width: 512px; margin: auto;}") as demo:
+
+     # title
+     gr.Markdown('[Stable-DreamFusion](https://github.com/ashawkey/stable-dreamfusion) Text-to-3D Example')
+
+     # inputs
+     prompt = gr.Textbox(label="Prompt", max_lines=1, value="a DSLR photo of a koi fish")
+     iters = gr.Slider(label="Iters", minimum=1000, maximum=20000, value=5000, step=100)
+     seed = gr.Slider(label="Seed", minimum=0, maximum=2147483647, step=1, randomize=True)
+     button = gr.Button('Generate')
+
+     # outputs
+     image = gr.Image(label="image", visible=True)
+     video = gr.Video(label="video", visible=False)
+     logs = gr.Textbox(label="logging")
+
+     # gradio main func
+     def submit(text, iters, seed):
+
+         global trainer, model
+
+         # seed
+         opt.seed = seed
+         opt.text = text
+         opt.iters = iters
+
+         seed_everything(seed)
+
+         # clean up
+         if trainer is not None:
+             del model
+             del trainer
+             gc.collect()
+             torch.cuda.empty_cache()
+             print('[INFO] clean up!')
+
+         # simply reload everything...
+         model = NeRFNetwork(opt)
+         optimizer = lambda model: torch.optim.Adam(model.get_params(opt.lr), betas=(0.9, 0.99), eps=1e-15)
+         scheduler = lambda optimizer: torch.optim.lr_scheduler.LambdaLR(optimizer, lambda iter: 0.1 ** min(iter / opt.iters, 1))
+
+         trainer = Trainer('df', opt, model, guidance, device=device, workspace=opt.workspace, optimizer=optimizer, ema_decay=0.95, fp16=opt.fp16, lr_scheduler=scheduler, use_checkpoint=opt.ckpt, eval_interval=opt.eval_interval, scheduler_update_every_step=True)
+
+         # train (each epoch contains only 8 steps, so we can get some vis every ~10s)
+         STEPS = 8
+         max_epochs = np.ceil(opt.iters / STEPS).astype(np.int32)
+
+         # we have to get the explicit training loop out here to yield progressive results...
+         loader = iter(valid_loader)
+
+         start_t = time.time()
+
+         for epoch in range(max_epochs):
+
+             trainer.train_gui(train_loader, step=STEPS)
+
+             # manual test and get intermediate results
+             try:
+                 data = next(loader)
+             except StopIteration:
+                 loader = iter(valid_loader)
+                 data = next(loader)
+
+             trainer.model.eval()
+
+             if trainer.ema is not None:
+                 trainer.ema.store()
+                 trainer.ema.copy_to()
+
+             with torch.no_grad():
+                 with torch.cuda.amp.autocast(enabled=trainer.fp16):
+                     preds, preds_depth = trainer.test_step(data, perturb=False)
+
+             if trainer.ema is not None:
+                 trainer.ema.restore()
+
+             pred = preds[0].detach().cpu().numpy()
+             # pred_depth = preds_depth[0].detach().cpu().numpy()
+
+             pred = (pred * 255).astype(np.uint8)
+
+             yield {
+                 image: gr.update(value=pred, visible=True),
+                 video: gr.update(visible=False),
+                 logs: f"training iters: {epoch * STEPS} / {iters}, lr: {trainer.optimizer.param_groups[0]['lr']:.6f}",
+             }
+
+         # test
+         trainer.test(test_loader)
+
+         results = glob.glob(os.path.join(opt.workspace, 'results', '*rgb*.mp4'))
+         assert len(results) > 0, "cannot retrieve results!"
+         results.sort(key=lambda x: os.path.getmtime(x)) # sort by mtime
+
+         end_t = time.time()
+
+         yield {
+             image: gr.update(visible=False),
+             video: gr.update(value=results[-1], visible=True),
+             logs: f"Generation Finished in {(end_t - start_t) / 60:.4f} minutes!",
+         }
+
+     button.click(
+         submit,
+         [prompt, iters, seed],
+         [image, video, logs]
+     )
+
+ # concurrency_count: only allow ONE running job at a time, else the GPU will OOM.
+ demo.queue(concurrency_count=1)
+
+ demo.launch()
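
`submit` works because Gradio treats a generator callback as a streaming job: each `yield {component: gr.update(...)}` pushes an intermediate update to the page, which is how the loop above surfaces a render every ~10s before the final video. A stripped-down sketch of the same pattern (component names are illustrative; assumes the gradio 3.x API used by this Space):

```python
import gradio as gr

with gr.Blocks() as demo:
    out = gr.Textbox(label="progress")
    btn = gr.Button("run")

    def run():
        # each yield streams an intermediate update, like submit() above
        for i in range(3):
            yield {out: gr.update(value=f"step {i}")}

    btn.click(run, [], [out])

demo.queue(concurrency_count=1)  # serialize jobs, as app.py does to avoid OOM
demo.launch()
```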
assets/update_logs.md ADDED
@@ -0,0 +1,9 @@
+ ### 2022.10.9
+ * The shading (partially) starts to work; at least it no longer empties the scene. For some prompts it gives better results (a less severe Janus problem). The textureless rendering mode is still disabled.
+ * Enable shading by default (--albedo_iters 1000).
+
+ ### 2022.10.5
+ * Basic reproduction finished.
+ * The non-cuda_ray path and the --tcnn backbone are not working yet; to be fixed.
+ * Shading is not working and is disabled in utils.py for now. Surface normals are bad.
+ * Use an entropy loss to regularize weights_sum (alpha); the original L2 regularization always leads to degenerated geometry...
docker/Dockerfile ADDED
@@ -0,0 +1,53 @@
+ FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04
+
+ # Remove any third-party apt sources to avoid issues with expiring keys.
+ RUN rm -f /etc/apt/sources.list.d/*.list
+
+ RUN apt-get update
+
+ RUN DEBIAN_FRONTEND=noninteractive TZ=Europe/Madrid apt-get install -y tzdata
+
+ # Install some basic utilities
+ RUN apt-get install -y \
+     curl \
+     ca-certificates \
+     sudo \
+     git \
+     bzip2 \
+     libx11-6 \
+     python3 \
+     python3-pip \
+     libglfw3-dev \
+     libgles2-mesa-dev \
+     libglib2.0-0 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create a working directory
+ RUN mkdir /app
+ WORKDIR /app
+
+ RUN git clone https://github.com/ashawkey/stable-dreamfusion.git
+
+ RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
+
+ WORKDIR /app/stable-dreamfusion
+
+ RUN pip3 install -r requirements.txt
+ RUN pip3 install git+https://github.com/NVlabs/nvdiffrast/
+
+ # Needs the nvidia runtime; if you hit a "No CUDA runtime is found" error during the build, see the first answer at https://stackoverflow.com/questions/59691207/docker-build-with-nvidia-runtime
+ RUN pip3 install git+https://github.com/NVlabs/tiny-cuda-nn/#subdirectory=bindings/torch
+
+ RUN pip3 install git+https://github.com/openai/CLIP.git
+ RUN bash scripts/install_ext.sh
+
+ # Set the default command to python3
+ #CMD ["python3"]
docker/README.md ADDED
@@ -0,0 +1,80 @@
+ ### Docker installation
+
+ ## Build image
+ To build the docker image on your own machine, which may take 15-30 mins:
+ ```
+ docker build -t stable-dreamfusion:latest .
+ ```
+
+ If you hit the error **No CUDA runtime is found** while building the wheels for tiny-cuda-nn, you need to set up the nvidia runtime for docker:
+ ```
+ sudo apt-get install nvidia-container-runtime
+ ```
+ Then edit `/etc/docker/daemon.json` and add the default-runtime:
+ ```
+ {
+     "runtimes": {
+         "nvidia": {
+             "path": "nvidia-container-runtime",
+             "runtimeArgs": []
+         }
+     },
+     "default-runtime": "nvidia"
+ }
+ ```
+ And restart docker:
+ ```
+ sudo systemctl restart docker
+ ```
+ Now you can build tiny-cuda-nn inside docker.
+
+ ## Download image
+ To download the image (~6GB) instead:
+ ```
+ docker pull supercabb/stable-dreamfusion:3080_0.0.1
+ docker tag supercabb/stable-dreamfusion:3080_0.0.1 stable-dreamfusion
+ ```
+
+ ## Use image
+
+ You can launch an interactive shell inside the container:
+
+ ```
+ docker run --gpus all -it --rm -v $(cd ~ && pwd):/mnt stable-dreamfusion /bin/bash
+ ```
+ From this shell, all the code in the repo should work.
+
+ To run any single command `<command...>` inside the docker container:
+ ```
+ docker run --gpus all -it --rm -v $(cd ~ && pwd):/mnt stable-dreamfusion /bin/bash -c "<command...>"
+ ```
+ To train:
+ ```
+ export TOKEN="#HUGGING FACE ACCESS TOKEN#"
+ docker run --gpus all -it --rm -v $(cd ~ && pwd):/mnt stable-dreamfusion /bin/bash -c "echo ${TOKEN} > TOKEN \
+ && python3 main.py --text \"a hamburger\" --workspace trial -O"
+ ```
+ Run test without gui:
+ ```
+ export PATH_TO_WORKSPACE="#PATH_TO_WORKSPACE#"
+ docker run --gpus all -it --rm -e DISPLAY=$DISPLAY -v /tmp/.X11-unix:/tmp/.X11-unix:ro -v $(cd ~ && pwd):/mnt \
+ -v $(cd ${PATH_TO_WORKSPACE} && pwd):/app/stable-dreamfusion/trial stable-dreamfusion /bin/bash -c "python3 \
+ main.py --workspace trial -O --test"
+ ```
+ Run test with gui:
+ ```
+ export PATH_TO_WORKSPACE="#PATH_TO_WORKSPACE#"
+ xhost +
+ docker run --gpus all -it --rm -e DISPLAY=$DISPLAY -v /tmp/.X11-unix:/tmp/.X11-unix:ro -v $(cd ~ && pwd):/mnt \
+ -v $(cd ${PATH_TO_WORKSPACE} && pwd):/app/stable-dreamfusion/trial stable-dreamfusion /bin/bash -c "python3 \
+ main.py --workspace trial -O --test --gui"
+ xhost -
+ ```
encoding.py ADDED
@@ -0,0 +1,33 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ def get_encoder(encoding, input_dim=3,
+                 multires=6,
+                 degree=4,
+                 num_levels=16, level_dim=2, base_resolution=16, log2_hashmap_size=19, desired_resolution=2048, align_corners=False,
+                 **kwargs):
+
+     if encoding == 'None':
+         return lambda x, **kwargs: x, input_dim
+
+     elif encoding == 'frequency':
+         from freqencoder import FreqEncoder
+         encoder = FreqEncoder(input_dim=input_dim, degree=multires)
+
+     elif encoding == 'sphere_harmonics':
+         from shencoder import SHEncoder
+         encoder = SHEncoder(input_dim=input_dim, degree=degree)
+
+     elif encoding == 'hashgrid':
+         from gridencoder import GridEncoder
+         encoder = GridEncoder(input_dim=input_dim, num_levels=num_levels, level_dim=level_dim, base_resolution=base_resolution, log2_hashmap_size=log2_hashmap_size, desired_resolution=desired_resolution, gridtype='hash', align_corners=align_corners)
+
+     elif encoding == 'tiledgrid':
+         from gridencoder import GridEncoder
+         encoder = GridEncoder(input_dim=input_dim, num_levels=num_levels, level_dim=level_dim, base_resolution=base_resolution, log2_hashmap_size=log2_hashmap_size, desired_resolution=desired_resolution, gridtype='tiled', align_corners=align_corners)
+
+     else:
+         raise NotImplementedError('Unknown encoding mode, choose from [None, frequency, sphere_harmonics, hashgrid, tiledgrid]')
+
+     return encoder, encoder.output_dim
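
`get_encoder` is a small factory: it returns an `(encoder, output_dim)` pair, where `output_dim` is the input width the downstream MLP should expect. A hedged usage sketch (shapes follow the defaults above; the CUDA extensions expect GPU tensors):

```python
import torch
from encoding import get_encoder

# frequency encoding: output_dim = input_dim + input_dim * 2 * multires
encoder, out_dim = get_encoder('frequency', input_dim=3, multires=6)
x = torch.rand(4096, 3, device='cuda')
feats = encoder(x)  # [4096, out_dim] == [4096, 39]
assert feats.shape[-1] == out_dim
```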
freqencoder/__init__.py ADDED
@@ -0,0 +1 @@
+ from .freq import FreqEncoder
freqencoder/backend.py ADDED
@@ -0,0 +1,41 @@
+ import os
+ from torch.utils.cpp_extension import load
+
+ _src_path = os.path.dirname(os.path.abspath(__file__))
+
+ nvcc_flags = [
+     '-O3', '-std=c++14',
+     '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
+     '-use_fast_math'
+ ]
+
+ if os.name == "posix":
+     c_flags = ['-O3', '-std=c++14']
+ elif os.name == "nt":
+     c_flags = ['/O2', '/std:c++17']
+
+     # find cl.exe
+     def find_cl_path():
+         import glob
+         for edition in ["Enterprise", "Professional", "BuildTools", "Community"]:
+             paths = sorted(glob.glob(r"C:\\Program Files (x86)\\Microsoft Visual Studio\\*\\%s\\VC\\Tools\\MSVC\\*\\bin\\Hostx64\\x64" % edition), reverse=True)
+             if paths:
+                 return paths[0]
+
+     # If cl.exe is not on path, try to find it.
+     if os.system("where cl.exe >nul 2>nul") != 0:
+         cl_path = find_cl_path()
+         if cl_path is None:
+             raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation")
+         os.environ["PATH"] += ";" + cl_path
+
+ _backend = load(name='_freqencoder',
+                 extra_cflags=c_flags,
+                 extra_cuda_cflags=nvcc_flags,
+                 sources=[os.path.join(_src_path, 'src', f) for f in [
+                     'freqencoder.cu',
+                     'bindings.cpp',
+                 ]],
+                 )
+
+ __all__ = ['_backend']
freqencoder/freq.py ADDED
@@ -0,0 +1,77 @@
+ import numpy as np
+
+ import torch
+ import torch.nn as nn
+ from torch.autograd import Function
+ from torch.autograd.function import once_differentiable
+ from torch.cuda.amp import custom_bwd, custom_fwd
+
+ try:
+     import _freqencoder as _backend
+ except ImportError:
+     from .backend import _backend
+
+
+ class _freq_encoder(Function):
+     @staticmethod
+     @custom_fwd(cast_inputs=torch.float32) # force float32 for better precision
+     def forward(ctx, inputs, degree, output_dim):
+         # inputs: [B, input_dim], float
+         # RETURN: [B, F], float
+
+         if not inputs.is_cuda: inputs = inputs.cuda()
+         inputs = inputs.contiguous()
+
+         B, input_dim = inputs.shape # batch size, coord dim
+
+         outputs = torch.empty(B, output_dim, dtype=inputs.dtype, device=inputs.device)
+
+         _backend.freq_encode_forward(inputs, B, input_dim, degree, output_dim, outputs)
+
+         ctx.save_for_backward(inputs, outputs)
+         ctx.dims = [B, input_dim, degree, output_dim]
+
+         return outputs
+
+     @staticmethod
+     #@once_differentiable
+     @custom_bwd
+     def backward(ctx, grad):
+         # grad: [B, C], C = output_dim
+
+         grad = grad.contiguous()
+         inputs, outputs = ctx.saved_tensors
+         B, input_dim, degree, output_dim = ctx.dims
+
+         grad_inputs = torch.zeros_like(inputs)
+         _backend.freq_encode_backward(grad, outputs, B, input_dim, degree, output_dim, grad_inputs)
+
+         return grad_inputs, None, None
+
+
+ freq_encode = _freq_encoder.apply
+
+
+ class FreqEncoder(nn.Module):
+     def __init__(self, input_dim=3, degree=4):
+         super().__init__()
+
+         self.input_dim = input_dim
+         self.degree = degree
+         self.output_dim = input_dim + input_dim * 2 * degree
+
+     def __repr__(self):
+         return f"FreqEncoder: input_dim={self.input_dim} degree={self.degree} output_dim={self.output_dim}"
+
+     def forward(self, inputs, **kwargs):
+         # inputs: [..., input_dim]
+         # return: [..., output_dim]
+
+         prefix_shape = list(inputs.shape[:-1])
+         inputs = inputs.reshape(-1, self.input_dim)
+
+         outputs = freq_encode(inputs, self.degree, self.output_dim)
+
+         outputs = outputs.reshape(prefix_shape + [self.output_dim])
+
+         return outputs
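
FreqEncoder maps each coordinate x to [x, sin(2^0 x), cos(2^0 x), ..., sin(2^(degree-1) x), cos(2^(degree-1) x)], hence output_dim = input_dim + input_dim * 2 * degree. A usage sketch (illustrative values; the extension kernels run on GPU):

```python
import torch
from freqencoder import FreqEncoder

enc = FreqEncoder(input_dim=3, degree=4)  # output_dim = 3 + 3 * 2 * 4 = 27
x = torch.rand(1024, 3, device='cuda')
y = enc(x)                                # [1024, 27]
```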
freqencoder/setup.py ADDED
@@ -0,0 +1,51 @@
+ import os
+ from setuptools import setup
+ from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+ _src_path = os.path.dirname(os.path.abspath(__file__))
+
+ nvcc_flags = [
+     '-O3', '-std=c++14',
+     '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
+     '-use_fast_math'
+ ]
+
+ if os.name == "posix":
+     c_flags = ['-O3', '-std=c++14']
+ elif os.name == "nt":
+     c_flags = ['/O2', '/std:c++17']
+
+     # find cl.exe
+     def find_cl_path():
+         import glob
+         for edition in ["Enterprise", "Professional", "BuildTools", "Community"]:
+             paths = sorted(glob.glob(r"C:\\Program Files (x86)\\Microsoft Visual Studio\\*\\%s\\VC\\Tools\\MSVC\\*\\bin\\Hostx64\\x64" % edition), reverse=True)
+             if paths:
+                 return paths[0]
+
+     # If cl.exe is not on path, try to find it.
+     if os.system("where cl.exe >nul 2>nul") != 0:
+         cl_path = find_cl_path()
+         if cl_path is None:
+             raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation")
+         os.environ["PATH"] += ";" + cl_path
+
+ setup(
+     name='freqencoder', # package name, import this to use python API
+     ext_modules=[
+         CUDAExtension(
+             name='_freqencoder', # extension name, import this to use CUDA API
+             sources=[os.path.join(_src_path, 'src', f) for f in [
+                 'freqencoder.cu',
+                 'bindings.cpp',
+             ]],
+             extra_compile_args={
+                 'cxx': c_flags,
+                 'nvcc': nvcc_flags,
+             }
+         ),
+     ],
+     cmdclass={
+         'build_ext': BuildExtension,
+     }
+ )
freqencoder/src/bindings.cpp ADDED
@@ -0,0 +1,8 @@
+ #include <torch/extension.h>
+
+ #include "freqencoder.h"
+
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+     m.def("freq_encode_forward", &freq_encode_forward, "freq encode forward (CUDA)");
+     m.def("freq_encode_backward", &freq_encode_backward, "freq encode backward (CUDA)");
+ }
freqencoder/src/freqencoder.cu ADDED
@@ -0,0 +1,129 @@
+ #include <stdint.h>
+
+ #include <cuda.h>
+ #include <cuda_fp16.h>
+ #include <cuda_runtime.h>
+
+ #include <ATen/cuda/CUDAContext.h>
+ #include <torch/torch.h>
+
+ #include <algorithm>
+ #include <stdexcept>
+
+ #include <cstdio>
+
+
+ #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
+ #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be a contiguous tensor")
+ #define CHECK_IS_INT(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Int, #x " must be an int tensor")
+ #define CHECK_IS_FLOATING(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Float || x.scalar_type() == at::ScalarType::Half || x.scalar_type() == at::ScalarType::Double, #x " must be a floating tensor")
+
+ inline constexpr __device__ float PI() { return 3.141592653589793f; }
+
+ template <typename T>
+ __host__ __device__ T div_round_up(T val, T divisor) {
+     return (val + divisor - 1) / divisor;
+ }
+
+ // inputs: [B, D]
+ // outputs: [B, C], C = D + D * deg * 2
+ __global__ void kernel_freq(
+     const float * __restrict__ inputs,
+     uint32_t B, uint32_t D, uint32_t deg, uint32_t C,
+     float * outputs
+ ) {
+     // parallel on per-element
+     const uint32_t t = threadIdx.x + blockIdx.x * blockDim.x;
+     if (t >= B * C) return;
+
+     // get index
+     const uint32_t b = t / C;
+     const uint32_t c = t - b * C; // t % C;
+
+     // locate
+     inputs += b * D;
+     outputs += t;
+
+     // write self
+     if (c < D) {
+         outputs[0] = inputs[c];
+     // write freq
+     } else {
+         const uint32_t col = c / D - 1;
+         const uint32_t d = c % D;
+         const uint32_t freq = col / 2;
+         const float phase_shift = (col % 2) * (PI() / 2);
+         outputs[0] = __sinf(scalbnf(inputs[d], freq) + phase_shift);
+     }
+ }
+
+ // grad: [B, C], C = D + D * deg * 2
+ // outputs: [B, C]
+ // grad_inputs: [B, D]
+ __global__ void kernel_freq_backward(
+     const float * __restrict__ grad,
+     const float * __restrict__ outputs,
+     uint32_t B, uint32_t D, uint32_t deg, uint32_t C,
+     float * grad_inputs
+ ) {
+     // parallel on per-element
+     const uint32_t t = threadIdx.x + blockIdx.x * blockDim.x;
+     if (t >= B * D) return;
+
+     const uint32_t b = t / D;
+     const uint32_t d = t - b * D; // t % D;
+
+     // locate
+     grad += b * C;
+     outputs += b * C;
+     grad_inputs += t;
+
+     // register
+     float result = grad[d];
+     grad += D;
+     outputs += D;
+
+     for (uint32_t f = 0; f < deg; f++) {
+         result += scalbnf(1.0f, f) * (grad[d] * outputs[D + d] - grad[D + d] * outputs[d]);
+         grad += 2 * D;
+         outputs += 2 * D;
+     }
+
+     // write
+     grad_inputs[0] = result;
+ }
+
+
+ void freq_encode_forward(at::Tensor inputs, const uint32_t B, const uint32_t D, const uint32_t deg, const uint32_t C, at::Tensor outputs) {
+     CHECK_CUDA(inputs);
+     CHECK_CUDA(outputs);
+
+     CHECK_CONTIGUOUS(inputs);
+     CHECK_CONTIGUOUS(outputs);
+
+     CHECK_IS_FLOATING(inputs);
+     CHECK_IS_FLOATING(outputs);
+
+     static constexpr uint32_t N_THREADS = 128;
+
+     kernel_freq<<<div_round_up(B * C, N_THREADS), N_THREADS>>>(inputs.data_ptr<float>(), B, D, deg, C, outputs.data_ptr<float>());
+ }
+
+
+ void freq_encode_backward(at::Tensor grad, at::Tensor outputs, const uint32_t B, const uint32_t D, const uint32_t deg, const uint32_t C, at::Tensor grad_inputs) {
+     CHECK_CUDA(grad);
+     CHECK_CUDA(outputs);
+     CHECK_CUDA(grad_inputs);
+
+     CHECK_CONTIGUOUS(grad);
+     CHECK_CONTIGUOUS(outputs);
+     CHECK_CONTIGUOUS(grad_inputs);
+
+     CHECK_IS_FLOATING(grad);
+     CHECK_IS_FLOATING(outputs);
+     CHECK_IS_FLOATING(grad_inputs);
+
+     static constexpr uint32_t N_THREADS = 128;
+
+     kernel_freq_backward<<<div_round_up(B * D, N_THREADS), N_THREADS>>>(grad.data_ptr<float>(), outputs.data_ptr<float>(), B, D, deg, C, grad_inputs.data_ptr<float>());
+ }
@@ -0,0 +1,10 @@
 
+ #pragma once
+
+ #include <stdint.h>
+ #include <torch/torch.h>
+
+ // _backend.freq_encode_forward(inputs, B, input_dim, degree, output_dim, outputs)
+ void freq_encode_forward(at::Tensor inputs, const uint32_t B, const uint32_t D, const uint32_t deg, const uint32_t C, at::Tensor outputs);
+
+ // _backend.freq_encode_backward(grad, outputs, B, input_dim, degree, output_dim, grad_inputs)
+ void freq_encode_backward(at::Tensor grad, at::Tensor outputs, const uint32_t B, const uint32_t D, const uint32_t deg, const uint32_t C, at::Tensor grad_inputs);
gridencoder/__init__.py ADDED
@@ -0,0 +1 @@
+ from .grid import GridEncoder
gridencoder/backend.py ADDED
@@ -0,0 +1,40 @@
+ import os
+ from torch.utils.cpp_extension import load
+
+ _src_path = os.path.dirname(os.path.abspath(__file__))
+
+ nvcc_flags = [
+     '-O3', '-std=c++14',
+     '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
+ ]
+
+ if os.name == "posix":
+     c_flags = ['-O3', '-std=c++14']
+ elif os.name == "nt":
+     c_flags = ['/O2', '/std:c++17']
+
+     # find cl.exe
+     def find_cl_path():
+         import glob
+         for edition in ["Enterprise", "Professional", "BuildTools", "Community"]:
+             paths = sorted(glob.glob(r"C:\\Program Files (x86)\\Microsoft Visual Studio\\*\\%s\\VC\\Tools\\MSVC\\*\\bin\\Hostx64\\x64" % edition), reverse=True)
+             if paths:
+                 return paths[0]
+
+     # If cl.exe is not on path, try to find it.
+     if os.system("where cl.exe >nul 2>nul") != 0:
+         cl_path = find_cl_path()
+         if cl_path is None:
+             raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation")
+         os.environ["PATH"] += ";" + cl_path
+
+ _backend = load(name='_grid_encoder',
+                 extra_cflags=c_flags,
+                 extra_cuda_cflags=nvcc_flags,
+                 sources=[os.path.join(_src_path, 'src', f) for f in [
+                     'gridencoder.cu',
+                     'bindings.cpp',
+                 ]],
+                 )
+
+ __all__ = ['_backend']
gridencoder/grid.py ADDED
@@ -0,0 +1,154 @@
+ import numpy as np
+
+ import torch
+ import torch.nn as nn
+ from torch.autograd import Function
+ from torch.autograd.function import once_differentiable
+ from torch.cuda.amp import custom_bwd, custom_fwd
+
+ try:
+     import _gridencoder as _backend
+ except ImportError:
+     from .backend import _backend
+
+ _gridtype_to_id = {
+     'hash': 0,
+     'tiled': 1,
+ }
+
+ class _grid_encode(Function):
+     @staticmethod
+     @custom_fwd
+     def forward(ctx, inputs, embeddings, offsets, per_level_scale, base_resolution, calc_grad_inputs=False, gridtype=0, align_corners=False):
+         # inputs: [B, D], float in [0, 1]
+         # embeddings: [sO, C], float
+         # offsets: [L + 1], int
+         # RETURN: [B, F], float
+
+         inputs = inputs.contiguous()
+
+         B, D = inputs.shape # batch size, coord dim
+         L = offsets.shape[0] - 1 # level
+         C = embeddings.shape[1] # embedding dim for each level
+         S = np.log2(per_level_scale) # resolution multiplier at each level, apply log2 for later CUDA exp2f
+         H = base_resolution # base resolution
+
+         # manually handle autocast (only use half precision embeddings, inputs must be float for enough precision)
+         # if C % 2 != 0, force float, since half for atomicAdd is very slow.
+         if torch.is_autocast_enabled() and C % 2 == 0:
+             embeddings = embeddings.to(torch.half)
+
+         # L first, optimize cache for cuda kernel, but needs an extra permute later
+         outputs = torch.empty(L, B, C, device=inputs.device, dtype=embeddings.dtype)
+
+         if calc_grad_inputs:
+             dy_dx = torch.empty(B, L * D * C, device=inputs.device, dtype=embeddings.dtype)
+         else:
+             dy_dx = None
+
+         _backend.grid_encode_forward(inputs, embeddings, offsets, outputs, B, D, C, L, S, H, dy_dx, gridtype, align_corners)
+
+         # permute back to [B, L * C]
+         outputs = outputs.permute(1, 0, 2).reshape(B, L * C)
+
+         ctx.save_for_backward(inputs, embeddings, offsets, dy_dx)
+         ctx.dims = [B, D, C, L, S, H, gridtype]
+         ctx.align_corners = align_corners
+
+         return outputs
+
+     @staticmethod
+     #@once_differentiable
+     @custom_bwd
+     def backward(ctx, grad):
+
+         inputs, embeddings, offsets, dy_dx = ctx.saved_tensors
+         B, D, C, L, S, H, gridtype = ctx.dims
+         align_corners = ctx.align_corners
+
+         # grad: [B, L * C] --> [L, B, C]
+         grad = grad.view(B, L, C).permute(1, 0, 2).contiguous()
+
+         grad_embeddings = torch.zeros_like(embeddings)
+
+         if dy_dx is not None:
+             grad_inputs = torch.zeros_like(inputs, dtype=embeddings.dtype)
+         else:
+             grad_inputs = None
+
+         _backend.grid_encode_backward(grad, inputs, embeddings, offsets, grad_embeddings, B, D, C, L, S, H, dy_dx, grad_inputs, gridtype, align_corners)
+
+         if dy_dx is not None:
+             grad_inputs = grad_inputs.to(inputs.dtype)
+
+         return grad_inputs, grad_embeddings, None, None, None, None, None, None
+
+
+ grid_encode = _grid_encode.apply
+
+
+ class GridEncoder(nn.Module):
+     def __init__(self, input_dim=3, num_levels=16, level_dim=2, per_level_scale=2, base_resolution=16, log2_hashmap_size=19, desired_resolution=None, gridtype='hash', align_corners=False):
+         super().__init__()
+
+         # the finest resolution desired at the last level; if provided, overrides per_level_scale
+         if desired_resolution is not None:
+             per_level_scale = np.exp2(np.log2(desired_resolution / base_resolution) / (num_levels - 1))
+
+         self.input_dim = input_dim # coord dims, 2 or 3
+         self.num_levels = num_levels # num levels, each level multiply resolution by 2
+         self.level_dim = level_dim # encode channels per level
+         self.per_level_scale = per_level_scale # multiply resolution by this scale at each level.
+         self.log2_hashmap_size = log2_hashmap_size
+         self.base_resolution = base_resolution
+         self.output_dim = num_levels * level_dim
+         self.gridtype = gridtype
+         self.gridtype_id = _gridtype_to_id[gridtype] # "tiled" or "hash"
+         self.align_corners = align_corners
+
+         # allocate parameters
+         offsets = []
+         offset = 0
+         self.max_params = 2 ** log2_hashmap_size
+         for i in range(num_levels):
+             resolution = int(np.ceil(base_resolution * per_level_scale ** i))
+             params_in_level = min(self.max_params, (resolution if align_corners else resolution + 1) ** input_dim) # limit max number
+             params_in_level = int(np.ceil(params_in_level / 8) * 8) # make divisible
+             offsets.append(offset)
+             offset += params_in_level
+         offsets.append(offset)
+         offsets = torch.from_numpy(np.array(offsets, dtype=np.int32))
+         self.register_buffer('offsets', offsets)
+
+         self.n_params = offsets[-1] * level_dim
+
+         # parameters
+         self.embeddings = nn.Parameter(torch.empty(offset, level_dim))
+
+         self.reset_parameters()
+
+     def reset_parameters(self):
+         std = 1e-4
+         self.embeddings.data.uniform_(-std, std)
+
+     def __repr__(self):
+         return f"GridEncoder: input_dim={self.input_dim} num_levels={self.num_levels} level_dim={self.level_dim} resolution={self.base_resolution} -> {int(round(self.base_resolution * self.per_level_scale ** (self.num_levels - 1)))} per_level_scale={self.per_level_scale:.4f} params={tuple(self.embeddings.shape)} gridtype={self.gridtype} align_corners={self.align_corners}"
+
+     def forward(self, inputs, bound=1):
+         # inputs: [..., input_dim], normalized real world positions in [-bound, bound]
+         # return: [..., num_levels * level_dim]
+
+         inputs = (inputs + bound) / (2 * bound) # map to [0, 1]
+
+         #print('inputs', inputs.shape, inputs.dtype, inputs.min().item(), inputs.max().item())
+
+         prefix_shape = list(inputs.shape[:-1])
+         inputs = inputs.view(-1, self.input_dim)
+
+         outputs = grid_encode(inputs, self.embeddings, self.offsets, self.per_level_scale, self.base_resolution, inputs.requires_grad, self.gridtype_id, self.align_corners)
+         outputs = outputs.view(prefix_shape + [self.output_dim])
+
+         #print('outputs', outputs.shape, outputs.dtype, outputs.min().item(), outputs.max().item())
+
+         return outputs
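
The constructor's level geometry is easy to verify by hand: with base resolution H, desired final resolution R and L levels, the per-level growth factor is b = exp2(log2(R / H) / (L - 1)), and level i runs at ceil(H * b^i) with its parameter count capped at 2^log2_hashmap_size (beyond which lookups are hashed). A standalone sketch of that schedule (mirrors GridEncoder's defaults, illustrative only):

```python
import numpy as np

num_levels, base_res, desired_res, log2_hashmap_size = 16, 16, 2048, 19
per_level_scale = np.exp2(np.log2(desired_res / base_res) / (num_levels - 1))

for i in range(num_levels):
    resolution = int(np.ceil(base_res * per_level_scale ** i))
    # dense grid while it fits, hashed (capped) once it exceeds the table size
    params = min(2 ** log2_hashmap_size, (resolution + 1) ** 3)
    print(f"level {i:2d}: resolution {resolution:4d}, params {params}")
```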
gridencoder/setup.py ADDED
@@ -0,0 +1,50 @@
+ import os
+ from setuptools import setup
+ from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+ _src_path = os.path.dirname(os.path.abspath(__file__))
+
+ nvcc_flags = [
+     '-O3', '-std=c++14',
+     '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
+ ]
+
+ if os.name == "posix":
+     c_flags = ['-O3', '-std=c++14']
+ elif os.name == "nt":
+     c_flags = ['/O2', '/std:c++17']
+
+     # find cl.exe
+     def find_cl_path():
+         import glob
+         for edition in ["Enterprise", "Professional", "BuildTools", "Community"]:
+             paths = sorted(glob.glob(r"C:\\Program Files (x86)\\Microsoft Visual Studio\\*\\%s\\VC\\Tools\\MSVC\\*\\bin\\Hostx64\\x64" % edition), reverse=True)
+             if paths:
+                 return paths[0]
+
+     # If cl.exe is not on path, try to find it.
+     if os.system("where cl.exe >nul 2>nul") != 0:
+         cl_path = find_cl_path()
+         if cl_path is None:
+             raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation")
+         os.environ["PATH"] += ";" + cl_path
+
+ setup(
+     name='gridencoder', # package name, import this to use python API
+     ext_modules=[
+         CUDAExtension(
+             name='_gridencoder', # extension name, import this to use CUDA API
+             sources=[os.path.join(_src_path, 'src', f) for f in [
+                 'gridencoder.cu',
+                 'bindings.cpp',
+             ]],
+             extra_compile_args={
+                 'cxx': c_flags,
+                 'nvcc': nvcc_flags,
+             }
+         ),
+     ],
+     cmdclass={
+         'build_ext': BuildExtension,
+     }
+ )
gridencoder/src/bindings.cpp ADDED
@@ -0,0 +1,8 @@
+ #include <torch/extension.h>
+
+ #include "gridencoder.h"
+
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+     m.def("grid_encode_forward", &grid_encode_forward, "grid_encode_forward (CUDA)");
+     m.def("grid_encode_backward", &grid_encode_backward, "grid_encode_backward (CUDA)");
+ }
gridencoder/src/gridencoder.cu ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <cuda.h>
2
+ #include <cuda_fp16.h>
3
+ #include <cuda_runtime.h>
4
+
5
+ #include <ATen/cuda/CUDAContext.h>
6
+ #include <torch/torch.h>
7
+
8
+ #include <algorithm>
9
+ #include <stdexcept>
10
+
11
+ #include <stdint.h>
12
+ #include <cstdio>
13
+
14
+
15
+ #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
16
+ #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be a contiguous tensor")
17
+ #define CHECK_IS_INT(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Int, #x " must be an int tensor")
18
+ #define CHECK_IS_FLOATING(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Float || x.scalar_type() == at::ScalarType::Half || x.scalar_type() == at::ScalarType::Double, #x " must be a floating tensor")
19
+
20
+
21
+ // just for compatability of half precision in AT_DISPATCH_FLOATING_TYPES_AND_HALF...
22
+ static inline __device__ at::Half atomicAdd(at::Half *address, at::Half val) {
23
+ // requires CUDA >= 10 and ARCH >= 70
24
+ // this is very slow compared to float or __half2, and never used.
25
+ //return atomicAdd(reinterpret_cast<__half*>(address), val);
26
+ }
+
+
+ template <typename T>
+ static inline __host__ __device__ T div_round_up(T val, T divisor) {
+ return (val + divisor - 1) / divisor;
+ }
+
+
+ template <uint32_t D>
+ __device__ uint32_t fast_hash(const uint32_t pos_grid[D]) {
+ static_assert(D <= 7, "fast_hash can only hash up to 7 dimensions.");
+
+ // While 1 is technically not a good prime for hashing (or a prime at all), it helps memory coherence
+ // and is sufficient for our use case of obtaining a uniformly colliding index from high-dimensional
+ // coordinates.
+ constexpr uint32_t primes[7] = { 1, 2654435761, 805459861, 3674653429, 2097192037, 1434869437, 2165219737 };
+
+ uint32_t result = 0;
+ #pragma unroll
+ for (uint32_t i = 0; i < D; ++i) {
+ result ^= pos_grid[i] * primes[i];
+ }
+
+ return result;
+ }
+
+
+ template <uint32_t D, uint32_t C>
+ __device__ uint32_t get_grid_index(const uint32_t gridtype, const bool align_corners, const uint32_t ch, const uint32_t hashmap_size, const uint32_t resolution, const uint32_t pos_grid[D]) {
+ uint32_t stride = 1;
+ uint32_t index = 0;
+
+ #pragma unroll
+ for (uint32_t d = 0; d < D && stride <= hashmap_size; d++) {
+ index += pos_grid[d] * stride;
+ stride *= align_corners ? resolution : (resolution + 1);
+ }
+
+ // NOTE: for NeRF, the hash is in fact not necessary. Check https://github.com/NVlabs/instant-ngp/issues/97.
+ // gridtype: 0 == hash, 1 == tiled
+ if (gridtype == 0 && stride > hashmap_size) {
+ index = fast_hash<D>(pos_grid);
+ }
+
+ return (index % hashmap_size) * C + ch;
+ }
+
+
+ template <typename scalar_t, uint32_t D, uint32_t C>
+ __global__ void kernel_grid(
+ const float * __restrict__ inputs,
+ const scalar_t * __restrict__ grid,
+ const int * __restrict__ offsets,
+ scalar_t * __restrict__ outputs,
+ const uint32_t B, const uint32_t L, const float S, const uint32_t H,
+ scalar_t * __restrict__ dy_dx,
+ const uint32_t gridtype,
+ const bool align_corners
+ ) {
+ const uint32_t b = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if (b >= B) return;
+
+ const uint32_t level = blockIdx.y;
+
+ // locate
+ grid += (uint32_t)offsets[level] * C;
+ inputs += b * D;
+ outputs += level * B * C + b * C;
+
+ // check input range (should be in [0, 1])
+ bool flag_oob = false;
+ #pragma unroll
+ for (uint32_t d = 0; d < D; d++) {
+ if (inputs[d] < 0 || inputs[d] > 1) {
+ flag_oob = true;
+ }
+ }
+ // if input out of bound, just set output to 0
+ if (flag_oob) {
+ #pragma unroll
+ for (uint32_t ch = 0; ch < C; ch++) {
+ outputs[ch] = 0;
+ }
+ if (dy_dx) {
+ dy_dx += b * D * L * C + level * D * C; // B L D C
+ #pragma unroll
+ for (uint32_t d = 0; d < D; d++) {
+ #pragma unroll
+ for (uint32_t ch = 0; ch < C; ch++) {
+ dy_dx[d * C + ch] = 0;
+ }
+ }
+ }
+ return;
+ }
+
+ const uint32_t hashmap_size = offsets[level + 1] - offsets[level];
+ const float scale = exp2f(level * S) * H - 1.0f;
+ const uint32_t resolution = (uint32_t)ceil(scale) + 1;
+
+ // calculate coordinate
+ float pos[D];
+ uint32_t pos_grid[D];
+
+ #pragma unroll
+ for (uint32_t d = 0; d < D; d++) {
+ pos[d] = inputs[d] * scale + (align_corners ? 0.0f : 0.5f);
+ pos_grid[d] = floorf(pos[d]);
+ pos[d] -= (float)pos_grid[d];
+ }
+
+ //printf("[b=%d, l=%d] pos=(%f, %f)+(%d, %d)\n", b, level, pos[0], pos[1], pos_grid[0], pos_grid[1]);
+
+ // interpolate
+ scalar_t results[C] = {0}; // temp results in register
+
+ #pragma unroll
+ for (uint32_t idx = 0; idx < (1 << D); idx++) {
+ float w = 1;
+ uint32_t pos_grid_local[D];
+
+ #pragma unroll
+ for (uint32_t d = 0; d < D; d++) {
+ if ((idx & (1 << d)) == 0) {
+ w *= 1 - pos[d];
+ pos_grid_local[d] = pos_grid[d];
+ } else {
+ w *= pos[d];
+ pos_grid_local[d] = pos_grid[d] + 1;
+ }
+ }
+
+ uint32_t index = get_grid_index<D, C>(gridtype, align_corners, 0, hashmap_size, resolution, pos_grid_local);
+
+ // writing to register (fast)
+ #pragma unroll
+ for (uint32_t ch = 0; ch < C; ch++) {
+ results[ch] += w * grid[index + ch];
+ }
+
+ //printf("[b=%d, l=%d] int %d, idx %d, w %f, val %f\n", b, level, idx, index, w, grid[index]);
+ }
+
+ // writing to global memory (slow)
+ #pragma unroll
+ for (uint32_t ch = 0; ch < C; ch++) {
+ outputs[ch] = results[ch];
+ }
+
+ // prepare dy_dx
+ // differentiable (soft) indexing: https://discuss.pytorch.org/t/differentiable-indexing/17647/9
+ if (dy_dx) {
+
+ dy_dx += b * D * L * C + level * D * C; // B L D C
+
+ #pragma unroll
+ for (uint32_t gd = 0; gd < D; gd++) {
+
+ scalar_t results_grad[C] = {0};
+
+ #pragma unroll
+ for (uint32_t idx = 0; idx < (1 << (D - 1)); idx++) {
+ float w = scale;
+ uint32_t pos_grid_local[D];
+
+ #pragma unroll
+ for (uint32_t nd = 0; nd < D - 1; nd++) {
+ const uint32_t d = (nd >= gd) ? (nd + 1) : nd;
+
+ if ((idx & (1 << nd)) == 0) {
+ w *= 1 - pos[d];
+ pos_grid_local[d] = pos_grid[d];
+ } else {
+ w *= pos[d];
+ pos_grid_local[d] = pos_grid[d] + 1;
+ }
+ }
+
+ pos_grid_local[gd] = pos_grid[gd];
+ uint32_t index_left = get_grid_index<D, C>(gridtype, align_corners, 0, hashmap_size, resolution, pos_grid_local);
+ pos_grid_local[gd] = pos_grid[gd] + 1;
+ uint32_t index_right = get_grid_index<D, C>(gridtype, align_corners, 0, hashmap_size, resolution, pos_grid_local);
+
+ #pragma unroll
+ for (uint32_t ch = 0; ch < C; ch++) {
+ results_grad[ch] += w * (grid[index_right + ch] - grid[index_left + ch]);
+ }
+ }
+
+ #pragma unroll
+ for (uint32_t ch = 0; ch < C; ch++) {
+ dy_dx[gd * C + ch] = results_grad[ch];
+ }
+ }
+ }
+ }
+
+
+ template <typename scalar_t, uint32_t D, uint32_t C, uint32_t N_C>
+ __global__ void kernel_grid_backward(
+ const scalar_t * __restrict__ grad,
+ const float * __restrict__ inputs,
+ const scalar_t * __restrict__ grid,
+ const int * __restrict__ offsets,
+ scalar_t * __restrict__ grad_grid,
+ const uint32_t B, const uint32_t L, const float S, const uint32_t H,
+ const uint32_t gridtype,
+ const bool align_corners
+ ) {
+ const uint32_t b = (blockIdx.x * blockDim.x + threadIdx.x) * N_C / C;
+ if (b >= B) return;
+
+ const uint32_t level = blockIdx.y;
+ const uint32_t ch = (blockIdx.x * blockDim.x + threadIdx.x) * N_C - b * C;
+
+ // locate
+ grad_grid += offsets[level] * C;
+ inputs += b * D;
+ grad += level * B * C + b * C + ch; // L, B, C
+
+ const uint32_t hashmap_size = offsets[level + 1] - offsets[level];
+ const float scale = exp2f(level * S) * H - 1.0f;
+ const uint32_t resolution = (uint32_t)ceil(scale) + 1;
+
+ // check input range (should be in [0, 1])
+ #pragma unroll
+ for (uint32_t d = 0; d < D; d++) {
+ if (inputs[d] < 0 || inputs[d] > 1) {
+ return; // grad is init as 0, so we simply return.
+ }
+ }
+
+ // calculate coordinate
+ float pos[D];
+ uint32_t pos_grid[D];
+
+ #pragma unroll
+ for (uint32_t d = 0; d < D; d++) {
+ pos[d] = inputs[d] * scale + (align_corners ? 0.0f : 0.5f);
+ pos_grid[d] = floorf(pos[d]);
+ pos[d] -= (float)pos_grid[d];
+ }
+
+ scalar_t grad_cur[N_C] = {0}; // fetch to register
+ #pragma unroll
+ for (uint32_t c = 0; c < N_C; c++) {
+ grad_cur[c] = grad[c];
+ }
+
+ // interpolate
+ #pragma unroll
+ for (uint32_t idx = 0; idx < (1 << D); idx++) {
+ float w = 1;
+ uint32_t pos_grid_local[D];
+
+ #pragma unroll
+ for (uint32_t d = 0; d < D; d++) {
+ if ((idx & (1 << d)) == 0) {
+ w *= 1 - pos[d];
+ pos_grid_local[d] = pos_grid[d];
+ } else {
+ w *= pos[d];
+ pos_grid_local[d] = pos_grid[d] + 1;
+ }
+ }
+
+ uint32_t index = get_grid_index<D, C>(gridtype, align_corners, ch, hashmap_size, resolution, pos_grid_local);
+
+ // atomicAdd for __half is slow (especially for large values), so we use __half2 if N_C % 2 == 0
+ // TODO: use float which is better than __half, if N_C % 2 != 0
+ if (std::is_same<scalar_t, at::Half>::value && N_C % 2 == 0) {
+ #pragma unroll
+ for (uint32_t c = 0; c < N_C; c += 2) {
+ // process two __half at once (by interpreting as a __half2)
+ __half2 v = {(__half)(w * grad_cur[c]), (__half)(w * grad_cur[c + 1])};
+ atomicAdd((__half2*)&grad_grid[index + c], v);
+ }
+ // float, or __half when N_C % 2 != 0 (which means C == 1)
+ } else {
+ #pragma unroll
+ for (uint32_t c = 0; c < N_C; c++) {
+ atomicAdd(&grad_grid[index + c], w * grad_cur[c]);
+ }
+ }
+ }
+ }
+
+
+ template <typename scalar_t, uint32_t D, uint32_t C>
+ __global__ void kernel_input_backward(
+ const scalar_t * __restrict__ grad,
+ const scalar_t * __restrict__ dy_dx,
+ scalar_t * __restrict__ grad_inputs,
+ uint32_t B, uint32_t L
+ ) {
+ const uint32_t t = threadIdx.x + blockIdx.x * blockDim.x;
+ if (t >= B * D) return;
+
+ const uint32_t b = t / D;
+ const uint32_t d = t - b * D;
+
+ dy_dx += b * L * D * C;
+
+ scalar_t result = 0;
+
+ # pragma unroll
+ for (int l = 0; l < L; l++) {
+ # pragma unroll
+ for (int ch = 0; ch < C; ch++) {
+ result += grad[l * B * C + b * C + ch] * dy_dx[l * D * C + d * C + ch];
+ }
+ }
+
+ grad_inputs[t] = result;
+ }
+
+
+ template <typename scalar_t, uint32_t D>
+ void kernel_grid_wrapper(const float *inputs, const scalar_t *embeddings, const int *offsets, scalar_t *outputs, const uint32_t B, const uint32_t C, const uint32_t L, const float S, const uint32_t H, scalar_t *dy_dx, const uint32_t gridtype, const bool align_corners) {
+ static constexpr uint32_t N_THREAD = 512;
+ const dim3 blocks_hashgrid = { div_round_up(B, N_THREAD), L, 1 };
+ switch (C) {
+ case 1: kernel_grid<scalar_t, D, 1><<<blocks_hashgrid, N_THREAD>>>(inputs, embeddings, offsets, outputs, B, L, S, H, dy_dx, gridtype, align_corners); break;
+ case 2: kernel_grid<scalar_t, D, 2><<<blocks_hashgrid, N_THREAD>>>(inputs, embeddings, offsets, outputs, B, L, S, H, dy_dx, gridtype, align_corners); break;
+ case 4: kernel_grid<scalar_t, D, 4><<<blocks_hashgrid, N_THREAD>>>(inputs, embeddings, offsets, outputs, B, L, S, H, dy_dx, gridtype, align_corners); break;
+ case 8: kernel_grid<scalar_t, D, 8><<<blocks_hashgrid, N_THREAD>>>(inputs, embeddings, offsets, outputs, B, L, S, H, dy_dx, gridtype, align_corners); break;
+ default: throw std::runtime_error{"GridEncoding: C must be 1, 2, 4, or 8."};
+ }
+ }
+
+ // inputs: [B, D], float, in [0, 1]
+ // embeddings: [sO, C], float
+ // offsets: [L + 1], uint32_t
+ // outputs: [L, B, C], float (L first, so only one level of hashmap needs to fit into cache at a time.)
+ // H: base resolution
+ // dy_dx: [B, L * D * C]
+ template <typename scalar_t>
+ void grid_encode_forward_cuda(const float *inputs, const scalar_t *embeddings, const int *offsets, scalar_t *outputs, const uint32_t B, const uint32_t D, const uint32_t C, const uint32_t L, const float S, const uint32_t H, scalar_t *dy_dx, const uint32_t gridtype, const bool align_corners) {
+ switch (D) {
+ case 1: kernel_grid_wrapper<scalar_t, 1>(inputs, embeddings, offsets, outputs, B, C, L, S, H, dy_dx, gridtype, align_corners); break;
+ case 2: kernel_grid_wrapper<scalar_t, 2>(inputs, embeddings, offsets, outputs, B, C, L, S, H, dy_dx, gridtype, align_corners); break;
+ case 3: kernel_grid_wrapper<scalar_t, 3>(inputs, embeddings, offsets, outputs, B, C, L, S, H, dy_dx, gridtype, align_corners); break;
+ case 4: kernel_grid_wrapper<scalar_t, 4>(inputs, embeddings, offsets, outputs, B, C, L, S, H, dy_dx, gridtype, align_corners); break;
+ case 5: kernel_grid_wrapper<scalar_t, 5>(inputs, embeddings, offsets, outputs, B, C, L, S, H, dy_dx, gridtype, align_corners); break;
+ default: throw std::runtime_error{"GridEncoding: D must be 1, 2, 3, 4, or 5."};
+ }
+
+ }
+
+ template <typename scalar_t, uint32_t D>
+ void kernel_grid_backward_wrapper(const scalar_t *grad, const float *inputs, const scalar_t *embeddings, const int *offsets, scalar_t *grad_embeddings, const uint32_t B, const uint32_t C, const uint32_t L, const float S, const uint32_t H, scalar_t *dy_dx, scalar_t *grad_inputs, const uint32_t gridtype, const bool align_corners) {
+ static constexpr uint32_t N_THREAD = 256;
+ const uint32_t N_C = std::min(2u, C); // n_features_per_thread
+ const dim3 blocks_hashgrid = { div_round_up(B * C / N_C, N_THREAD), L, 1 };
+ switch (C) {
+ case 1:
+ kernel_grid_backward<scalar_t, D, 1, 1><<<blocks_hashgrid, N_THREAD>>>(grad, inputs, embeddings, offsets, grad_embeddings, B, L, S, H, gridtype, align_corners);
+ if (dy_dx) kernel_input_backward<scalar_t, D, 1><<<div_round_up(B * D, N_THREAD), N_THREAD>>>(grad, dy_dx, grad_inputs, B, L);
+ break;
+ case 2:
+ kernel_grid_backward<scalar_t, D, 2, 2><<<blocks_hashgrid, N_THREAD>>>(grad, inputs, embeddings, offsets, grad_embeddings, B, L, S, H, gridtype, align_corners);
+ if (dy_dx) kernel_input_backward<scalar_t, D, 2><<<div_round_up(B * D, N_THREAD), N_THREAD>>>(grad, dy_dx, grad_inputs, B, L);
+ break;
+ case 4:
+ kernel_grid_backward<scalar_t, D, 4, 2><<<blocks_hashgrid, N_THREAD>>>(grad, inputs, embeddings, offsets, grad_embeddings, B, L, S, H, gridtype, align_corners);
+ if (dy_dx) kernel_input_backward<scalar_t, D, 4><<<div_round_up(B * D, N_THREAD), N_THREAD>>>(grad, dy_dx, grad_inputs, B, L);
+ break;
+ case 8:
+ kernel_grid_backward<scalar_t, D, 8, 2><<<blocks_hashgrid, N_THREAD>>>(grad, inputs, embeddings, offsets, grad_embeddings, B, L, S, H, gridtype, align_corners);
+ if (dy_dx) kernel_input_backward<scalar_t, D, 8><<<div_round_up(B * D, N_THREAD), N_THREAD>>>(grad, dy_dx, grad_inputs, B, L);
+ break;
+ default: throw std::runtime_error{"GridEncoding: C must be 1, 2, 4, or 8."};
+ }
+ }
+
+
+ // grad: [L, B, C], float
+ // inputs: [B, D], float, in [0, 1]
+ // embeddings: [sO, C], float
+ // offsets: [L + 1], uint32_t
+ // grad_embeddings: [sO, C]
+ // H: base resolution
+ template <typename scalar_t>
+ void grid_encode_backward_cuda(const scalar_t *grad, const float *inputs, const scalar_t *embeddings, const int *offsets, scalar_t *grad_embeddings, const uint32_t B, const uint32_t D, const uint32_t C, const uint32_t L, const float S, const uint32_t H, scalar_t *dy_dx, scalar_t *grad_inputs, const uint32_t gridtype, const bool align_corners) {
+ switch (D) {
+ case 1: kernel_grid_backward_wrapper<scalar_t, 1>(grad, inputs, embeddings, offsets, grad_embeddings, B, C, L, S, H, dy_dx, grad_inputs, gridtype, align_corners); break;
+ case 2: kernel_grid_backward_wrapper<scalar_t, 2>(grad, inputs, embeddings, offsets, grad_embeddings, B, C, L, S, H, dy_dx, grad_inputs, gridtype, align_corners); break;
+ case 3: kernel_grid_backward_wrapper<scalar_t, 3>(grad, inputs, embeddings, offsets, grad_embeddings, B, C, L, S, H, dy_dx, grad_inputs, gridtype, align_corners); break;
+ case 4: kernel_grid_backward_wrapper<scalar_t, 4>(grad, inputs, embeddings, offsets, grad_embeddings, B, C, L, S, H, dy_dx, grad_inputs, gridtype, align_corners); break;
+ case 5: kernel_grid_backward_wrapper<scalar_t, 5>(grad, inputs, embeddings, offsets, grad_embeddings, B, C, L, S, H, dy_dx, grad_inputs, gridtype, align_corners); break;
+ default: throw std::runtime_error{"GridEncoding: D must be 1, 2, 3, 4, or 5."};
+ }
+ }
+
+
+
+ void grid_encode_forward(const at::Tensor inputs, const at::Tensor embeddings, const at::Tensor offsets, at::Tensor outputs, const uint32_t B, const uint32_t D, const uint32_t C, const uint32_t L, const float S, const uint32_t H, at::optional<at::Tensor> dy_dx, const uint32_t gridtype, const bool align_corners) {
+ CHECK_CUDA(inputs);
+ CHECK_CUDA(embeddings);
+ CHECK_CUDA(offsets);
+ CHECK_CUDA(outputs);
+ // CHECK_CUDA(dy_dx);
+
+ CHECK_CONTIGUOUS(inputs);
+ CHECK_CONTIGUOUS(embeddings);
+ CHECK_CONTIGUOUS(offsets);
+ CHECK_CONTIGUOUS(outputs);
+ // CHECK_CONTIGUOUS(dy_dx);
+
+ CHECK_IS_FLOATING(inputs);
+ CHECK_IS_FLOATING(embeddings);
+ CHECK_IS_INT(offsets);
+ CHECK_IS_FLOATING(outputs);
+ // CHECK_IS_FLOATING(dy_dx);
+
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ embeddings.scalar_type(), "grid_encode_forward", ([&] {
+ grid_encode_forward_cuda<scalar_t>(inputs.data_ptr<float>(), embeddings.data_ptr<scalar_t>(), offsets.data_ptr<int>(), outputs.data_ptr<scalar_t>(), B, D, C, L, S, H, dy_dx.has_value() ? dy_dx.value().data_ptr<scalar_t>() : nullptr, gridtype, align_corners);
+ }));
+ }
+
+ void grid_encode_backward(const at::Tensor grad, const at::Tensor inputs, const at::Tensor embeddings, const at::Tensor offsets, at::Tensor grad_embeddings, const uint32_t B, const uint32_t D, const uint32_t C, const uint32_t L, const float S, const uint32_t H, const at::optional<at::Tensor> dy_dx, at::optional<at::Tensor> grad_inputs, const uint32_t gridtype, const bool align_corners) {
+ CHECK_CUDA(grad);
+ CHECK_CUDA(inputs);
+ CHECK_CUDA(embeddings);
+ CHECK_CUDA(offsets);
+ CHECK_CUDA(grad_embeddings);
+ // CHECK_CUDA(dy_dx);
+ // CHECK_CUDA(grad_inputs);
+
+ CHECK_CONTIGUOUS(grad);
+ CHECK_CONTIGUOUS(inputs);
+ CHECK_CONTIGUOUS(embeddings);
+ CHECK_CONTIGUOUS(offsets);
+ CHECK_CONTIGUOUS(grad_embeddings);
+ // CHECK_CONTIGUOUS(dy_dx);
+ // CHECK_CONTIGUOUS(grad_inputs);
+
+ CHECK_IS_FLOATING(grad);
+ CHECK_IS_FLOATING(inputs);
+ CHECK_IS_FLOATING(embeddings);
+ CHECK_IS_INT(offsets);
+ CHECK_IS_FLOATING(grad_embeddings);
+ // CHECK_IS_FLOATING(dy_dx);
+ // CHECK_IS_FLOATING(grad_inputs);
+
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ grad.scalar_type(), "grid_encode_backward", ([&] {
+ grid_encode_backward_cuda<scalar_t>(grad.data_ptr<scalar_t>(), inputs.data_ptr<float>(), embeddings.data_ptr<scalar_t>(), offsets.data_ptr<int>(), grad_embeddings.data_ptr<scalar_t>(), B, D, C, L, S, H, dy_dx.has_value() ? dy_dx.value().data_ptr<scalar_t>() : nullptr, grad_inputs.has_value() ? grad_inputs.value().data_ptr<scalar_t>() : nullptr, gridtype, align_corners);
+ }));
+
+ }
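
To make the kernel math above easier to follow, a small pure-Python sketch of the per-level resolution and indexing logic (intuition only, not part of the extension; the channel offset `* C + ch` and the tiled grid type's behavior beyond the hash fallback are omitted):

    # Pure-Python mirror of the indexing used by the kernels above.
    # Conventions match the CUDA code: S = per-level log2 scale,
    # H = base resolution, gridtype 0 == hash, 1 == tiled.
    import math

    PRIMES = [1, 2654435761, 805459861, 3674653429, 2097192037, 1434869437, 2165219737]

    def level_resolution(level, S, H):
        # matches: scale = exp2f(level * S) * H - 1; resolution = ceil(scale) + 1
        scale = 2.0 ** (level * S) * H - 1.0
        return math.ceil(scale) + 1

    def fast_hash(pos_grid):
        # XOR of per-dimension coordinates times fixed primes, as in fast_hash<D>,
        # with explicit 32-bit wrap-around (implicit for uint32_t in CUDA)
        result = 0
        for i, p in enumerate(pos_grid):
            result ^= (p * PRIMES[i]) & 0xFFFFFFFF
        return result & 0xFFFFFFFF

    def grid_index(pos_grid, hashmap_size, resolution, align_corners=False, gridtype=0):
        # dense stride indexing first; fall back to hashing once the level overflows
        index, stride = 0, 1
        for p in pos_grid:
            if stride > hashmap_size:
                break
            index += p * stride
            stride *= resolution if align_corners else resolution + 1
        if gridtype == 0 and stride > hashmap_size:
            index = fast_hash(pos_grid)
        return index % hashmap_size
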
gridencoder/src/gridencoder.h ADDED
@@ -0,0 +1,15 @@
+ #ifndef _HASH_ENCODE_H
+ #define _HASH_ENCODE_H
+
+ #include <stdint.h>
+ #include <torch/torch.h>
+
+ // inputs: [B, D], float, in [0, 1]
+ // embeddings: [sO, C], float
+ // offsets: [L + 1], uint32_t
+ // outputs: [B, L * C], float
+ // H: base resolution
+ void grid_encode_forward(const at::Tensor inputs, const at::Tensor embeddings, const at::Tensor offsets, at::Tensor outputs, const uint32_t B, const uint32_t D, const uint32_t C, const uint32_t L, const float S, const uint32_t H, at::optional<at::Tensor> dy_dx, const uint32_t gridtype, const bool align_corners);
+ void grid_encode_backward(const at::Tensor grad, const at::Tensor inputs, const at::Tensor embeddings, const at::Tensor offsets, at::Tensor grad_embeddings, const uint32_t B, const uint32_t D, const uint32_t C, const uint32_t L, const float S, const uint32_t H, const at::optional<at::Tensor> dy_dx, at::optional<at::Tensor> grad_inputs, const uint32_t gridtype, const bool align_corners);
+
+ #endif
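
A hedged sketch of the calling convention declared in this header, assuming the compiled extension is importable as `_backend` (see the JIT-load sketch after bindings.cpp). Note that the .cu source documents the output layout as [L, B, C] (level-first), which is what the kernels actually write; the header comment's [B, L * C] describes the reshaped view the Python wrapper exposes. The call itself is left commented since it needs the built extension:

    # Shape setup for grid_encode_forward, following the comments in this header
    # and in gridencoder.cu. `_backend` is the hypothetical compiled module.
    import torch

    B, D, C, L = 4, 3, 2, 16   # batch, input dim, channels per level, levels
    S, H = 1.0, 16             # log2 per-level scale, base resolution
    offsets = torch.arange(L + 1, dtype=torch.int32, device='cuda') * 4096  # [L + 1]
    sO = int(offsets[-1])

    inputs = torch.rand(B, D, device='cuda')        # in [0, 1]
    embeddings = torch.zeros(sO, C, device='cuda')  # [sO, C]
    outputs = torch.empty(L, B, C, device='cuda')   # level-first layout

    # _backend.grid_encode_forward(inputs, embeddings, offsets, outputs,
    #                              B, D, C, L, S, H, None, 0, False)
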
main.py ADDED
@@ -0,0 +1,160 @@
+ import torch
+ import argparse
+
+ from nerf.provider import NeRFDataset
+ from nerf.utils import *
+ from optimizer import Shampoo
+
+ from nerf.gui import NeRFGUI
+
+ # torch.autograd.set_detect_anomaly(True)
+
+ if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--text', default=None, help="text prompt")
+ parser.add_argument('-O', action='store_true', help="equals --fp16 --cuda_ray --dir_text")
+ parser.add_argument('-O2', action='store_true', help="equals --fp16 --dir_text")
+ parser.add_argument('--test', action='store_true', help="test mode")
+ parser.add_argument('--save_mesh', action='store_true', help="export an obj mesh with texture")
+ parser.add_argument('--eval_interval', type=int, default=10, help="evaluate on the valid set every interval epochs")
+ parser.add_argument('--workspace', type=str, default='workspace')
+ parser.add_argument('--guidance', type=str, default='stable-diffusion', help='choose from [stable-diffusion, clip]')
+ parser.add_argument('--seed', type=int, default=0)
+
+ ### training options
+ parser.add_argument('--iters', type=int, default=10000, help="training iters")
+ parser.add_argument('--lr', type=float, default=1e-3, help="initial learning rate")
+ parser.add_argument('--ckpt', type=str, default='latest')
+ parser.add_argument('--cuda_ray', action='store_true', help="use CUDA raymarching instead of pytorch")
+ parser.add_argument('--max_steps', type=int, default=1024, help="max num steps sampled per ray (only valid when using --cuda_ray)")
+ parser.add_argument('--num_steps', type=int, default=64, help="num steps sampled per ray (only valid when not using --cuda_ray)")
+ parser.add_argument('--upsample_steps', type=int, default=64, help="num steps up-sampled per ray (only valid when not using --cuda_ray)")
+ parser.add_argument('--update_extra_interval', type=int, default=16, help="iter interval to update extra status (only valid when using --cuda_ray)")
+ parser.add_argument('--max_ray_batch', type=int, default=4096, help="batch size of rays at inference to avoid OOM (only valid when not using --cuda_ray)")
+ parser.add_argument('--albedo_iters', type=int, default=1000, help="training iters that only use albedo shading")
+ # model options
+ parser.add_argument('--bg_radius', type=float, default=1.4, help="if positive, use a background model at sphere(bg_radius)")
+ parser.add_argument('--density_thresh', type=float, default=10, help="threshold for density grid to be occupied")
+ # network backbone
+ parser.add_argument('--fp16', action='store_true', help="use amp mixed precision training")
+ parser.add_argument('--backbone', type=str, default='grid', help="nerf backbone, choose from [grid, tcnn, vanilla]")
+ # rendering resolution in training, decrease this if CUDA OOM.
+ parser.add_argument('--w', type=int, default=64, help="render width for NeRF in training")
+ parser.add_argument('--h', type=int, default=64, help="render height for NeRF in training")
+ parser.add_argument('--jitter_pose', action='store_true', help="add jitters to the randomly sampled camera poses")
+
+ ### dataset options
+ parser.add_argument('--bound', type=float, default=1, help="assume the scene is bounded in box(-bound, bound)")
+ parser.add_argument('--dt_gamma', type=float, default=0, help="dt_gamma (>=0) for adaptive ray marching. set to 0 to disable, >0 to accelerate rendering (but usually with worse quality)")
+ parser.add_argument('--min_near', type=float, default=0.1, help="minimum near distance for camera")
+ parser.add_argument('--radius_range', type=float, nargs='*', default=[1.0, 1.5], help="training camera radius range")
+ parser.add_argument('--fovy_range', type=float, nargs='*', default=[40, 70], help="training camera fovy range")
+ parser.add_argument('--dir_text', action='store_true', help="direction-encode the text prompt, by appending front/side/back/overhead view")
+ parser.add_argument('--angle_overhead', type=float, default=30, help="[0, angle_overhead] is the overhead region")
+ parser.add_argument('--angle_front', type=float, default=60, help="[0, angle_front] is the front region, [180, 180+angle_front] the back region, otherwise the side region.")
+
+ parser.add_argument('--lambda_entropy', type=float, default=1e-4, help="loss scale for alpha entropy")
+ parser.add_argument('--lambda_opacity', type=float, default=0, help="loss scale for alpha value")
+ parser.add_argument('--lambda_orient', type=float, default=1e-2, help="loss scale for orientation")
+
+ ### GUI options
+ parser.add_argument('--gui', action='store_true', help="start a GUI")
+ parser.add_argument('--W', type=int, default=800, help="GUI width")
+ parser.add_argument('--H', type=int, default=800, help="GUI height")
+ parser.add_argument('--radius', type=float, default=3, help="default GUI camera radius from center")
+ parser.add_argument('--fovy', type=float, default=60, help="default GUI camera fovy")
+ parser.add_argument('--light_theta', type=float, default=60, help="default GUI light direction in [0, 180], corresponding to elevation [90, -90]")
+ parser.add_argument('--light_phi', type=float, default=0, help="default GUI light direction in [0, 360), azimuth")
+ parser.add_argument('--max_spp', type=int, default=1, help="GUI rendering max sample per pixel")
+
+ opt = parser.parse_args()
+
+ if opt.O:
+ opt.fp16 = True
+ opt.dir_text = True
+ # use occupancy grid to prune ray sampling, faster rendering.
+ opt.cuda_ray = True
+ # opt.lambda_entropy = 1e-4
+ # opt.lambda_opacity = 0
+
+ elif opt.O2:
+ opt.fp16 = True
+ opt.dir_text = True
+ opt.lambda_entropy = 1e-4 # necessary to keep non-empty
+ opt.lambda_opacity = 3e-3 # no occupancy grid, so use a stronger opacity loss.
+
+ if opt.backbone == 'vanilla':
+ from nerf.network import NeRFNetwork
+ elif opt.backbone == 'tcnn':
+ from nerf.network_tcnn import NeRFNetwork
+ elif opt.backbone == 'grid':
+ from nerf.network_grid import NeRFNetwork
+ else:
+ raise NotImplementedError(f'--backbone {opt.backbone} is not implemented!')
+
+ print(opt)
+
+ seed_everything(opt.seed)
+
+ model = NeRFNetwork(opt)
+
+ print(model)
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ if opt.test:
+ guidance = None # no need to load guidance model at test
+
+ trainer = Trainer('df', opt, model, guidance, device=device, workspace=opt.workspace, fp16=opt.fp16, use_checkpoint=opt.ckpt)
+
+ if opt.gui:
+ gui = NeRFGUI(opt, trainer)
+ gui.render()
+
+ else:
+ test_loader = NeRFDataset(opt, device=device, type='test', H=opt.H, W=opt.W, size=100).dataloader()
+ trainer.test(test_loader)
+
+ if opt.save_mesh:
+ trainer.save_mesh(resolution=256)
+
+ else:
+
+ if opt.guidance == 'stable-diffusion':
+ from nerf.sd import StableDiffusion
+ guidance = StableDiffusion(device)
+ elif opt.guidance == 'clip':
+ from nerf.clip import CLIP
+ guidance = CLIP(device)
+ else:
+ raise NotImplementedError(f'--guidance {opt.guidance} is not implemented.')
+
+ optimizer = lambda model: torch.optim.Adam(model.get_params(opt.lr), betas=(0.9, 0.99), eps=1e-15)
+ # optimizer = lambda model: Shampoo(model.get_params(opt.lr))
+
+ train_loader = NeRFDataset(opt, device=device, type='train', H=opt.h, W=opt.w, size=100).dataloader()
+
+ scheduler = lambda optimizer: optim.lr_scheduler.LambdaLR(optimizer, lambda iter: 0.1 ** min(iter / opt.iters, 1))
+ # scheduler = lambda optimizer: optim.lr_scheduler.OneCycleLR(optimizer, max_lr=opt.lr, total_steps=opt.iters, pct_start=0.1)
+
+ trainer = Trainer('df', opt, model, guidance, device=device, workspace=opt.workspace, optimizer=optimizer, ema_decay=None, fp16=opt.fp16, lr_scheduler=scheduler, use_checkpoint=opt.ckpt, eval_interval=opt.eval_interval, scheduler_update_every_step=True)
+
+ if opt.gui:
+ trainer.train_loader = train_loader # attach dataloader to trainer
+
+ gui = NeRFGUI(opt, trainer)
+ gui.render()
+
+ else:
+ valid_loader = NeRFDataset(opt, device=device, type='val', H=opt.H, W=opt.W, size=5).dataloader()
+
+ max_epoch = np.ceil(opt.iters / len(train_loader)).astype(np.int32)
+ trainer.train(train_loader, valid_loader, max_epoch)
+
+ # also test
+ test_loader = NeRFDataset(opt, device=device, type='test', H=opt.H, W=opt.W, size=100).dataloader()
+ trainer.test(test_loader)
+
+ if opt.save_mesh:
+ trainer.save_mesh(resolution=256)
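
The LambdaLR above decays the learning rate smoothly to a factor of 0.1 over opt.iters and then holds it. A standalone check of that curve:

    # Standalone check of the LambdaLR decay used above:
    # lr(iter) = lr0 * 0.1 ** min(iter / iters, 1)
    lr0, iters = 1e-3, 10000
    for it in [0, 2500, 5000, 10000, 20000]:
        lr = lr0 * 0.1 ** min(it / iters, 1)
        print(it, f'{lr:.2e}')
    # 0     1.00e-03
    # 2500  5.62e-04
    # 5000  3.16e-04
    # 10000 1.00e-04
    # 20000 1.00e-04  (held at the floor after opt.iters)
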
nerf/clip.py ADDED
@@ -0,0 +1,45 @@
+ import torch
+ import torch.nn as nn
+
+ import torchvision.transforms as T
+ import torchvision.transforms.functional as TF
+
+ import clip
+
+ class CLIP(nn.Module):
+ def __init__(self, device):
+ super().__init__()
+
+ self.device = device
+
+ self.clip_model, self.clip_preprocess = clip.load("ViT-B/16", device=self.device, jit=False)
+
+ # image augmentation
+ self.aug = T.Compose([
+ T.Resize((224, 224)),
+ T.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+ ])
+
+ # self.gaussian_blur = T.GaussianBlur(15, sigma=(0.1, 10))
+
+
+ def get_text_embeds(self, prompt):
+
+ text = clip.tokenize(prompt).to(self.device)
+ text_z = self.clip_model.encode_text(text)
+ text_z = text_z / text_z.norm(dim=-1, keepdim=True)
+
+ return text_z
+
+
+ def train_step(self, text_z, pred_rgb):
+
+ pred_rgb = self.aug(pred_rgb)
+
+ image_z = self.clip_model.encode_image(pred_rgb)
+ image_z = image_z / image_z.norm(dim=-1, keepdim=True) # normalize features
+
+ loss = - (image_z * text_z).sum(-1).mean()
+
+ return loss
+
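
A minimal usage sketch for this CLIP guidance (illustrative values; assumes a CUDA device and the `clip` package): encode the prompt once, then compute the negative cosine-similarity loss on each rendered image, which is how the trainer consumes pred_rgb.

    # Hedged usage sketch for the CLIP class above; pred_rgb is a stand-in
    # for a rendered image batch in [0, 1].
    import torch

    device = torch.device('cuda')
    guidance = CLIP(device)

    text_z = guidance.get_text_embeds('a photo of a hamburger')            # encode once
    pred_rgb = torch.rand(1, 3, 128, 128, device=device, requires_grad=True)
    loss = guidance.train_step(text_z, pred_rgb)                           # negative cosine similarity
    loss.backward()                                                        # gradients flow back to the render
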
nerf/gui.py ADDED
@@ -0,0 +1,465 @@
+ import math
+ import torch
+ import numpy as np
+ import dearpygui.dearpygui as dpg
+ from scipy.spatial.transform import Rotation as R
+
+ from nerf.utils import *
+
+
+ class OrbitCamera:
+ def __init__(self, W, H, r=2, fovy=60):
+ self.W = W
+ self.H = H
+ self.radius = r # camera distance from center
+ self.fovy = fovy # in degree
+ self.center = np.array([0, 0, 0], dtype=np.float32) # look at this point
+ self.rot = R.from_quat([1, 0, 0, 0]) # init camera matrix: [[1, 0, 0], [0, -1, 0], [0, 0, 1]] (to suit ngp convention)
+ self.up = np.array([0, 1, 0], dtype=np.float32) # need to be normalized!
+
+ # pose
+ @property
+ def pose(self):
+ # first move camera to radius
+ res = np.eye(4, dtype=np.float32)
+ res[2, 3] -= self.radius
+ # rotate
+ rot = np.eye(4, dtype=np.float32)
+ rot[:3, :3] = self.rot.as_matrix()
+ res = rot @ res
+ # translate
+ res[:3, 3] -= self.center
+ return res
+
+ # intrinsics
+ @property
+ def intrinsics(self):
+ focal = self.H / (2 * np.tan(np.deg2rad(self.fovy) / 2))
+ return np.array([focal, focal, self.W // 2, self.H // 2])
+
+ def orbit(self, dx, dy):
+ # rotate along camera up/side axis!
+ side = self.rot.as_matrix()[:3, 0] # why this is side --> ? # already normalized.
+ rotvec_x = self.up * np.deg2rad(-0.1 * dx)
+ rotvec_y = side * np.deg2rad(-0.1 * dy)
+ self.rot = R.from_rotvec(rotvec_x) * R.from_rotvec(rotvec_y) * self.rot
+
+ def scale(self, delta):
+ self.radius *= 1.1 ** (-delta)
+
+ def pan(self, dx, dy, dz=0):
+ # pan in camera coordinate system (careful on the sensitivity!)
+ self.center += 0.0005 * self.rot.as_matrix()[:3, :3] @ np.array([dx, dy, dz])
+
+
+ class NeRFGUI:
+ def __init__(self, opt, trainer, debug=True):
+ self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
+ self.W = opt.W
+ self.H = opt.H
+ self.cam = OrbitCamera(opt.W, opt.H, r=opt.radius, fovy=opt.fovy)
+ self.debug = debug
+ self.bg_color = torch.ones(3, dtype=torch.float32) # default white bg
+ self.training = False
+ self.step = 0 # training step
+
+ self.trainer = trainer
+ self.render_buffer = np.zeros((self.W, self.H, 3), dtype=np.float32)
+ self.need_update = True # camera moved, should reset accumulation
+ self.spp = 1 # sample per pixel
+ self.light_dir = np.array([opt.light_theta, opt.light_phi])
+ self.ambient_ratio = 1.0
+ self.mode = 'image' # choose from ['image', 'depth']
+ self.shading = 'albedo'
+
+ self.dynamic_resolution = True
+ self.downscale = 1
+ self.train_steps = 16
+
+ dpg.create_context()
+ self.register_dpg()
+ self.test_step()
+
+
+ def __del__(self):
+ dpg.destroy_context()
+
+
+ def train_step(self):
+
+ starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
+ starter.record()
+
+ outputs = self.trainer.train_gui(self.trainer.train_loader, step=self.train_steps)
+
+ ender.record()
+ torch.cuda.synchronize()
+ t = starter.elapsed_time(ender)
+
+ self.step += self.train_steps
+ self.need_update = True
+
+ dpg.set_value("_log_train_time", f'{t:.4f}ms ({int(1000/t)} FPS)')
+ dpg.set_value("_log_train_log", f'step = {self.step: 5d} (+{self.train_steps: 2d}), loss = {outputs["loss"]:.4f}, lr = {outputs["lr"]:.5f}')
+
+ # dynamic train steps
+ # max allowed train time per-frame is 500 ms
+ full_t = t / self.train_steps * 16
+ train_steps = min(16, max(4, int(16 * 500 / full_t)))
+ if train_steps > self.train_steps * 1.2 or train_steps < self.train_steps * 0.8:
+ self.train_steps = train_steps
+
+
+ def prepare_buffer(self, outputs):
+ if self.mode == 'image':
+ return outputs['image']
+ else:
+ return np.expand_dims(outputs['depth'], -1).repeat(3, -1)
+
+
+ def test_step(self):
+
+ if self.need_update or self.spp < self.opt.max_spp:
+
+ starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
+ starter.record()
+
+ outputs = self.trainer.test_gui(self.cam.pose, self.cam.intrinsics, self.W, self.H, self.bg_color, self.spp, self.downscale, self.light_dir, self.ambient_ratio, self.shading)
+
+ ender.record()
+ torch.cuda.synchronize()
+ t = starter.elapsed_time(ender)
+
+ # update dynamic resolution
+ if self.dynamic_resolution:
+ # max allowed infer time per-frame is 200 ms
+ full_t = t / (self.downscale ** 2)
+ downscale = min(1, max(1/4, math.sqrt(200 / full_t)))
+ if downscale > self.downscale * 1.2 or downscale < self.downscale * 0.8:
+ self.downscale = downscale
+
+ if self.need_update:
+ self.render_buffer = self.prepare_buffer(outputs)
+ self.spp = 1
+ self.need_update = False
+ else:
+ self.render_buffer = (self.render_buffer * self.spp + self.prepare_buffer(outputs)) / (self.spp + 1)
+ self.spp += 1
+
+ dpg.set_value("_log_infer_time", f'{t:.4f}ms ({int(1000/t)} FPS)')
+ dpg.set_value("_log_resolution", f'{int(self.downscale * self.W)}x{int(self.downscale * self.H)}')
+ dpg.set_value("_log_spp", self.spp)
+ dpg.set_value("_texture", self.render_buffer)
+
+
+ def register_dpg(self):
+
+ ### register texture
+
+ with dpg.texture_registry(show=False):
+ dpg.add_raw_texture(self.W, self.H, self.render_buffer, format=dpg.mvFormat_Float_rgb, tag="_texture")
+
+ ### register window
+
+ # the rendered image, as the primary window
+ with dpg.window(tag="_primary_window", width=self.W, height=self.H):
+
+ # add the texture
+ dpg.add_image("_texture")
+
+ dpg.set_primary_window("_primary_window", True)
+
+ # control window
+ with dpg.window(label="Control", tag="_control_window", width=400, height=300):
+
+ # text prompt
+ if self.opt.text is not None:
+ dpg.add_text("text: " + self.opt.text, tag="_log_prompt_text")
+
+ # button theme
+ with dpg.theme() as theme_button:
+ with dpg.theme_component(dpg.mvButton):
+ dpg.add_theme_color(dpg.mvThemeCol_Button, (23, 3, 18))
+ dpg.add_theme_color(dpg.mvThemeCol_ButtonHovered, (51, 3, 47))
+ dpg.add_theme_color(dpg.mvThemeCol_ButtonActive, (83, 18, 83))
+ dpg.add_theme_style(dpg.mvStyleVar_FrameRounding, 5)
+ dpg.add_theme_style(dpg.mvStyleVar_FramePadding, 3, 3)
+
+ # time
+ if not self.opt.test:
+ with dpg.group(horizontal=True):
+ dpg.add_text("Train time: ")
+ dpg.add_text("no data", tag="_log_train_time")
+
+ with dpg.group(horizontal=True):
+ dpg.add_text("Infer time: ")
+ dpg.add_text("no data", tag="_log_infer_time")
+
+ with dpg.group(horizontal=True):
+ dpg.add_text("SPP: ")
+ dpg.add_text("1", tag="_log_spp")
+
+ # train button
+ if not self.opt.test:
+ with dpg.collapsing_header(label="Train", default_open=True):
+ with dpg.group(horizontal=True):
+ dpg.add_text("Train: ")
+
+ def callback_train(sender, app_data):
+ if self.training:
+ self.training = False
+ dpg.configure_item("_button_train", label="start")
+ else:
+ self.training = True
+ dpg.configure_item("_button_train", label="stop")
+
+ dpg.add_button(label="start", tag="_button_train", callback=callback_train)
+ dpg.bind_item_theme("_button_train", theme_button)
+
+ def callback_reset(sender, app_data):
+ @torch.no_grad()
+ def weight_reset(m: nn.Module):
+ reset_parameters = getattr(m, "reset_parameters", None)
+ if callable(reset_parameters):
+ m.reset_parameters()
+ self.trainer.model.apply(fn=weight_reset)
+ self.trainer.model.reset_extra_state() # for cuda_ray density_grid and step_counter
+ self.need_update = True
+
+ dpg.add_button(label="reset", tag="_button_reset", callback=callback_reset)
+ dpg.bind_item_theme("_button_reset", theme_button)
+
+
+ with dpg.group(horizontal=True):
+ dpg.add_text("Checkpoint: ")
+
+ def callback_save(sender, app_data):
+ self.trainer.save_checkpoint(full=True, best=False)
+ dpg.set_value("_log_ckpt", "saved " + os.path.basename(self.trainer.stats["checkpoints"][-1]))
+ self.trainer.epoch += 1 # use epoch to indicate different calls.
+
+ dpg.add_button(label="save", tag="_button_save", callback=callback_save)
+ dpg.bind_item_theme("_button_save", theme_button)
+
+ dpg.add_text("", tag="_log_ckpt")
+
+ # save mesh
+ with dpg.group(horizontal=True):
+ dpg.add_text("Marching Cubes: ")
+
+ def callback_mesh(sender, app_data):
+ self.trainer.save_mesh(resolution=256, threshold=10)
+ dpg.set_value("_log_mesh", "saved " + f'{self.trainer.name}_{self.trainer.epoch}.ply')
+ self.trainer.epoch += 1 # use epoch to indicate different calls.
+
+ dpg.add_button(label="mesh", tag="_button_mesh", callback=callback_mesh)
+ dpg.bind_item_theme("_button_mesh", theme_button)
+
+ dpg.add_text("", tag="_log_mesh")
+
+ with dpg.group(horizontal=True):
+ dpg.add_text("", tag="_log_train_log")
+
+
+ # rendering options
+ with dpg.collapsing_header(label="Options", default_open=True):
+
+ # dynamic rendering resolution
+ with dpg.group(horizontal=True):
+
+ def callback_set_dynamic_resolution(sender, app_data):
+ if self.dynamic_resolution:
+ self.dynamic_resolution = False
+ self.downscale = 1
+ else:
+ self.dynamic_resolution = True
+ self.need_update = True
+
+ dpg.add_checkbox(label="dynamic resolution", default_value=self.dynamic_resolution, callback=callback_set_dynamic_resolution)
+ dpg.add_text(f"{self.W}x{self.H}", tag="_log_resolution")
+
+ # mode combo
+ def callback_change_mode(sender, app_data):
+ self.mode = app_data
+ self.need_update = True
+
+ dpg.add_combo(('image', 'depth'), label='mode', default_value=self.mode, callback=callback_change_mode)
+
+ # bg_color picker
+ def callback_change_bg(sender, app_data):
+ self.bg_color = torch.tensor(app_data[:3], dtype=torch.float32) # only need RGB in [0, 1]
+ self.need_update = True
+
+ dpg.add_color_edit((255, 255, 255), label="Background Color", width=200, tag="_color_editor", no_alpha=True, callback=callback_change_bg)
+
+ # fov slider
+ def callback_set_fovy(sender, app_data):
+ self.cam.fovy = app_data
+ self.need_update = True
+
+ dpg.add_slider_int(label="FoV (vertical)", min_value=1, max_value=120, format="%d deg", default_value=self.cam.fovy, callback=callback_set_fovy)
+
+ # dt_gamma slider
+ def callback_set_dt_gamma(sender, app_data):
+ self.opt.dt_gamma = app_data
+ self.need_update = True
+
+ dpg.add_slider_float(label="dt_gamma", min_value=0, max_value=0.1, format="%.5f", default_value=self.opt.dt_gamma, callback=callback_set_dt_gamma)
+
+ # max_steps slider
+ def callback_set_max_steps(sender, app_data):
+ self.opt.max_steps = app_data
+ self.need_update = True
+
+ dpg.add_slider_int(label="max steps", min_value=1, max_value=1024, format="%d", default_value=self.opt.max_steps, callback=callback_set_max_steps)
+
+ # aabb slider
+ def callback_set_aabb(sender, app_data, user_data):
+ # user_data is the dimension for aabb (xmin, ymin, zmin, xmax, ymax, zmax)
+ self.trainer.model.aabb_infer[user_data] = app_data
+
+ # also change train aabb ? [better not...]
+ #self.trainer.model.aabb_train[user_data] = app_data
+
+ self.need_update = True
+
+ dpg.add_separator()
+ dpg.add_text("Axis-aligned bounding box:")
+
+ with dpg.group(horizontal=True):
+ dpg.add_slider_float(label="x", width=150, min_value=-self.opt.bound, max_value=0, format="%.2f", default_value=-self.opt.bound, callback=callback_set_aabb, user_data=0)
+ dpg.add_slider_float(label="", width=150, min_value=0, max_value=self.opt.bound, format="%.2f", default_value=self.opt.bound, callback=callback_set_aabb, user_data=3)
+
+ with dpg.group(horizontal=True):
+ dpg.add_slider_float(label="y", width=150, min_value=-self.opt.bound, max_value=0, format="%.2f", default_value=-self.opt.bound, callback=callback_set_aabb, user_data=1)
+ dpg.add_slider_float(label="", width=150, min_value=0, max_value=self.opt.bound, format="%.2f", default_value=self.opt.bound, callback=callback_set_aabb, user_data=4)
+
+ with dpg.group(horizontal=True):
+ dpg.add_slider_float(label="z", width=150, min_value=-self.opt.bound, max_value=0, format="%.2f", default_value=-self.opt.bound, callback=callback_set_aabb, user_data=2)
+ dpg.add_slider_float(label="", width=150, min_value=0, max_value=self.opt.bound, format="%.2f", default_value=self.opt.bound, callback=callback_set_aabb, user_data=5)
+
+ # light dir
+ def callback_set_light_dir(sender, app_data, user_data):
+ self.light_dir[user_data] = app_data
+ self.need_update = True
+
+ dpg.add_separator()
+ dpg.add_text("Plane Light Direction:")
+
+ with dpg.group(horizontal=True):
+ dpg.add_slider_float(label="theta", min_value=0, max_value=180, format="%.2f", default_value=self.opt.light_theta, callback=callback_set_light_dir, user_data=0)
+
+ with dpg.group(horizontal=True):
+ dpg.add_slider_float(label="phi", min_value=0, max_value=360, format="%.2f", default_value=self.opt.light_phi, callback=callback_set_light_dir, user_data=1)
+
+ # ambient ratio
+ def callback_set_abm_ratio(sender, app_data):
+ self.ambient_ratio = app_data
+ self.need_update = True
+
+ dpg.add_slider_float(label="ambient", min_value=0, max_value=1.0, format="%.5f", default_value=self.ambient_ratio, callback=callback_set_abm_ratio)
+
+ # shading mode
+ def callback_change_shading(sender, app_data):
+ self.shading = app_data
+ self.need_update = True
+
+ dpg.add_combo(('albedo', 'lambertian', 'textureless', 'normal'), label='shading', default_value=self.shading, callback=callback_change_shading)
+
+
+ # debug info
+ if self.debug:
+ with dpg.collapsing_header(label="Debug"):
+ # pose
+ dpg.add_separator()
+ dpg.add_text("Camera Pose:")
+ dpg.add_text(str(self.cam.pose), tag="_log_pose")
+
+
+ ### register camera handler
+
+ def callback_camera_drag_rotate(sender, app_data):
+
+ if not dpg.is_item_focused("_primary_window"):
+ return
+
+ dx = app_data[1]
+ dy = app_data[2]
+
+ self.cam.orbit(dx, dy)
+ self.need_update = True
+
+ if self.debug:
+ dpg.set_value("_log_pose", str(self.cam.pose))
+
+
+ def callback_camera_wheel_scale(sender, app_data):
+
+ if not dpg.is_item_focused("_primary_window"):
+ return
+
+ delta = app_data
+
+ self.cam.scale(delta)
+ self.need_update = True
+
+ if self.debug:
+ dpg.set_value("_log_pose", str(self.cam.pose))
+
+
+ def callback_camera_drag_pan(sender, app_data):
+
+ if not dpg.is_item_focused("_primary_window"):
+ return
+
+ dx = app_data[1]
+ dy = app_data[2]
+
+ self.cam.pan(dx, dy)
+ self.need_update = True
+
+ if self.debug:
+ dpg.set_value("_log_pose", str(self.cam.pose))
+
+
+ with dpg.handler_registry():
+ dpg.add_mouse_drag_handler(button=dpg.mvMouseButton_Left, callback=callback_camera_drag_rotate)
+ dpg.add_mouse_wheel_handler(callback=callback_camera_wheel_scale)
+ dpg.add_mouse_drag_handler(button=dpg.mvMouseButton_Middle, callback=callback_camera_drag_pan)
+
+
+ dpg.create_viewport(title='torch-ngp', width=self.W, height=self.H, resizable=False)
+
+ # TODO: seems dearpygui doesn't support resizing texture...
+ # def callback_resize(sender, app_data):
+ # self.W = app_data[0]
+ # self.H = app_data[1]
+ # # how to reload texture ???
+
+ # dpg.set_viewport_resize_callback(callback_resize)
+
+ ### global theme
+ with dpg.theme() as theme_no_padding:
+ with dpg.theme_component(dpg.mvAll):
+ # set all padding to 0 to avoid scroll bar
+ dpg.add_theme_style(dpg.mvStyleVar_WindowPadding, 0, 0, category=dpg.mvThemeCat_Core)
+ dpg.add_theme_style(dpg.mvStyleVar_FramePadding, 0, 0, category=dpg.mvThemeCat_Core)
+ dpg.add_theme_style(dpg.mvStyleVar_CellPadding, 0, 0, category=dpg.mvThemeCat_Core)
+
+ dpg.bind_item_theme("_primary_window", theme_no_padding)
+
+ dpg.setup_dearpygui()
+
+ #dpg.show_metrics()
+
+ dpg.show_viewport()
+
+
+ def render(self):
+
+ while dpg.is_dearpygui_running():
+ # update texture every frame
+ if self.training:
+ self.train_step()
+ self.test_step()
+ dpg.render_dearpygui_frame()
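
For reference, a standalone sketch of the pose composition used by OrbitCamera.pose above: translate the camera back by `radius` along its own z-axis, apply the orbit rotation, then offset by the look-at `center`:

    # Standalone mirror of OrbitCamera.pose (values are the class defaults).
    import numpy as np
    from scipy.spatial.transform import Rotation as R

    radius, center = 2.0, np.zeros(3, dtype=np.float32)
    rot = R.from_quat([1, 0, 0, 0])    # scipy quats are [x, y, z, w]

    res = np.eye(4, dtype=np.float32)
    res[2, 3] -= radius                # camera sits at z = -radius in its own frame
    T = np.eye(4, dtype=np.float32)
    T[:3, :3] = rot.as_matrix()
    pose = T @ res                     # rotate the whole camera rig
    pose[:3, 3] -= center              # shift by the look-at point
    print(pose)
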
nerf/network.py ADDED
@@ -0,0 +1,174 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from activation import trunc_exp
+ from .renderer import NeRFRenderer
+
+ import numpy as np
+ from encoding import get_encoder
+
+ from .utils import safe_normalize
+
+ class MLP(nn.Module):
+ def __init__(self, dim_in, dim_out, dim_hidden, num_layers, bias=True):
+ super().__init__()
+ self.dim_in = dim_in
+ self.dim_out = dim_out
+ self.dim_hidden = dim_hidden
+ self.num_layers = num_layers
+
+ net = []
+ for l in range(num_layers):
+ net.append(nn.Linear(self.dim_in if l == 0 else self.dim_hidden, self.dim_out if l == num_layers - 1 else self.dim_hidden, bias=bias))
+
+ self.net = nn.ModuleList(net)
+
+ def forward(self, x):
+ for l in range(self.num_layers):
+ x = self.net[l](x)
+ if l != self.num_layers - 1:
+ x = F.relu(x, inplace=True)
+ return x
+
+
+ class NeRFNetwork(NeRFRenderer):
+ def __init__(self,
+ opt,
+ num_layers=5,
+ hidden_dim=128,
+ num_layers_bg=2,
+ hidden_dim_bg=64,
+ ):
+
+ super().__init__(opt)
+
+ self.num_layers = num_layers
+ self.hidden_dim = hidden_dim
+ self.encoder, self.in_dim = get_encoder('frequency', input_dim=3)
+ self.sigma_net = MLP(self.in_dim, 4, hidden_dim, num_layers, bias=True)
+
+ # background network
+ if self.bg_radius > 0:
+ self.num_layers_bg = num_layers_bg
+ self.hidden_dim_bg = hidden_dim_bg
+ self.encoder_bg, self.in_dim_bg = get_encoder('frequency', input_dim=3)
+ self.bg_net = MLP(self.in_dim_bg, 3, hidden_dim_bg, num_layers_bg, bias=True)
+
+ else:
+ self.bg_net = None
+
+ def gaussian(self, x):
+ # x: [B, N, 3]
+
+ d = (x ** 2).sum(-1)
+ g = 5 * torch.exp(-d / (2 * 0.2 ** 2))
+
+ return g
+
+ def common_forward(self, x):
+ # x: [N, 3], in [-bound, bound]
+
+ # sigma
+ h = self.encoder(x, bound=self.bound)
+
+ h = self.sigma_net(h)
+
+ sigma = trunc_exp(h[..., 0] + self.gaussian(x))
+ albedo = torch.sigmoid(h[..., 1:])
+
+ return sigma, albedo
+
+ # ref: https://github.com/zhaofuq/Instant-NSR/blob/main/nerf/network_sdf.py#L192
+ def finite_difference_normal(self, x, epsilon=1e-2):
+ # x: [N, 3]
+ dx_pos, _ = self.common_forward((x + torch.tensor([[epsilon, 0.00, 0.00]], device=x.device)).clamp(-self.bound, self.bound))
+ dx_neg, _ = self.common_forward((x + torch.tensor([[-epsilon, 0.00, 0.00]], device=x.device)).clamp(-self.bound, self.bound))
+ dy_pos, _ = self.common_forward((x + torch.tensor([[0.00, epsilon, 0.00]], device=x.device)).clamp(-self.bound, self.bound))
+ dy_neg, _ = self.common_forward((x + torch.tensor([[0.00, -epsilon, 0.00]], device=x.device)).clamp(-self.bound, self.bound))
+ dz_pos, _ = self.common_forward((x + torch.tensor([[0.00, 0.00, epsilon]], device=x.device)).clamp(-self.bound, self.bound))
+ dz_neg, _ = self.common_forward((x + torch.tensor([[0.00, 0.00, -epsilon]], device=x.device)).clamp(-self.bound, self.bound))
+
+ normal = torch.stack([
+ 0.5 * (dx_pos - dx_neg) / epsilon,
+ 0.5 * (dy_pos - dy_neg) / epsilon,
+ 0.5 * (dz_pos - dz_neg) / epsilon
+ ], dim=-1)
+
+ return normal
+
+ def forward(self, x, d, l=None, ratio=1, shading='albedo'):
+ # x: [N, 3], in [-bound, bound]
+ # d: [N, 3], view direction, normalized in [-1, 1]
+ # l: [3], plane light direction, normalized in [-1, 1]
+ # ratio: scalar, ambient ratio, 1 == no shading (albedo only), 0 == only shading (textureless)
+
+ if shading == 'albedo':
+ # no need to query normal
+ sigma, color = self.common_forward(x)
+ normal = None
+
+ else:
+ # query normal
+
+ # sigma, albedo = self.common_forward(x)
+ # normal = self.finite_difference_normal(x)
+
+ with torch.enable_grad():
+ x.requires_grad_(True)
+ sigma, albedo = self.common_forward(x)
+ # query gradient
+ normal = - torch.autograd.grad(torch.sum(sigma), x, create_graph=True)[0] # [N, 3]
+
+ # normalize...
+ normal = safe_normalize(normal)
+ normal[torch.isnan(normal)] = 0
+
+ # lambertian shading
+ lambertian = ratio + (1 - ratio) * (normal @ -l).clamp(min=0) # [N,]
+
+ if shading == 'textureless':
+ color = lambertian.unsqueeze(-1).repeat(1, 3)
+ elif shading == 'normal':
+ color = (normal + 1) / 2
+ else: # 'lambertian'
+ color = albedo * lambertian.unsqueeze(-1)
+
+ return sigma, color, normal
+
+
+ def density(self, x):
+ # x: [N, 3], in [-bound, bound]
+
+ sigma, albedo = self.common_forward(x)
+
+ return {
+ 'sigma': sigma,
+ 'albedo': albedo,
+ }
+
+
+ def background(self, d):
+
+ h = self.encoder_bg(d) # [N, C]
+
+ h = self.bg_net(h)
+
+ # sigmoid activation for rgb
+ rgbs = torch.sigmoid(h)
+
+ return rgbs
+
+ # optimizer utils
+ def get_params(self, lr):
+
+ params = [
+ # {'params': self.encoder.parameters(), 'lr': lr * 10},
+ {'params': self.sigma_net.parameters(), 'lr': lr},
+ ]
+
+ if self.bg_radius > 0:
+ # params.append({'params': self.encoder_bg.parameters(), 'lr': lr * 10})
+ params.append({'params': self.bg_net.parameters(), 'lr': lr})
+
+ return params
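
A quick standalone look at the density blob added in common_forward above: g(x) = 5 * exp(-|x|^2 / (2 * 0.2^2)) is ~5 at the origin and near zero beyond |x| ≈ 0.6, biasing early optimization toward placing density at the scene center:

    # Evaluate the density blob from gaussian() at a few radii.
    import torch

    x = torch.tensor([[0.0, 0.0, 0.0], [0.2, 0.0, 0.0], [0.6, 0.0, 0.0]])
    d = (x ** 2).sum(-1)
    g = 5 * torch.exp(-d / (2 * 0.2 ** 2))
    print(g)  # tensor([5.0000, 3.0327, 0.0555])
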
nerf/network_grid.py ADDED
@@ -0,0 +1,181 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ from activation import trunc_exp
6
+ from .renderer import NeRFRenderer
7
+
8
+ import numpy as np
9
+ from encoding import get_encoder
10
+
11
+ from .utils import safe_normalize
12
+
13
+ class MLP(nn.Module):
14
+ def __init__(self, dim_in, dim_out, dim_hidden, num_layers, bias=True):
15
+ super().__init__()
16
+ self.dim_in = dim_in
17
+ self.dim_out = dim_out
18
+ self.dim_hidden = dim_hidden
19
+ self.num_layers = num_layers
20
+
21
+ net = []
22
+ for l in range(num_layers):
23
+ net.append(nn.Linear(self.dim_in if l == 0 else self.dim_hidden, self.dim_out if l == num_layers - 1 else self.dim_hidden, bias=bias))
24
+
25
+ self.net = nn.ModuleList(net)
26
+
27
+ def forward(self, x):
28
+ for l in range(self.num_layers):
29
+ x = self.net[l](x)
30
+ if l != self.num_layers - 1:
31
+ x = F.relu(x, inplace=True)
32
+ return x
33
+
34
+
35
+ class NeRFNetwork(NeRFRenderer):
36
+ def __init__(self,
37
+ opt,
38
+ num_layers=3,
39
+ hidden_dim=64,
40
+ num_layers_bg=2,
41
+ hidden_dim_bg=64,
42
+ ):
43
+
44
+ super().__init__(opt)
45
+
46
+ self.num_layers = num_layers
47
+ self.hidden_dim = hidden_dim
48
+
49
+ self.encoder, self.in_dim = get_encoder('tiledgrid', input_dim=3, desired_resolution=2048 * self.bound)
50
+
51
+ self.sigma_net = MLP(self.in_dim, 4, hidden_dim, num_layers, bias=True)
52
+
53
+ # background network
54
+ if self.bg_radius > 0:
55
+ self.num_layers_bg = num_layers_bg
56
+ self.hidden_dim_bg = hidden_dim_bg
57
+
58
+ # use a very simple network to avoid it learning the prompt...
59
+ # self.encoder_bg, self.in_dim_bg = get_encoder('tiledgrid', input_dim=2, num_levels=4, desired_resolution=2048)
60
+ self.encoder_bg, self.in_dim_bg = get_encoder('frequency', input_dim=3)
61
+
62
+ self.bg_net = MLP(self.in_dim_bg, 3, hidden_dim_bg, num_layers_bg, bias=True)
63
+
64
+ else:
65
+ self.bg_net = None
66
+
67
+ # add a density blob to the scene center
68
+ def gaussian(self, x):
69
+ # x: [B, N, 3]
70
+
71
+ d = (x ** 2).sum(-1)
72
+ g = 5 * torch.exp(-d / (2 * 0.2 ** 2))
73
+
74
+ return g
75
+
76
+ def common_forward(self, x):
77
+ # x: [N, 3], in [-bound, bound]
78
+
79
+ # sigma
80
+ h = self.encoder(x, bound=self.bound)
81
+
82
+ h = self.sigma_net(h)
83
+
84
+ sigma = trunc_exp(h[..., 0] + self.gaussian(x))
85
+ albedo = torch.sigmoid(h[..., 1:])
86
+
87
+ return sigma, albedo
88
+
89
+ # ref: https://github.com/zhaofuq/Instant-NSR/blob/main/nerf/network_sdf.py#L192
90
+ def finite_difference_normal(self, x, epsilon=1e-2):
91
+ # x: [N, 3]
92
+ dx_pos, _ = self.common_forward((x + torch.tensor([[epsilon, 0.00, 0.00]], device=x.device)).clamp(-self.bound, self.bound))
93
+ dx_neg, _ = self.common_forward((x + torch.tensor([[-epsilon, 0.00, 0.00]], device=x.device)).clamp(-self.bound, self.bound))
94
+ dy_pos, _ = self.common_forward((x + torch.tensor([[0.00, epsilon, 0.00]], device=x.device)).clamp(-self.bound, self.bound))
95
+ dy_neg, _ = self.common_forward((x + torch.tensor([[0.00, -epsilon, 0.00]], device=x.device)).clamp(-self.bound, self.bound))
96
+ dz_pos, _ = self.common_forward((x + torch.tensor([[0.00, 0.00, epsilon]], device=x.device)).clamp(-self.bound, self.bound))
97
+ dz_neg, _ = self.common_forward((x + torch.tensor([[0.00, 0.00, -epsilon]], device=x.device)).clamp(-self.bound, self.bound))
98
+
99
+ normal = torch.stack([
100
+ 0.5 * (dx_pos - dx_neg) / epsilon,
101
+ 0.5 * (dy_pos - dy_neg) / epsilon,
102
+ 0.5 * (dz_pos - dz_neg) / epsilon
103
+ ], dim=-1)
104
+
105
+ return normal
106
+
107
+ def forward(self, x, d, l=None, ratio=1, shading='albedo'):
108
+ # x: [N, 3], in [-bound, bound]
109
+ # d: [N, 3], view direction, normalized in [-1, 1]
110
+ # l: [3], light direction, normalized in [-1, 1]
111
+ # ratio: scalar, ambient ratio, 1 == no shading (albedo only), 0 == only shading (textureless)
112
+
113
+ if shading == 'albedo':
114
+ # no need to query normal
115
+ sigma, color = self.common_forward(x)
116
+ normal = None
117
+
118
+ else:
119
+ # query normal
120
+
121
+ sigma, albedo = self.common_forward(x)
122
+ normal = self.finite_difference_normal(x)
123
+
124
+ # with torch.enable_grad():
125
+ # x.requires_grad_(True)
126
+ # sigma, albedo = self.common_forward(x)
127
+ # # query gradient
128
+ # normal = - torch.autograd.grad(torch.sum(sigma), x, create_graph=True)[0] # [N, 3]
129
+
130
+ # normalize...
131
+ normal = safe_normalize(normal)
132
+ normal[torch.isnan(normal)] = 0
133
+
134
+ # lambertian shading
135
+ lambertian = ratio + (1 - ratio) * (normal @ -l).clamp(min=0) # [N,]
136
+
137
+ if shading == 'textureless':
138
+ color = lambertian.unsqueeze(-1).repeat(1, 3)
139
+ elif shading == 'normal':
140
+ color = (normal + 1) / 2
141
+ else: # 'lambertian'
142
+ color = albedo * lambertian.unsqueeze(-1)
143
+
144
+ return sigma, color, normal
145
+
146
+
147
+ def density(self, x):
148
+ # x: [N, 3], in [-bound, bound]
149
+
150
+ sigma, albedo = self.common_forward(x)
151
+
152
+ return {
153
+ 'sigma': sigma,
154
+ 'albedo': albedo,
155
+ }
156
+
157
+
158
+ def background(self, d):
159
+
160
+ h = self.encoder_bg(d) # [N, C]
161
+
162
+ h = self.bg_net(h)
163
+
164
+ # sigmoid activation for rgb
165
+ rgbs = torch.sigmoid(h)
166
+
167
+ return rgbs
168
+
169
+ # optimizer utils
170
+ def get_params(self, lr):
171
+
172
+ params = [
173
+ {'params': self.encoder.parameters(), 'lr': lr * 10},
174
+ {'params': self.sigma_net.parameters(), 'lr': lr},
175
+ ]
176
+
177
+ if self.bg_radius > 0:
178
+ params.append({'params': self.encoder_bg.parameters(), 'lr': lr * 10})
179
+ params.append({'params': self.bg_net.parameters(), 'lr': lr})
180
+
181
+ return params
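
Two details above are worth unpacking. gaussian() adds a density blob 5 * exp(-|x|^2 / (2 * 0.2^2)) to the pre-activation sigma, biasing early training toward geometry at the scene center, and finite_difference_normal() estimates the density gradient with central differences at the cost of six extra sigma queries per point. A minimal standalone sketch of the same central-difference estimator (central_difference_grad is an illustrative name; f is any scalar field):

import torch

def central_difference_grad(f, x, eps=1e-2):
    # f: callable [N, 3] -> [N]; x: [N, 3]
    offsets = eps * torch.eye(3, device=x.device)  # one axis-aligned step per row
    grads = [(f(x + o) - f(x - o)) / (2 * eps) for o in offsets]
    return torch.stack(grads, dim=-1)  # [N, 3] gradient estimate
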
nerf/network_tcnn.py ADDED
@@ -0,0 +1,174 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ from activation import trunc_exp
6
+ from .renderer import NeRFRenderer
7
+ from encoding import get_encoder
8
+
9
+ import numpy as np
10
+ import tinycudann as tcnn
11
+
12
+ class MLP(nn.Module):
13
+ def __init__(self, dim_in, dim_out, dim_hidden, num_layers, bias=True):
14
+ super().__init__()
15
+ self.dim_in = dim_in
16
+ self.dim_out = dim_out
17
+ self.dim_hidden = dim_hidden
18
+ self.num_layers = num_layers
19
+
20
+ net = []
21
+ for l in range(num_layers):
22
+ net.append(nn.Linear(self.dim_in if l == 0 else self.dim_hidden, self.dim_out if l == num_layers - 1 else self.dim_hidden, bias=bias))
23
+
24
+ self.net = nn.ModuleList(net)
25
+
26
+ def forward(self, x):
27
+ for l in range(self.num_layers):
28
+ x = self.net[l](x)
29
+ if l != self.num_layers - 1:
30
+ x = F.relu(x, inplace=True)
31
+ return x
32
+
33
+
34
+ class NeRFNetwork(NeRFRenderer):
35
+ def __init__(self,
36
+ opt,
37
+ num_layers=3,
38
+ hidden_dim=64,
39
+ num_layers_bg=2,
40
+ hidden_dim_bg=64,
41
+ ):
42
+
43
+ super().__init__(opt)
44
+
45
+ self.num_layers = num_layers
46
+ self.hidden_dim = hidden_dim
47
+
48
+ per_level_scale = np.exp2(np.log2(2048 * self.bound / 16) / (16 - 1))
49
+
50
+ self.encoder = tcnn.Encoding(
51
+ n_input_dims=3,
52
+ encoding_config={
53
+ "otype": "HashGrid",
54
+ "n_levels": 16,
55
+ "n_features_per_level": 2,
56
+ "log2_hashmap_size": 19,
57
+ "base_resolution": 16,
58
+ "per_level_scale": per_level_scale,
59
+ },
60
+ )
61
+
62
+ self.sigma_net = MLP(32, 4, hidden_dim, num_layers, bias=True)
63
+
64
+ # background network
65
+ if self.bg_radius > 0:
66
+ self.num_layers_bg = num_layers_bg
67
+ self.hidden_dim_bg = hidden_dim_bg
68
+
69
+ self.encoder_bg, self.in_dim_bg = get_encoder('frequency', input_dim=3)
70
+
71
+ self.bg_net = MLP(self.in_dim_bg, 3, hidden_dim_bg, num_layers_bg, bias=True)
72
+
73
+ else:
74
+ self.bg_net = None
75
+
76
+ def gaussian(self, x):
77
+ # x: [B, N, 3]
78
+
79
+ d = (x ** 2).sum(-1)
80
+ g = 5 * torch.exp(-d / (2 * 0.2 ** 2))
81
+
82
+ return g
83
+
84
+ def common_forward(self, x):
85
+ # x: [N, 3], in [-bound, bound]
86
+
87
+ # sigma
88
+ h = (x + self.bound) / (2 * self.bound) # to [0, 1]
89
+ h = self.encoder(h)
90
+
91
+ h = self.sigma_net(h)
92
+
93
+ sigma = trunc_exp(h[..., 0] + self.gaussian(x))
94
+ albedo = torch.sigmoid(h[..., 1:])
95
+
96
+ return sigma, albedo
97
+
98
+
99
+ def forward(self, x, d, l=None, ratio=1, shading='albedo'):
100
+ # x: [N, 3], in [-bound, bound]
101
+ # d: [N, 3], view direction, normalized in [-1, 1]
102
+ # l: [3], light direction, normalized in [-1, 1]
103
+ # ratio: scalar, ambient ratio, 1 == no shading (albedo only)
104
+
105
+ if shading == 'albedo':
106
+ # no need to query normal
107
+ sigma, color = self.common_forward(x)
108
+ normal = None
109
+
110
+ else:
111
+ # query normal
112
+ has_grad = torch.is_grad_enabled()
113
+
114
+ with torch.enable_grad():
115
+ x.requires_grad_(True)
116
+ sigma, albedo = self.common_forward(x)
117
+ # query gradient
118
+ normal = torch.autograd.grad(torch.sum(sigma), x, create_graph=True)[0] # [N, 3]
119
+
120
+ # normalize...
121
+ normal = normal / (torch.norm(normal, dim=-1, keepdim=True) + 1e-9)
122
+ normal[torch.isnan(normal)] = 0
123
+
124
+ if not has_grad:
125
+ normal = normal.detach()
126
+
127
+ # lambertian shading
128
+ lambertian = ratio + (1 - ratio) * (normal @ l).clamp(min=0) # [N,]
129
+
130
+ if shading == 'textureless':
131
+ color = lambertian.unsqueeze(-1).repeat(1, 3)
132
+ elif shading == 'normal':
133
+ color = (normal + 1) / 2
134
+ else: # 'lambertian'
135
+ color = albedo * lambertian.unsqueeze(-1)
136
+
137
+ return sigma, color, normal
138
+
139
+
140
+ def density(self, x):
141
+ # x: [N, 3], in [-bound, bound]
142
+
143
+ sigma, albedo = self.common_forward(x)
144
+
145
+ return {
146
+ 'sigma': sigma, 'albedo': albedo,
147
+ }
148
+
149
+
150
+ def background(self, d):
151
+ # d: [N, 3], view direction, normalized in [-1, 1]
152
+
153
+ h = self.encoder_bg(d) # [N, C]
154
+
155
+ h = self.bg_net(h)
156
+
157
+ # sigmoid activation for rgb
158
+ rgbs = torch.sigmoid(h)
159
+
160
+ return rgbs
161
+
162
+ # optimizer utils
163
+ def get_params(self, lr):
164
+
165
+ params = [
166
+ {'params': self.encoder.parameters(), 'lr': lr * 10},
167
+ {'params': self.sigma_net.parameters(), 'lr': lr},
168
+ ]
169
+
170
+ if self.bg_radius > 0:
171
+ params.append({'params': self.encoder_bg.parameters(), 'lr': lr * 10})
172
+ params.append({'params': self.bg_net.parameters(), 'lr': lr})
173
+
174
+ return params
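
The per_level_scale line encodes the usual Instant-NGP geometric progression: with 16 hash-grid levels growing from base_resolution 16 to a desired resolution of 2048 * bound, each level scales by exp2(log2(2048 * bound / 16) / 15). A quick standalone sanity check of that relationship (variable names are illustrative):

import numpy as np

bound = 1
n_levels, base_res, target_res = 16, 16, 2048 * bound
per_level_scale = np.exp2(np.log2(target_res / base_res) / (n_levels - 1))
# the finest level reaches the target resolution (up to float error)
assert np.isclose(base_res * per_level_scale ** (n_levels - 1), target_res)
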
nerf/provider.py ADDED
@@ -0,0 +1,214 @@
1
+ import os
2
+ import cv2
3
+ import glob
4
+ import json
5
+ import tqdm
6
+ import random
7
+ import numpy as np
8
+ from scipy.spatial.transform import Slerp, Rotation
9
+
10
+ import trimesh
11
+
12
+ import torch
13
+ from torch.utils.data import DataLoader
14
+
15
+ from .utils import get_rays, safe_normalize
16
+
17
+ def visualize_poses(poses, size=0.1):
18
+ # poses: [B, 4, 4]
19
+
20
+ axes = trimesh.creation.axis(axis_length=4)
21
+ sphere = trimesh.creation.icosphere(radius=1)
22
+ objects = [axes, sphere]
23
+
24
+ for pose in poses:
25
+ # a camera is visualized with 8 line segments.
26
+ pos = pose[:3, 3]
27
+ a = pos + size * pose[:3, 0] + size * pose[:3, 1] + size * pose[:3, 2]
28
+ b = pos - size * pose[:3, 0] + size * pose[:3, 1] + size * pose[:3, 2]
29
+ c = pos - size * pose[:3, 0] - size * pose[:3, 1] + size * pose[:3, 2]
30
+ d = pos + size * pose[:3, 0] - size * pose[:3, 1] + size * pose[:3, 2]
31
+
32
+ segs = np.array([[pos, a], [pos, b], [pos, c], [pos, d], [a, b], [b, c], [c, d], [d, a]])
33
+ segs = trimesh.load_path(segs)
34
+ objects.append(segs)
35
+
36
+ trimesh.Scene(objects).show()
37
+
38
+ def get_view_direction(thetas, phis, overhead, front):
39
+ # phis [B,]; thetas: [B,]
40
+ # front = 0 [0, front)
41
+ # side (left) = 1 [front, 180)
42
+ # back = 2 [180, 180+front)
43
+ # side (right) = 3 [180+front, 360)
44
+ # top = 4 [0, overhead]
45
+ # bottom = 5 [180-overhead, 180]
46
+ res = torch.zeros(thetas.shape[0], dtype=torch.long)
47
+ # first determine by phis
48
+ res[(phis < front)] = 0
49
+ res[(phis >= front) & (phis < np.pi)] = 1
50
+ res[(phis >= np.pi) & (phis < (np.pi + front))] = 2
51
+ res[(phis >= (np.pi + front))] = 3
52
+ # override by thetas
53
+ res[thetas <= overhead] = 4
54
+ res[thetas >= (np.pi - overhead)] = 5
55
+ return res
56
+
57
+
58
+ def rand_poses(size, device, radius_range=[1, 1.5], theta_range=[0, 100], phi_range=[0, 360], return_dirs=False, angle_overhead=30, angle_front=60, jitter=False):
59
+ ''' generate random poses from an orbit camera
60
+ Args:
61
+ size: batch size of generated poses.
62
+ device: where to allocate the output.
63
+ radius_range: [min, max], camera distance to the scene origin
64
+ theta_range: [min, max], should be in [0, pi]
65
+ phi_range: [min, max], should be in [0, 2 * pi]
66
+ Return:
67
+ poses: [size, 4, 4]
68
+ '''
69
+
70
+ theta_range = np.deg2rad(theta_range)
71
+ phi_range = np.deg2rad(phi_range)
72
+ angle_overhead = np.deg2rad(angle_overhead)
73
+ angle_front = np.deg2rad(angle_front)
74
+
75
+ radius = torch.rand(size, device=device) * (radius_range[1] - radius_range[0]) + radius_range[0]
76
+ thetas = torch.rand(size, device=device) * (theta_range[1] - theta_range[0]) + theta_range[0]
77
+ phis = torch.rand(size, device=device) * (phi_range[1] - phi_range[0]) + phi_range[0]
78
+
79
+ centers = torch.stack([
80
+ radius * torch.sin(thetas) * torch.sin(phis),
81
+ radius * torch.cos(thetas),
82
+ radius * torch.sin(thetas) * torch.cos(phis),
83
+ ], dim=-1) # [B, 3]
84
+
85
+ targets = 0
86
+
87
+ # jitters
88
+ if jitter:
89
+ centers = centers + (torch.rand_like(centers) * 0.2 - 0.1)
90
+ targets = targets + torch.randn_like(centers) * 0.2
91
+
92
+ # lookat
93
+ forward_vector = safe_normalize(targets - centers)
94
+ up_vector = torch.FloatTensor([0, -1, 0]).to(device).unsqueeze(0).repeat(size, 1)
95
+ right_vector = safe_normalize(torch.cross(forward_vector, up_vector, dim=-1))
96
+
97
+ if jitter:
98
+ up_noise = torch.randn_like(up_vector) * 0.02
99
+ else:
100
+ up_noise = 0
101
+
102
+ up_vector = safe_normalize(torch.cross(right_vector, forward_vector, dim=-1) + up_noise)
103
+
104
+ poses = torch.eye(4, dtype=torch.float, device=device).unsqueeze(0).repeat(size, 1, 1)
105
+ poses[:, :3, :3] = torch.stack((right_vector, up_vector, forward_vector), dim=-1)
106
+ poses[:, :3, 3] = centers
107
+
108
+ if return_dirs:
109
+ dirs = get_view_direction(thetas, phis, angle_overhead, angle_front)
110
+ else:
111
+ dirs = None
112
+
113
+ return poses, dirs
114
+
115
+
116
+ def circle_poses(device, radius=1.25, theta=60, phi=0, return_dirs=False, angle_overhead=30, angle_front=60):
117
+
118
+ theta = np.deg2rad(theta)
119
+ phi = np.deg2rad(phi)
120
+ angle_overhead = np.deg2rad(angle_overhead)
121
+ angle_front = np.deg2rad(angle_front)
122
+
123
+ thetas = torch.FloatTensor([theta]).to(device)
124
+ phis = torch.FloatTensor([phi]).to(device)
125
+
126
+ centers = torch.stack([
127
+ radius * torch.sin(thetas) * torch.sin(phis),
128
+ radius * torch.cos(thetas),
129
+ radius * torch.sin(thetas) * torch.cos(phis),
130
+ ], dim=-1) # [B, 3]
131
+
132
+ # lookat
133
+ forward_vector = - safe_normalize(centers)
134
+ up_vector = torch.FloatTensor([0, -1, 0]).to(device).unsqueeze(0)
135
+ right_vector = safe_normalize(torch.cross(forward_vector, up_vector, dim=-1))
136
+ up_vector = safe_normalize(torch.cross(right_vector, forward_vector, dim=-1))
137
+
138
+ poses = torch.eye(4, dtype=torch.float, device=device).unsqueeze(0)
139
+ poses[:, :3, :3] = torch.stack((right_vector, up_vector, forward_vector), dim=-1)
140
+ poses[:, :3, 3] = centers
141
+
142
+ if return_dirs:
143
+ dirs = get_view_direction(thetas, phis, angle_overhead, angle_front)
144
+ else:
145
+ dirs = None
146
+
147
+ return poses, dirs
148
+
149
+
150
+ class NeRFDataset:
151
+ def __init__(self, opt, device, type='train', H=256, W=256, size=100):
152
+ super().__init__()
153
+
154
+ self.opt = opt
155
+ self.device = device
156
+ self.type = type # train, val, test
157
+
158
+ self.H = H
159
+ self.W = W
160
+ self.radius_range = opt.radius_range
161
+ self.fovy_range = opt.fovy_range
162
+ self.size = size
163
+
164
+ self.training = self.type in ['train', 'all']
165
+
166
+ self.cx = self.H / 2
167
+ self.cy = self.W / 2
168
+
169
+ # [debug] visualize poses
170
+ # poses, dirs = rand_poses(100, self.device, return_dirs=self.opt.dir_text, radius_range=self.radius_range)
171
+ # visualize_poses(poses.detach().cpu().numpy())
172
+
173
+
174
+ def collate(self, index):
175
+
176
+ B = len(index) # always 1
177
+
178
+ if self.training:
179
+ # random pose on the fly
180
+ poses, dirs = rand_poses(B, self.device, radius_range=self.radius_range, return_dirs=self.opt.dir_text, angle_overhead=self.opt.angle_overhead, angle_front=self.opt.angle_front, jitter=self.opt.jitter_pose)
181
+
182
+ # random focal
183
+ fov = random.random() * (self.fovy_range[1] - self.fovy_range[0]) + self.fovy_range[0]
184
+ focal = self.H / (2 * np.tan(np.deg2rad(fov) / 2))
185
+ intrinsics = np.array([focal, focal, self.cx, self.cy])
186
+ else:
187
+ # circle pose
188
+ phi = (index[0] / self.size) * 360
189
+ poses, dirs = circle_poses(self.device, radius=self.radius_range[1] * 1.2, theta=60, phi=phi, return_dirs=self.opt.dir_text, angle_overhead=self.opt.angle_overhead, angle_front=self.opt.angle_front)
190
+
191
+ # fixed focal
192
+ fov = (self.fovy_range[1] + self.fovy_range[0]) / 2
193
+ focal = self.H / (2 * np.tan(np.deg2rad(fov) / 2))
194
+ intrinsics = np.array([focal, focal, self.cx, self.cy])
195
+
196
+
197
+ # sample a low-resolution but full image for CLIP
198
+ rays = get_rays(poses, intrinsics, self.H, self.W, -1)
199
+
200
+ data = {
201
+ 'H': self.H,
202
+ 'W': self.W,
203
+ 'rays_o': rays['rays_o'],
204
+ 'rays_d': rays['rays_d'],
205
+ 'dir': dirs,
206
+ }
207
+
208
+ return data
209
+
210
+
211
+ def dataloader(self):
212
+ loader = DataLoader(list(range(self.size)), batch_size=1, collate_fn=self.collate, shuffle=self.training, num_workers=0)
213
+ loader._data = self # an ugly fix... we need to access dataset in trainer.
214
+ return loader
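
Both rand_poses and circle_poses place the camera on a sphere with a y-up spherical convention, center = r * (sin θ sin φ, cos θ, sin θ cos φ), then build a look-at frame toward the target. A standalone sketch of just that placement (orbit_center is an illustrative helper, not defined in this repo):

import numpy as np

def orbit_center(radius, theta_deg, phi_deg):
    # y-up spherical convention matching rand_poses / circle_poses
    theta, phi = np.deg2rad(theta_deg), np.deg2rad(phi_deg)
    return radius * np.array([
        np.sin(theta) * np.sin(phi),  # x
        np.cos(theta),                # y (up)
        np.sin(theta) * np.cos(phi),  # z
    ])

print(orbit_center(1.25, 90, 0))  # ~[0, 0, 1.25]: on the +z axis, looking back at the origin
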
nerf/renderer.py ADDED
@@ -0,0 +1,645 @@
1
+ import os
2
+ import math
3
+ import cv2
4
+ import trimesh
5
+ import numpy as np
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+ import mcubes
12
+ import raymarching
13
+ from .utils import custom_meshgrid, safe_normalize
14
+
15
+ def sample_pdf(bins, weights, n_samples, det=False):
16
+ # This implementation is from NeRF
17
+ # bins: [B, T], old_z_vals
18
+ # weights: [B, T - 1], bin weights.
19
+ # return: [B, n_samples], new_z_vals
20
+
21
+ # Get pdf
22
+ weights = weights + 1e-5 # prevent nans
23
+ pdf = weights / torch.sum(weights, -1, keepdim=True)
24
+ cdf = torch.cumsum(pdf, -1)
25
+ cdf = torch.cat([torch.zeros_like(cdf[..., :1]), cdf], -1)
26
+ # Take uniform samples
27
+ if det:
28
+ u = torch.linspace(0. + 0.5 / n_samples, 1. - 0.5 / n_samples, steps=n_samples).to(weights.device)
29
+ u = u.expand(list(cdf.shape[:-1]) + [n_samples])
30
+ else:
31
+ u = torch.rand(list(cdf.shape[:-1]) + [n_samples]).to(weights.device)
32
+
33
+ # Invert CDF
34
+ u = u.contiguous()
35
+ inds = torch.searchsorted(cdf, u, right=True)
36
+ below = torch.max(torch.zeros_like(inds - 1), inds - 1)
37
+ above = torch.min((cdf.shape[-1] - 1) * torch.ones_like(inds), inds)
38
+ inds_g = torch.stack([below, above], -1) # (B, n_samples, 2)
39
+
40
+ matched_shape = [inds_g.shape[0], inds_g.shape[1], cdf.shape[-1]]
41
+ cdf_g = torch.gather(cdf.unsqueeze(1).expand(matched_shape), 2, inds_g)
42
+ bins_g = torch.gather(bins.unsqueeze(1).expand(matched_shape), 2, inds_g)
43
+
44
+ denom = (cdf_g[..., 1] - cdf_g[..., 0])
45
+ denom = torch.where(denom < 1e-5, torch.ones_like(denom), denom)
46
+ t = (u - cdf_g[..., 0]) / denom
47
+ samples = bins_g[..., 0] + t * (bins_g[..., 1] - bins_g[..., 0])
48
+
49
+ return samples
50
+
51
+
52
+ def plot_pointcloud(pc, color=None):
53
+ # pc: [N, 3]
54
+ # color: [N, 3/4]
55
+ print('[visualize points]', pc.shape, pc.dtype, pc.min(0), pc.max(0))
56
+ pc = trimesh.PointCloud(pc, color)
57
+ # axis
58
+ axes = trimesh.creation.axis(axis_length=4)
59
+ # sphere
60
+ sphere = trimesh.creation.icosphere(radius=1)
61
+ trimesh.Scene([pc, axes, sphere]).show()
62
+
63
+
64
+ class NeRFRenderer(nn.Module):
65
+ def __init__(self, opt):
66
+ super().__init__()
67
+
68
+ self.opt = opt
69
+ self.bound = opt.bound
70
+ self.cascade = 1 + math.ceil(math.log2(opt.bound))
71
+ self.grid_size = 128
72
+ self.cuda_ray = opt.cuda_ray
73
+ self.min_near = opt.min_near
74
+ self.density_thresh = opt.density_thresh
75
+ self.bg_radius = opt.bg_radius
76
+
77
+ # prepare aabb with a 6D tensor (xmin, ymin, zmin, xmax, ymax, zmax)
78
+ # NOTE: aabb (can be rectangular) is only used to generate points, we still rely on bound (always cubic) to calculate density grid and hashing.
79
+ aabb_train = torch.FloatTensor([-opt.bound, -opt.bound, -opt.bound, opt.bound, opt.bound, opt.bound])
80
+ aabb_infer = aabb_train.clone()
81
+ self.register_buffer('aabb_train', aabb_train)
82
+ self.register_buffer('aabb_infer', aabb_infer)
83
+
84
+ # extra state for cuda raymarching
85
+ if self.cuda_ray:
86
+ # density grid
87
+ density_grid = torch.zeros([self.cascade, self.grid_size ** 3]) # [CAS, H * H * H]
88
+ density_bitfield = torch.zeros(self.cascade * self.grid_size ** 3 // 8, dtype=torch.uint8) # [CAS * H * H * H // 8]
89
+ self.register_buffer('density_grid', density_grid)
90
+ self.register_buffer('density_bitfield', density_bitfield)
91
+ self.mean_density = 0
92
+ self.iter_density = 0
93
+ # step counter
94
+ step_counter = torch.zeros(16, 2, dtype=torch.int32) # 16 is hardcoded for averaging...
95
+ self.register_buffer('step_counter', step_counter)
96
+ self.mean_count = 0
97
+ self.local_step = 0
98
+
99
+
100
+ def forward(self, x, d):
101
+ raise NotImplementedError()
102
+
103
+ def density(self, x):
104
+ raise NotImplementedError()
105
+
106
+ def color(self, x, d, mask=None, **kwargs):
107
+ raise NotImplementedError()
108
+
109
+ def reset_extra_state(self):
110
+ if not self.cuda_ray:
111
+ return
112
+ # density grid
113
+ self.density_grid.zero_()
114
+ self.mean_density = 0
115
+ self.iter_density = 0
116
+ # step counter
117
+ self.step_counter.zero_()
118
+ self.mean_count = 0
119
+ self.local_step = 0
120
+
121
+ @torch.no_grad()
122
+ def export_mesh(self, path, resolution=None, S=128):
123
+
124
+ if resolution is None:
125
+ resolution = self.grid_size
126
+
127
+ density_thresh = min(self.mean_density, self.density_thresh)
128
+
129
+ sigmas = np.zeros([resolution, resolution, resolution], dtype=np.float32)
130
+
131
+ # query
132
+ X = torch.linspace(-1, 1, resolution).split(S)
133
+ Y = torch.linspace(-1, 1, resolution).split(S)
134
+ Z = torch.linspace(-1, 1, resolution).split(S)
135
+
136
+ for xi, xs in enumerate(X):
137
+ for yi, ys in enumerate(Y):
138
+ for zi, zs in enumerate(Z):
139
+ xx, yy, zz = custom_meshgrid(xs, ys, zs)
140
+ pts = torch.cat([xx.reshape(-1, 1), yy.reshape(-1, 1), zz.reshape(-1, 1)], dim=-1) # [S, 3]
141
+ val = self.density(pts.to(self.density_bitfield.device))
142
+ sigmas[xi * S: xi * S + len(xs), yi * S: yi * S + len(ys), zi * S: zi * S + len(zs)] = val['sigma'].reshape(len(xs), len(ys), len(zs)).detach().cpu().numpy() # [S, 1] --> [x, y, z]
143
+
144
+ vertices, triangles = mcubes.marching_cubes(sigmas, density_thresh)
145
+
146
+ vertices = vertices / (resolution - 1.0) * 2 - 1
147
+ vertices = vertices.astype(np.float32)
148
+ triangles = triangles.astype(np.int32)
149
+
150
+ v = torch.from_numpy(vertices).to(self.density_bitfield.device)
151
+ f = torch.from_numpy(triangles).int().to(self.density_bitfield.device)
152
+
153
+ # mesh = trimesh.Trimesh(vertices, triangles, process=False) # important, process=True leads to seg fault...
154
+ # mesh.export(os.path.join(path, f'mesh.ply'))
155
+
156
+ # texture?
157
+ def _export(v, f, h0=2048, w0=2048, ssaa=1, name=''):
158
+ # v, f: torch Tensor
159
+ device = v.device
160
+ v_np = v.cpu().numpy() # [N, 3]
161
+ f_np = f.cpu().numpy() # [M, 3]
162
+
163
+ print(f'[INFO] running xatlas to unwrap UVs for mesh: v={v_np.shape} f={f_np.shape}')
164
+
165
+ # unwrap uvs
166
+ import xatlas
167
+ import nvdiffrast.torch as dr
168
+ from sklearn.neighbors import NearestNeighbors
169
+ from scipy.ndimage import binary_dilation, binary_erosion
170
+
171
+ glctx = dr.RasterizeCudaContext()
172
+
173
+ atlas = xatlas.Atlas()
174
+ atlas.add_mesh(v_np, f_np)
175
+ chart_options = xatlas.ChartOptions()
176
+ chart_options.max_iterations = 0 # disable merge_chart for faster unwrap...
177
+ atlas.generate(chart_options=chart_options)
178
+ vmapping, ft_np, vt_np = atlas[0] # [N], [M, 3], [N, 2]
179
+
180
+ # vmapping, ft_np, vt_np = xatlas.parametrize(v_np, f_np) # [N], [M, 3], [N, 2]
181
+
182
+ vt = torch.from_numpy(vt_np.astype(np.float32)).float().to(device)
183
+ ft = torch.from_numpy(ft_np.astype(np.int64)).int().to(device)
184
+
185
+ # render uv maps
186
+ uv = vt * 2.0 - 1.0 # uvs to range [-1, 1]
187
+ uv = torch.cat((uv, torch.zeros_like(uv[..., :1]), torch.ones_like(uv[..., :1])), dim=-1) # [N, 4]
188
+
189
+ if ssaa > 1:
190
+ h = int(h0 * ssaa)
191
+ w = int(w0 * ssaa)
192
+ else:
193
+ h, w = h0, w0
194
+
195
+ rast, _ = dr.rasterize(glctx, uv.unsqueeze(0), ft, (h, w)) # [1, h, w, 4]
196
+ xyzs, _ = dr.interpolate(v.unsqueeze(0), rast, f) # [1, h, w, 3]
197
+ mask, _ = dr.interpolate(torch.ones_like(v[:, :1]).unsqueeze(0), rast, f) # [1, h, w, 1]
198
+
199
+ # masked query
200
+ xyzs = xyzs.view(-1, 3)
201
+ mask = (mask > 0).view(-1)
202
+
203
+ sigmas = torch.zeros(h * w, device=device, dtype=torch.float32)
204
+ feats = torch.zeros(h * w, 3, device=device, dtype=torch.float32)
205
+
206
+ if mask.any():
207
+ xyzs = xyzs[mask] # [M, 3]
208
+
209
+ # batched inference to avoid OOM
210
+ all_sigmas = []
211
+ all_feats = []
212
+ head = 0
213
+ while head < xyzs.shape[0]:
214
+ tail = min(head + 640000, xyzs.shape[0])
215
+ results_ = self.density(xyzs[head:tail])
216
+ all_sigmas.append(results_['sigma'].float())
217
+ all_feats.append(results_['albedo'].float())
218
+ head += 640000
219
+
220
+ sigmas[mask] = torch.cat(all_sigmas, dim=0)
221
+ feats[mask] = torch.cat(all_feats, dim=0)
222
+
223
+ sigmas = sigmas.view(h, w, 1)
224
+ feats = feats.view(h, w, -1)
225
+ mask = mask.view(h, w)
226
+
227
+ ### alpha mask
228
+ # deltas = 2 * np.sqrt(3) / 1024
229
+ # alphas = 1 - torch.exp(-sigmas * deltas)
230
+ # alphas_mask = alphas > 0.5
231
+ # feats = feats * alphas_mask
232
+
233
+ # quantize [0.0, 1.0] to [0, 255]
234
+ feats = feats.cpu().numpy()
235
+ feats = (feats * 255).astype(np.uint8)
236
+
237
+ # alphas = alphas.cpu().numpy()
238
+ # alphas = (alphas * 255).astype(np.uint8)
239
+
240
+ ### NN search as an antialiasing ...
241
+ mask = mask.cpu().numpy()
242
+
243
+ inpaint_region = binary_dilation(mask, iterations=3)
244
+ inpaint_region[mask] = 0
245
+
246
+ search_region = mask.copy()
247
+ not_search_region = binary_erosion(search_region, iterations=2)
248
+ search_region[not_search_region] = 0
249
+
250
+ search_coords = np.stack(np.nonzero(search_region), axis=-1)
251
+ inpaint_coords = np.stack(np.nonzero(inpaint_region), axis=-1)
252
+
253
+ knn = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(search_coords)
254
+ _, indices = knn.kneighbors(inpaint_coords)
255
+
256
+ feats[tuple(inpaint_coords.T)] = feats[tuple(search_coords[indices[:, 0]].T)]
257
+
258
+ # do ssaa after the NN search, in numpy
259
+ feats = cv2.cvtColor(feats, cv2.COLOR_RGB2BGR)
260
+
261
+ if ssaa > 1:
262
+ # alphas = cv2.resize(alphas, (w0, h0), interpolation=cv2.INTER_NEAREST)
263
+ feats = cv2.resize(feats, (w0, h0), interpolation=cv2.INTER_LINEAR)
264
+
265
+ # cv2.imwrite(os.path.join(path, f'alpha.png'), alphas)
266
+ cv2.imwrite(os.path.join(path, f'{name}albedo.png'), feats)
267
+
268
+ # save obj (v, vt, f /)
269
+ obj_file = os.path.join(path, f'{name}mesh.obj')
270
+ mtl_file = os.path.join(path, f'{name}mesh.mtl')
271
+
272
+ print(f'[INFO] writing obj mesh to {obj_file}')
273
+ with open(obj_file, "w") as fp:
274
+ fp.write(f'mtllib {name}mesh.mtl \n')
275
+
276
+ print(f'[INFO] writing vertices {v_np.shape}')
277
+ for v in v_np:
278
+ fp.write(f'v {v[0]} {v[1]} {v[2]} \n')
279
+
280
+ print(f'[INFO] writing vertices texture coords {vt_np.shape}')
281
+ for v in vt_np:
282
+ fp.write(f'vt {v[0]} {1 - v[1]} \n')
283
+
284
+ print(f'[INFO] writing faces {f_np.shape}')
285
+ fp.write(f'usemtl mat0 \n')
286
+ for i in range(len(f_np)):
287
+ fp.write(f"f {f_np[i, 0] + 1}/{ft_np[i, 0] + 1} {f_np[i, 1] + 1}/{ft_np[i, 1] + 1} {f_np[i, 2] + 1}/{ft_np[i, 2] + 1} \n")
288
+
289
+ with open(mtl_file, "w") as fp:
290
+ fp.write(f'newmtl mat0 \n')
291
+ fp.write(f'Ka 1.000000 1.000000 1.000000 \n')
292
+ fp.write(f'Kd 1.000000 1.000000 1.000000 \n')
293
+ fp.write(f'Ks 0.000000 0.000000 0.000000 \n')
294
+ fp.write(f'Tr 1.000000 \n')
295
+ fp.write(f'illum 1 \n')
296
+ fp.write(f'Ns 0.000000 \n')
297
+ fp.write(f'map_Kd {name}albedo.png \n')
298
+
299
+ _export(v, f)
300
+
301
+ def run(self, rays_o, rays_d, num_steps=128, upsample_steps=128, light_d=None, ambient_ratio=1.0, shading='albedo', bg_color=None, perturb=False, **kwargs):
302
+ # rays_o, rays_d: [B, N, 3], assumes B == 1
303
+ # bg_color: [BN, 3] in range [0, 1]
304
+ # return: image: [B, N, 3], depth: [B, N]
305
+
306
+ prefix = rays_o.shape[:-1]
307
+ rays_o = rays_o.contiguous().view(-1, 3)
308
+ rays_d = rays_d.contiguous().view(-1, 3)
309
+
310
+ N = rays_o.shape[0] # N = B * N, in fact
311
+ device = rays_o.device
312
+
313
+ results = {}
314
+
315
+ # choose aabb
316
+ aabb = self.aabb_train if self.training else self.aabb_infer
317
+
318
+ # sample steps
319
+ nears, fars = raymarching.near_far_from_aabb(rays_o, rays_d, aabb, self.min_near)
320
+ nears.unsqueeze_(-1)
321
+ fars.unsqueeze_(-1)
322
+
323
+ # random sample light_d if not provided
324
+ if light_d is None:
325
+ # gaussian noise around the ray origin, so the light always faces the view direction (avoids dark faces)
326
+ light_d = (rays_o[0] + torch.randn(3, device=device, dtype=torch.float))
327
+ light_d = safe_normalize(light_d)
328
+
329
+ #print(f'nears = {nears.min().item()} ~ {nears.max().item()}, fars = {fars.min().item()} ~ {fars.max().item()}')
330
+
331
+ z_vals = torch.linspace(0.0, 1.0, num_steps, device=device).unsqueeze(0) # [1, T]
332
+ z_vals = z_vals.expand((N, num_steps)) # [N, T]
333
+ z_vals = nears + (fars - nears) * z_vals # [N, T], in [nears, fars]
334
+
335
+ # perturb z_vals
336
+ sample_dist = (fars - nears) / num_steps
337
+ if perturb:
338
+ z_vals = z_vals + (torch.rand(z_vals.shape, device=device) - 0.5) * sample_dist
339
+ #z_vals = z_vals.clamp(nears, fars) # avoid out of bounds xyzs.
340
+
341
+ # generate xyzs
342
+ xyzs = rays_o.unsqueeze(-2) + rays_d.unsqueeze(-2) * z_vals.unsqueeze(-1) # [N, 1, 3] * [N, T, 1] -> [N, T, 3]
343
+ xyzs = torch.min(torch.max(xyzs, aabb[:3]), aabb[3:]) # a manual clip.
344
+
345
+ #plot_pointcloud(xyzs.reshape(-1, 3).detach().cpu().numpy())
346
+
347
+ # query SDF and RGB
348
+ density_outputs = self.density(xyzs.reshape(-1, 3))
349
+
350
+ #sigmas = density_outputs['sigma'].view(N, num_steps) # [N, T]
351
+ for k, v in density_outputs.items():
352
+ density_outputs[k] = v.view(N, num_steps, -1)
353
+
354
+ # upsample z_vals (nerf-like)
355
+ if upsample_steps > 0:
356
+ with torch.no_grad():
357
+
358
+ deltas = z_vals[..., 1:] - z_vals[..., :-1] # [N, T-1]
359
+ deltas = torch.cat([deltas, sample_dist * torch.ones_like(deltas[..., :1])], dim=-1)
360
+
361
+ alphas = 1 - torch.exp(-deltas * density_outputs['sigma'].squeeze(-1)) # [N, T]
362
+ alphas_shifted = torch.cat([torch.ones_like(alphas[..., :1]), 1 - alphas + 1e-15], dim=-1) # [N, T+1]
363
+ weights = alphas * torch.cumprod(alphas_shifted, dim=-1)[..., :-1] # [N, T]
364
+
365
+ # sample new z_vals
366
+ z_vals_mid = (z_vals[..., :-1] + 0.5 * deltas[..., :-1]) # [N, T-1]
367
+ new_z_vals = sample_pdf(z_vals_mid, weights[:, 1:-1], upsample_steps, det=not self.training).detach() # [N, t]
368
+
369
+ new_xyzs = rays_o.unsqueeze(-2) + rays_d.unsqueeze(-2) * new_z_vals.unsqueeze(-1) # [N, 1, 3] * [N, t, 1] -> [N, t, 3]
370
+ new_xyzs = torch.min(torch.max(new_xyzs, aabb[:3]), aabb[3:]) # a manual clip.
371
+
372
+ # only forward new points to save computation
373
+ new_density_outputs = self.density(new_xyzs.reshape(-1, 3))
374
+ #new_sigmas = new_density_outputs['sigma'].view(N, upsample_steps) # [N, t]
375
+ for k, v in new_density_outputs.items():
376
+ new_density_outputs[k] = v.view(N, upsample_steps, -1)
377
+
378
+ # re-order
379
+ z_vals = torch.cat([z_vals, new_z_vals], dim=1) # [N, T+t]
380
+ z_vals, z_index = torch.sort(z_vals, dim=1)
381
+
382
+ xyzs = torch.cat([xyzs, new_xyzs], dim=1) # [N, T+t, 3]
383
+ xyzs = torch.gather(xyzs, dim=1, index=z_index.unsqueeze(-1).expand_as(xyzs))
384
+
385
+ for k in density_outputs:
386
+ tmp_output = torch.cat([density_outputs[k], new_density_outputs[k]], dim=1)
387
+ density_outputs[k] = torch.gather(tmp_output, dim=1, index=z_index.unsqueeze(-1).expand_as(tmp_output))
388
+
389
+ deltas = z_vals[..., 1:] - z_vals[..., :-1] # [N, T+t-1]
390
+ deltas = torch.cat([deltas, sample_dist * torch.ones_like(deltas[..., :1])], dim=-1)
391
+ alphas = 1 - torch.exp(-deltas * density_outputs['sigma'].squeeze(-1)) # [N, T+t]
392
+ alphas_shifted = torch.cat([torch.ones_like(alphas[..., :1]), 1 - alphas + 1e-15], dim=-1) # [N, T+t+1]
393
+ weights = alphas * torch.cumprod(alphas_shifted, dim=-1)[..., :-1] # [N, T+t]
394
+
395
+ dirs = rays_d.view(-1, 1, 3).expand_as(xyzs)
396
+ for k, v in density_outputs.items():
397
+ density_outputs[k] = v.view(-1, v.shape[-1])
398
+
399
+ sigmas, rgbs, normals = self(xyzs.reshape(-1, 3), dirs.reshape(-1, 3), light_d, ratio=ambient_ratio, shading=shading)
400
+ rgbs = rgbs.view(N, -1, 3) # [N, T+t, 3]
401
+
402
+ #print(xyzs.shape, 'valid_rgb:', mask.sum().item())
403
+ # orientation loss
404
+ if normals is not None:
405
+ normals = normals.view(N, -1, 3)
406
+ # print(weights.shape, normals.shape, dirs.shape)
407
+ loss_orient = weights.detach() * (normals * dirs).sum(-1).clamp(min=0) ** 2
408
+ results['loss_orient'] = loss_orient.mean()
409
+
410
+ # calculate weight_sum (mask)
411
+ weights_sum = weights.sum(dim=-1) # [N]
412
+
413
+ # calculate depth
414
+ ori_z_vals = ((z_vals - nears) / (fars - nears)).clamp(0, 1)
415
+ depth = torch.sum(weights * ori_z_vals, dim=-1)
416
+
417
+ # calculate color
418
+ image = torch.sum(weights.unsqueeze(-1) * rgbs, dim=-2) # [N, 3], in [0, 1]
419
+
420
+ # mix background color
421
+ if self.bg_radius > 0:
422
+ # use the bg model to calculate bg_color
423
+ # sph = raymarching.sph_from_ray(rays_o, rays_d, self.bg_radius) # [N, 2] in [-1, 1]
424
+ bg_color = self.background(rays_d.reshape(-1, 3)) # [N, 3]
425
+ elif bg_color is None:
426
+ bg_color = 1
427
+
428
+ image = image + (1 - weights_sum).unsqueeze(-1) * bg_color
429
+
430
+ image = image.view(*prefix, 3)
431
+ depth = depth.view(*prefix)
432
+
433
+ mask = (nears < fars).reshape(*prefix)
434
+
435
+ results['image'] = image
436
+ results['depth'] = depth
437
+ results['weights_sum'] = weights_sum
438
+ results['mask'] = mask
439
+
440
+ return results
441
+
442
+
443
+ def run_cuda(self, rays_o, rays_d, dt_gamma=0, light_d=None, ambient_ratio=1.0, shading='albedo', bg_color=None, perturb=False, force_all_rays=False, max_steps=1024, T_thresh=1e-4, **kwargs):
444
+ # rays_o, rays_d: [B, N, 3], assumes B == 1
445
+ # return: image: [B, N, 3], depth: [B, N]
446
+
447
+ prefix = rays_o.shape[:-1]
448
+ rays_o = rays_o.contiguous().view(-1, 3)
449
+ rays_d = rays_d.contiguous().view(-1, 3)
450
+
451
+ N = rays_o.shape[0] # N = B * N, in fact
452
+ device = rays_o.device
453
+
454
+ # pre-calculate near far
455
+ nears, fars = raymarching.near_far_from_aabb(rays_o, rays_d, self.aabb_train if self.training else self.aabb_infer)
456
+
457
+ # random sample light_d if not provided
458
+ if light_d is None:
459
+ # gaussian noise around the ray origin, so the light always faces the view direction (avoids dark faces)
460
+ light_d = (rays_o[0] + torch.randn(3, device=device, dtype=torch.float))
461
+ light_d = safe_normalize(light_d)
462
+
463
+ results = {}
464
+
465
+ if self.training:
466
+ # setup counter
467
+ counter = self.step_counter[self.local_step % 16]
468
+ counter.zero_() # set to 0
469
+ self.local_step += 1
470
+
471
+ xyzs, dirs, deltas, rays = raymarching.march_rays_train(rays_o, rays_d, self.bound, self.density_bitfield, self.cascade, self.grid_size, nears, fars, counter, self.mean_count, perturb, 128, force_all_rays, dt_gamma, max_steps)
472
+
473
+ #plot_pointcloud(xyzs.reshape(-1, 3).detach().cpu().numpy())
474
+
475
+ sigmas, rgbs, normals = self(xyzs, dirs, light_d, ratio=ambient_ratio, shading=shading)
476
+
477
+ #print(f'valid RGB query ratio: {mask.sum().item() / mask.shape[0]} (total = {mask.sum().item()})')
478
+
479
+ weights_sum, depth, image = raymarching.composite_rays_train(sigmas, rgbs, deltas, rays, T_thresh)
480
+
481
+ # orientation loss
482
+ if normals is not None:
483
+ weights = 1 - torch.exp(-sigmas)
484
+ loss_orient = weights.detach() * (normals * dirs).sum(-1).clamp(min=0) ** 2
485
+ results['loss_orient'] = loss_orient.mean()
486
+
487
+ else:
488
+
489
+ # allocate outputs
490
+ dtype = torch.float32
491
+
492
+ weights_sum = torch.zeros(N, dtype=dtype, device=device)
493
+ depth = torch.zeros(N, dtype=dtype, device=device)
494
+ image = torch.zeros(N, 3, dtype=dtype, device=device)
495
+
496
+ n_alive = N
497
+ rays_alive = torch.arange(n_alive, dtype=torch.int32, device=device) # [N]
498
+ rays_t = nears.clone() # [N]
499
+
500
+ step = 0
501
+
502
+ while step < max_steps: # hard coded max step
503
+
504
+ # count alive rays
505
+ n_alive = rays_alive.shape[0]
506
+
507
+ # exit loop
508
+ if n_alive <= 0:
509
+ break
510
+
511
+ # decide compact_steps
512
+ n_step = max(min(N // n_alive, 8), 1)
513
+
514
+ xyzs, dirs, deltas = raymarching.march_rays(n_alive, n_step, rays_alive, rays_t, rays_o, rays_d, self.bound, self.density_bitfield, self.cascade, self.grid_size, nears, fars, 128, perturb if step == 0 else False, dt_gamma, max_steps)
515
+
516
+ sigmas, rgbs, normals = self(xyzs, dirs, light_d, ratio=ambient_ratio, shading=shading)
517
+
518
+ raymarching.composite_rays(n_alive, n_step, rays_alive, rays_t, sigmas, rgbs, deltas, weights_sum, depth, image, T_thresh)
519
+
520
+ rays_alive = rays_alive[rays_alive >= 0]
521
+ #print(f'step = {step}, n_step = {n_step}, n_alive = {n_alive}, xyzs: {xyzs.shape}')
522
+
523
+ step += n_step
524
+
525
+ # mix background color
526
+ if self.bg_radius > 0:
527
+
528
+ # use the bg model to calculate bg_color
529
+ # sph = raymarching.sph_from_ray(rays_o, rays_d, self.bg_radius) # [N, 2] in [-1, 1]
530
+ bg_color = self.background(rays_d) # [N, 3]
531
+
532
+ elif bg_color is None:
533
+ bg_color = 1
534
+
535
+ image = image + (1 - weights_sum).unsqueeze(-1) * bg_color
536
+ image = image.view(*prefix, 3)
537
+
538
+ depth = torch.clamp(depth - nears, min=0) / (fars - nears)
539
+ depth = depth.view(*prefix)
540
+
541
+ weights_sum = weights_sum.reshape(*prefix)
542
+
543
+ mask = (nears < fars).reshape(*prefix)
544
+
545
+ results['image'] = image
546
+ results['depth'] = depth
547
+ results['weights_sum'] = weights_sum
548
+ results['mask'] = mask
549
+
550
+ return results
551
+
552
+
553
+ @torch.no_grad()
554
+ def update_extra_state(self, decay=0.95, S=128):
555
+ # call before each epoch to update extra states.
556
+
557
+ if not self.cuda_ray:
558
+ return
559
+
560
+ ### update density grid
561
+ tmp_grid = - torch.ones_like(self.density_grid)
562
+
563
+ X = torch.arange(self.grid_size, dtype=torch.int32, device=self.density_bitfield.device).split(S)
564
+ Y = torch.arange(self.grid_size, dtype=torch.int32, device=self.density_bitfield.device).split(S)
565
+ Z = torch.arange(self.grid_size, dtype=torch.int32, device=self.density_bitfield.device).split(S)
566
+
567
+ for xs in X:
568
+ for ys in Y:
569
+ for zs in Z:
570
+
571
+ # construct points
572
+ xx, yy, zz = custom_meshgrid(xs, ys, zs)
573
+ coords = torch.cat([xx.reshape(-1, 1), yy.reshape(-1, 1), zz.reshape(-1, 1)], dim=-1) # [N, 3], in [0, 128)
574
+ indices = raymarching.morton3D(coords).long() # [N]
575
+ xyzs = 2 * coords.float() / (self.grid_size - 1) - 1 # [N, 3] in [-1, 1]
576
+
577
+ # cascading
578
+ for cas in range(self.cascade):
579
+ bound = min(2 ** cas, self.bound)
580
+ half_grid_size = bound / self.grid_size
581
+ # scale to current cascade's resolution
582
+ cas_xyzs = xyzs * (bound - half_grid_size)
583
+ # add noise in [-hgs, hgs]
584
+ cas_xyzs += (torch.rand_like(cas_xyzs) * 2 - 1) * half_grid_size
585
+ # query density
586
+ sigmas = self.density(cas_xyzs)['sigma'].reshape(-1).detach()
587
+ # assign
588
+ tmp_grid[cas, indices] = sigmas
589
+
590
+ # ema update
591
+ valid_mask = self.density_grid >= 0
592
+ self.density_grid[valid_mask] = torch.maximum(self.density_grid[valid_mask] * decay, tmp_grid[valid_mask])
593
+ self.mean_density = torch.mean(self.density_grid[valid_mask]).item()
594
+ self.iter_density += 1
595
+
596
+ # convert to bitfield
597
+ density_thresh = min(self.mean_density, self.density_thresh)
598
+ self.density_bitfield = raymarching.packbits(self.density_grid, density_thresh, self.density_bitfield)
599
+
600
+ ### update step counter
601
+ total_step = min(16, self.local_step)
602
+ if total_step > 0:
603
+ self.mean_count = int(self.step_counter[:total_step, 0].sum().item() / total_step)
604
+ self.local_step = 0
605
+
606
+ # print(f'[density grid] min={self.density_grid.min().item():.4f}, max={self.density_grid.max().item():.4f}, mean={self.mean_density:.4f}, occ_rate={(self.density_grid > density_thresh).sum() / (128**3 * self.cascade):.3f} | [step counter] mean={self.mean_count}')
607
+
608
+
609
+ def render(self, rays_o, rays_d, staged=False, max_ray_batch=4096, **kwargs):
610
+ # rays_o, rays_d: [B, N, 3], assumes B == 1
611
+ # return: pred_rgb: [B, N, 3]
612
+
613
+ if self.cuda_ray:
614
+ _run = self.run_cuda
615
+ else:
616
+ _run = self.run
617
+
618
+ B, N = rays_o.shape[:2]
619
+ device = rays_o.device
620
+
621
+ # never stage when cuda_ray
622
+ if staged and not self.cuda_ray:
623
+ depth = torch.empty((B, N), device=device)
624
+ image = torch.empty((B, N, 3), device=device)
625
+ weights_sum = torch.empty((B, N), device=device)
626
+
627
+ for b in range(B):
628
+ head = 0
629
+ while head < N:
630
+ tail = min(head + max_ray_batch, N)
631
+ results_ = _run(rays_o[b:b+1, head:tail], rays_d[b:b+1, head:tail], **kwargs)
632
+ depth[b:b+1, head:tail] = results_['depth']
633
+ weights_sum[b:b+1, head:tail] = results_['weights_sum']
634
+ image[b:b+1, head:tail] = results_['image']
635
+ head += max_ray_batch
636
+
637
+ results = {}
638
+ results['depth'] = depth
639
+ results['image'] = image
640
+ results['weights_sum'] = weights_sum
641
+
642
+ else:
643
+ results = _run(rays_o, rays_d, **kwargs)
644
+
645
+ return results
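
Both run() and run_cuda() composite samples with the standard volume-rendering quadrature: alpha_i = 1 - exp(-sigma_i * delta_i), w_i = alpha_i * prod_{j<i} (1 - alpha_j), and the pixel color is sum_i w_i * rgb_i. A minimal standalone version of that weighting, mirroring the tensor shapes used in run() (composite is an illustrative name):

import torch

def composite(sigmas, rgbs, deltas):
    # sigmas: [N, T], rgbs: [N, T, 3], deltas: [N, T] step sizes along each ray
    alphas = 1 - torch.exp(-sigmas * deltas)  # [N, T]
    # transmittance: probability the ray survives all earlier samples
    trans = torch.cumprod(
        torch.cat([torch.ones_like(alphas[:, :1]), 1 - alphas + 1e-15], dim=-1),
        dim=-1)[:, :-1]
    weights = alphas * trans  # [N, T]
    image = (weights.unsqueeze(-1) * rgbs).sum(dim=-2)  # [N, 3]
    return image, weights
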
nerf/sd.py ADDED
@@ -0,0 +1,203 @@
1
+ from transformers import CLIPTextModel, CLIPTokenizer, logging
2
+ from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
3
+
4
+ # suppress partial model loading warning
5
+ logging.set_verbosity_error()
6
+
7
+ import os
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+
12
+ import time
13
+
14
+ class StableDiffusion(nn.Module):
15
+ def __init__(self, device):
16
+ super().__init__()
17
+
18
+ try:
19
+ self.token = os.environ['TOKEN']
20
+ print(f'[INFO] loaded hugging face access token from environment variable TOKEN')
21
+ except KeyError:
22
+ self.token = True
23
+ print(f'[INFO] trying to load hugging face access token from the default location, make sure you have run `huggingface-cli login`.')
24
+
25
+ self.device = device
26
+ self.num_train_timesteps = 1000
27
+ self.min_step = int(self.num_train_timesteps * 0.02)
28
+ self.max_step = int(self.num_train_timesteps * 0.98)
29
+
30
+ print(f'[INFO] loading stable diffusion...')
31
+
32
+ # 1. Load the autoencoder model which will be used to decode the latents into image space.
33
+ self.vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae", use_auth_token=self.token).to(self.device)
34
+
35
+ # 2. Load the tokenizer and text encoder to tokenize and encode the text.
36
+ self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
37
+ self.text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(self.device)
38
+
39
+ # 3. The UNet model for generating the latents.
40
+ self.unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet", use_auth_token=self.token).to(self.device)
41
+
42
+ # 4. Create a scheduler for inference
43
+ self.scheduler = PNDMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=self.num_train_timesteps)
44
+ self.alphas = self.scheduler.alphas_cumprod.to(self.device) # for convenience
45
+
46
+ print(f'[INFO] loaded stable diffusion!')
47
+
48
+ def get_text_embeds(self, prompt):
49
+ # Tokenize text and get embeddings
50
+ text_input = self.tokenizer(prompt, padding='max_length', max_length=self.tokenizer.model_max_length, truncation=True, return_tensors='pt')
51
+
52
+ with torch.no_grad():
53
+ text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
54
+
55
+ # Do the same for unconditional embeddings
56
+ uncond_input = self.tokenizer([''] * len(prompt), padding='max_length', max_length=self.tokenizer.model_max_length, return_tensors='pt')
57
+
58
+ with torch.no_grad():
59
+ uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
60
+
61
+ # Cat for final embeddings
62
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
63
+ return text_embeddings
64
+
65
+
66
+ def train_step(self, text_embeddings, pred_rgb, guidance_scale=100):
67
+
68
+ # interp to 512x512 to be fed into vae.
69
+
70
+ # _t = time.time()
71
+ pred_rgb_512 = F.interpolate(pred_rgb, (512, 512), mode='bilinear', align_corners=False)
72
+ # torch.cuda.synchronize(); print(f'[TIME] guiding: interp {time.time() - _t:.4f}s')
73
+
74
+ # timestep ~ U(0.02, 0.98) to avoid very high/low noise level
75
+ t = torch.randint(self.min_step, self.max_step + 1, [1], dtype=torch.long, device=self.device)
76
+
77
+ # encode image into latents with vae, requires grad!
78
+ # _t = time.time()
79
+ latents = self.encode_imgs(pred_rgb_512)
80
+ # torch.cuda.synchronize(); print(f'[TIME] guiding: vae enc {time.time() - _t:.4f}s')
81
+
82
+ # predict the noise residual with unet, NO grad!
83
+ # _t = time.time()
84
+ with torch.no_grad():
85
+ # add noise
86
+ noise = torch.randn_like(latents)
87
+ latents_noisy = self.scheduler.add_noise(latents, noise, t)
88
+ # pred noise
89
+ latent_model_input = torch.cat([latents_noisy] * 2)
90
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
91
+ # torch.cuda.synchronize(); print(f'[TIME] guiding: unet {time.time() - _t:.4f}s')
92
+
93
+ # perform guidance (high scale from paper!)
94
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
95
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
96
+
97
+ # w(t), sigma_t^2
98
+ w = (1 - self.alphas[t])
99
+ # w = self.alphas[t] ** 0.5 * (1 - self.alphas[t])
100
+ grad = w * (noise_pred - noise)
101
+
102
+ # clip grad for stable training?
103
+ # grad = grad.clamp(-1, 1)
104
+
105
+ # manually backward, since we omitted an item in grad and cannot simply autodiff.
106
+ # _t = time.time()
107
+ latents.backward(gradient=grad, retain_graph=True)
108
+ # torch.cuda.synchronize(); print(f'[TIME] guiding: backward {time.time() - _t:.4f}s')
109
+
110
+ return 0 # dummy loss value
111
+
112
+ def produce_latents(self, text_embeddings, height=512, width=512, num_inference_steps=50, guidance_scale=7.5, latents=None):
113
+
114
+ if latents is None:
115
+ latents = torch.randn((text_embeddings.shape[0] // 2, self.unet.in_channels, height // 8, width // 8), device=self.device)
116
+
117
+ self.scheduler.set_timesteps(num_inference_steps)
118
+
119
+ with torch.autocast('cuda'):
120
+ for i, t in enumerate(self.scheduler.timesteps):
121
+ # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
122
+ latent_model_input = torch.cat([latents] * 2)
123
+
124
+ # predict the noise residual
125
+ with torch.no_grad():
126
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)['sample']
127
+
128
+ # perform guidance
129
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
130
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
131
+
132
+ # compute the previous noisy sample x_t -> x_t-1
133
+ latents = self.scheduler.step(noise_pred, t, latents)['prev_sample']
134
+
135
+ return latents
136
+
137
+ def decode_latents(self, latents):
138
+
139
+ latents = 1 / 0.18215 * latents
140
+
141
+ with torch.no_grad():
142
+ imgs = self.vae.decode(latents).sample
143
+
144
+ imgs = (imgs / 2 + 0.5).clamp(0, 1)
145
+
146
+ return imgs
147
+
148
+ def encode_imgs(self, imgs):
149
+ # imgs: [B, 3, H, W]
150
+
151
+ imgs = 2 * imgs - 1
152
+
153
+ posterior = self.vae.encode(imgs).latent_dist
154
+ latents = posterior.sample() * 0.18215
155
+
156
+ return latents
157
+
158
+ def prompt_to_img(self, prompts, height=512, width=512, num_inference_steps=50, guidance_scale=7.5, latents=None):
159
+
160
+ if isinstance(prompts, str):
161
+ prompts = [prompts]
162
+
163
+ # Prompts -> text embeds
164
+ text_embeds = self.get_text_embeds(prompts) # [2, 77, 768]
165
+
166
+ # Text embeds -> img latents
167
+ latents = self.produce_latents(text_embeds, height=height, width=width, latents=latents, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale) # [1, 4, 64, 64]
168
+
169
+ # Img latents -> imgs
170
+ imgs = self.decode_latents(latents) # [1, 3, 512, 512]
171
+
172
+ # Img to Numpy
173
+ imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy()
174
+ imgs = (imgs * 255).round().astype('uint8')
175
+
176
+ return imgs
177
+
178
+
179
+ if __name__ == '__main__':
180
+
181
+ import argparse
182
+ import matplotlib.pyplot as plt
183
+
184
+ parser = argparse.ArgumentParser()
185
+ parser.add_argument('prompt', type=str)
186
+ parser.add_argument('-H', type=int, default=512)
187
+ parser.add_argument('-W', type=int, default=512)
188
+ parser.add_argument('--steps', type=int, default=50)
189
+ opt = parser.parse_args()
190
+
191
+ device = torch.device('cuda')
192
+
193
+ sd = StableDiffusion(device)
194
+
195
+ imgs = sd.prompt_to_img(opt.prompt, opt.H, opt.W, opt.steps)
196
+
197
+ # visualize image
198
+ plt.imshow(imgs[0])
199
+ plt.show()
200
+
201
+
202
+
203
+
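
The core trick in train_step is that latents.backward(gradient=grad) injects the score-distillation gradient without ever forming a scalar loss; it is equivalent to backpropagating loss = (grad.detach() * latents).sum(), whose derivative with respect to latents is exactly grad. A tiny standalone check of that identity (all names illustrative):

import torch

x = torch.randn(4, requires_grad=True)
y = x * 3                  # stand-in for the differentiable latents
grad = torch.randn(4)      # stand-in for w(t) * (noise_pred - noise)

y.backward(gradient=grad)  # the pattern used in train_step
assert torch.allclose(x.grad, grad * 3)  # chain rule: dy/dx = 3
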
nerf/utils.py ADDED
@@ -0,0 +1,950 @@
1
+ import os
2
+ import glob
3
+ import tqdm
4
+ import math
5
+ import imageio
6
+ import random
7
+ import warnings
8
+ import tensorboardX
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+
13
+ import time
14
+ from datetime import datetime
15
+
16
+ import cv2
17
+ import matplotlib.pyplot as plt
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.optim as optim
22
+ import torch.nn.functional as F
23
+ import torch.distributed as dist
24
+ from torch.utils.data import Dataset, DataLoader
25
+
26
+ import trimesh
27
+ from rich.console import Console
28
+ from torch_ema import ExponentialMovingAverage
29
+
30
+ from packaging import version as pver
31
+
32
+ def custom_meshgrid(*args):
33
+ # ref: https://pytorch.org/docs/stable/generated/torch.meshgrid.html?highlight=meshgrid#torch.meshgrid
34
+ if pver.parse(torch.__version__) < pver.parse('1.10'):
35
+ return torch.meshgrid(*args)
36
+ else:
37
+ return torch.meshgrid(*args, indexing='ij')
38
+
39
+ def safe_normalize(x, eps=1e-20):
40
+ return x / torch.sqrt(torch.clamp(torch.sum(x * x, -1, keepdim=True), min=eps))
41
+
42
+ @torch.cuda.amp.autocast(enabled=False)
43
+ def get_rays(poses, intrinsics, H, W, N=-1, error_map=None):
44
+ ''' get rays
45
+ Args:
46
+ poses: [B, 4, 4], cam2world
47
+ intrinsics: [4]
48
+ H, W, N: int
49
+ error_map: [B, 128 * 128], sample probability based on training error
50
+ Returns:
51
+ rays_o, rays_d: [B, N, 3]
52
+ inds: [B, N]
53
+ '''
54
+
55
+ device = poses.device
56
+ B = poses.shape[0]
57
+ fx, fy, cx, cy = intrinsics
58
+
59
+ i, j = custom_meshgrid(torch.linspace(0, W-1, W, device=device), torch.linspace(0, H-1, H, device=device))
60
+ i = i.t().reshape([1, H*W]).expand([B, H*W]) + 0.5
61
+ j = j.t().reshape([1, H*W]).expand([B, H*W]) + 0.5
62
+
63
+ results = {}
64
+
65
+ if N > 0:
66
+ N = min(N, H*W)
67
+
68
+ if error_map is None:
69
+ inds = torch.randint(0, H*W, size=[N], device=device) # may duplicate
70
+ inds = inds.expand([B, N])
71
+ else:
72
+
73
+ # weighted sample on a low-reso grid
74
+ inds_coarse = torch.multinomial(error_map.to(device), N, replacement=False) # [B, N], but in [0, 128*128)
75
+
76
+ # map to the original resolution with random perturb.
77
+ inds_x, inds_y = inds_coarse // 128, inds_coarse % 128 # `//` will throw a warning in torch 1.10... anyway.
78
+ sx, sy = H / 128, W / 128
79
+ inds_x = (inds_x * sx + torch.rand(B, N, device=device) * sx).long().clamp(max=H - 1)
80
+ inds_y = (inds_y * sy + torch.rand(B, N, device=device) * sy).long().clamp(max=W - 1)
81
+ inds = inds_x * W + inds_y
82
+
83
+ results['inds_coarse'] = inds_coarse # need this when updating error_map
84
+
85
+ i = torch.gather(i, -1, inds)
86
+ j = torch.gather(j, -1, inds)
87
+
88
+ results['inds'] = inds
89
+
90
+ else:
91
+ inds = torch.arange(H*W, device=device).expand([B, H*W])
92
+
93
+ zs = torch.ones_like(i)
94
+ xs = (i - cx) / fx * zs
95
+ ys = (j - cy) / fy * zs
96
+ directions = torch.stack((xs, ys, zs), dim=-1)
97
+ directions = safe_normalize(directions)
98
+ rays_d = directions @ poses[:, :3, :3].transpose(-1, -2) # (B, N, 3)
99
+
100
+ rays_o = poses[..., :3, 3] # [B, 3]
101
+ rays_o = rays_o[..., None, :].expand_as(rays_d) # [B, N, 3]
102
+
103
+ results['rays_o'] = rays_o
104
+ results['rays_d'] = rays_d
105
+
106
+ return results
107
+
108
+
109
+ def seed_everything(seed):
110
+ random.seed(seed)
111
+ os.environ['PYTHONHASHSEED'] = str(seed)
112
+ np.random.seed(seed)
113
+ torch.manual_seed(seed)
114
+ torch.cuda.manual_seed(seed)
115
+ #torch.backends.cudnn.deterministic = True
116
+ #torch.backends.cudnn.benchmark = True
117
+
118
+
119
+ def torch_vis_2d(x, renormalize=False):
120
+ # x: [3, H, W] or [1, H, W] or [H, W]
121
+ import matplotlib.pyplot as plt
122
+ import numpy as np
123
+ import torch
124
+
125
+ if isinstance(x, torch.Tensor):
126
+ if len(x.shape) == 3:
127
+ x = x.permute(1,2,0).squeeze()
128
+ x = x.detach().cpu().numpy()
129
+
130
+ print(f'[torch_vis_2d] {x.shape}, {x.dtype}, {x.min()} ~ {x.max()}')
131
+
132
+ x = x.astype(np.float32)
133
+
134
+ # renormalize
135
+ if renormalize:
136
+ x = (x - x.min(axis=0, keepdims=True)) / (x.max(axis=0, keepdims=True) - x.min(axis=0, keepdims=True) + 1e-8)
137
+
138
+ plt.imshow(x)
139
+ plt.show()
140
+
141
+ @torch.jit.script
142
+ def linear_to_srgb(x):
143
+ return torch.where(x < 0.0031308, 12.92 * x, 1.055 * x ** 0.41666 - 0.055)
144
+
145
+
146
+ @torch.jit.script
147
+ def srgb_to_linear(x):
148
+ return torch.where(x < 0.04045, x / 12.92, ((x + 0.055) / 1.055) ** 2.4)
149
+
150
+
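+ # Editorial sanity check: the two jit-scripted conversions above should be
+ # approximately inverse on [0, 1] (0.41666 only approximates 1/2.4, so the
+ # roundtrip is close but not bit-exact).
+ def _demo_srgb_roundtrip():
+     import torch
+     x = torch.linspace(0, 1, 101)
+     assert torch.allclose(srgb_to_linear(linear_to_srgb(x)), x, atol=1e-3)
+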
151
+ class Trainer(object):
152
+ def __init__(self,
153
+ name, # name of this experiment
154
+ opt, # extra conf
155
+ model, # network
156
+ guidance, # guidance network
157
+ criterion=None, # loss function, if None, assume inline implementation in train_step
158
+ optimizer=None, # optimizer
159
+ ema_decay=None, # if use EMA, set the decay
160
+ lr_scheduler=None, # scheduler
161
+ metrics=[], # metrics for evaluation; if empty, use val_loss to measure performance, else use the first metric.
162
+ local_rank=0, # which GPU am I
163
+ world_size=1, # total num of GPUs
164
+ device=None, # device to use; None picks automatically (cuda if available, else cpu)
165
+ mute=False, # whether to mute all print
166
+ fp16=False, # whether to use amp mixed-precision (fp16) training
167
+ eval_interval=1, # evaluate once every `eval_interval` epochs
168
+ max_keep_ckpt=2, # max num of saved ckpts on disk
169
+ workspace='workspace', # workspace to save logs & ckpts
170
+ best_mode='min', # 'min': the smaller the result, the better; 'max': the opposite
171
+ use_loss_as_metric=True, # use loss as the first metric
172
+ report_metric_at_train=False, # also report metrics at training
173
+ use_checkpoint="latest", # which ckpt to use at init time
174
+ use_tensorboardX=True, # whether to use tensorboard for logging
175
+ scheduler_update_every_step=False, # whether to call scheduler.step() after every train step
176
+ ):
177
+
178
+ self.name = name
179
+ self.opt = opt
180
+ self.mute = mute
181
+ self.metrics = metrics
182
+ self.local_rank = local_rank
183
+ self.world_size = world_size
184
+ self.workspace = workspace
185
+ self.ema_decay = ema_decay
186
+ self.fp16 = fp16
187
+ self.best_mode = best_mode
188
+ self.use_loss_as_metric = use_loss_as_metric
189
+ self.report_metric_at_train = report_metric_at_train
190
+ self.max_keep_ckpt = max_keep_ckpt
191
+ self.eval_interval = eval_interval
192
+ self.use_checkpoint = use_checkpoint
193
+ self.use_tensorboardX = use_tensorboardX
194
+ self.time_stamp = time.strftime("%Y-%m-%d_%H-%M-%S")
195
+ self.scheduler_update_every_step = scheduler_update_every_step
196
+ self.device = device if device is not None else torch.device(f'cuda:{local_rank}' if torch.cuda.is_available() else 'cpu')
197
+ self.console = Console()
198
+
199
+ model.to(self.device)
200
+ if self.world_size > 1:
201
+ model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
202
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])
203
+ self.model = model
204
+
205
+ # guide model
206
+ self.guidance = guidance
207
+
208
+ # text prompt
209
+ if self.guidance is not None:
210
+
211
+ for p in self.guidance.parameters():
212
+ p.requires_grad = False
213
+
214
+ self.prepare_text_embeddings()
215
+
216
+ else:
217
+ self.text_z = None
218
+
219
+ if isinstance(criterion, nn.Module):
220
+ criterion.to(self.device)
221
+ self.criterion = criterion
222
+
223
+ if optimizer is None:
224
+ self.optimizer = optim.Adam(self.model.parameters(), lr=0.001, weight_decay=5e-4) # naive adam
225
+ else:
226
+ self.optimizer = optimizer(self.model)
227
+
228
+ if lr_scheduler is None:
229
+ self.lr_scheduler = optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda=lambda epoch: 1) # fake scheduler
230
+ else:
231
+ self.lr_scheduler = lr_scheduler(self.optimizer)
232
+
233
+ if ema_decay is not None:
234
+ self.ema = ExponentialMovingAverage(self.model.parameters(), decay=ema_decay)
235
+ else:
236
+ self.ema = None
237
+
238
+ self.scaler = torch.cuda.amp.GradScaler(enabled=self.fp16)
239
+
240
+ # variable init
241
+ self.epoch = 0
242
+ self.global_step = 0
243
+ self.local_step = 0
244
+ self.stats = {
245
+ "loss": [],
246
+ "valid_loss": [],
247
+ "results": [], # metrics[0], or valid_loss
248
+ "checkpoints": [], # record path of saved ckpt, to automatically remove old ckpt
249
+ "best_result": None,
250
+ }
251
+
252
+ # auto fix
253
+ if len(metrics) == 0 or self.use_loss_as_metric:
254
+ self.best_mode = 'min'
255
+
256
+ # workspace prepare
257
+ self.log_ptr = None
258
+ if self.workspace is not None:
259
+ os.makedirs(self.workspace, exist_ok=True)
260
+ self.log_path = os.path.join(workspace, f"log_{self.name}.txt")
261
+ self.log_ptr = open(self.log_path, "a+")
262
+
263
+ self.ckpt_path = os.path.join(self.workspace, 'checkpoints')
264
+ self.best_path = f"{self.ckpt_path}/{self.name}.pth"
265
+ os.makedirs(self.ckpt_path, exist_ok=True)
266
+
267
+ self.log(f'[INFO] Trainer: {self.name} | {self.time_stamp} | {self.device} | {"fp16" if self.fp16 else "fp32"} | {self.workspace}')
268
+ self.log(f'[INFO] #parameters: {sum([p.numel() for p in model.parameters() if p.requires_grad])}')
269
+
270
+ if self.workspace is not None:
271
+ if self.use_checkpoint == "scratch":
272
+ self.log("[INFO] Training from scratch ...")
273
+ elif self.use_checkpoint == "latest":
274
+ self.log("[INFO] Loading latest checkpoint ...")
275
+ self.load_checkpoint()
276
+ elif self.use_checkpoint == "latest_model":
277
+ self.log("[INFO] Loading latest checkpoint (model only)...")
278
+ self.load_checkpoint(model_only=True)
279
+ elif self.use_checkpoint == "best":
280
+ if os.path.exists(self.best_path):
281
+ self.log("[INFO] Loading best checkpoint ...")
282
+ self.load_checkpoint(self.best_path)
283
+ else:
284
+ self.log(f"[INFO] {self.best_path} not found, loading latest ...")
285
+ self.load_checkpoint()
286
+ else: # path to ckpt
287
+ self.log(f"[INFO] Loading {self.use_checkpoint} ...")
288
+ self.load_checkpoint(self.use_checkpoint)
289
+
290
+ # calculate the text embs.
291
+ def prepare_text_embeddings(self):
292
+
293
+ if self.opt.text is None:
294
+ self.log(f"[WARN] text prompt is not provided.")
295
+ self.text_z = None
296
+ return
297
+
298
+ if not self.opt.dir_text:
299
+ self.text_z = self.guidance.get_text_embeds([self.opt.text])
300
+ else:
301
+ self.text_z = []
302
+ for d in ['front', 'side', 'back', 'side', 'overhead', 'bottom']:
303
+ text = f"{self.opt.text}, {d} view"
304
+ text_z = self.guidance.get_text_embeds([text])
305
+ self.text_z.append(text_z)
306
+
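+ # Editorial sketch of the prompt augmentation above (the default prompt here
+ # is hypothetical): with dir_text enabled, each sampled camera direction
+ # later indexes one of these six prompts in train_step; 'side' appears twice
+ # because indices 1 and 3 both map to side views.
+ def _demo_dir_prompts(text='a DSLR photo of a hamburger'):
+     return [f"{text}, {d} view"
+             for d in ['front', 'side', 'back', 'side', 'overhead', 'bottom']]
+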
307
+ def __del__(self):
308
+ if self.log_ptr:
309
+ self.log_ptr.close()
310
+
311
+
312
+ def log(self, *args, **kwargs):
313
+ if self.local_rank == 0:
314
+ if not self.mute:
315
+ #print(*args)
316
+ self.console.print(*args, **kwargs)
317
+ if self.log_ptr:
318
+ print(*args, file=self.log_ptr)
319
+ self.log_ptr.flush() # write immediately to file
320
+
321
+ ### ------------------------------
322
+
323
+ def train_step(self, data):
324
+
325
+ rays_o = data['rays_o'] # [B, N, 3]
326
+ rays_d = data['rays_d'] # [B, N, 3]
327
+
328
+ B, N = rays_o.shape[:2]
329
+ H, W = data['H'], data['W']
330
+
331
+ # TODO: shading is not working right now...
332
+ if self.global_step < self.opt.albedo_iters:
333
+ shading = 'albedo'
334
+ ambient_ratio = 1.0
335
+ else:
336
+ rand = random.random()
337
+ if rand > 0.8:
338
+ shading = 'albedo'
339
+ ambient_ratio = 1.0
340
+ # elif rand > 0.4:
341
+ # shading = 'textureless'
342
+ # ambient_ratio = 0.1
343
+ else:
344
+ shading = 'lambertian'
345
+ ambient_ratio = 0.1
346
+
347
+ # _t = time.time()
348
+ bg_color = torch.rand((B * N, 3), device=rays_o.device) # pixel-wise random
349
+ outputs = self.model.render(rays_o, rays_d, staged=False, perturb=True, bg_color=bg_color, ambient_ratio=ambient_ratio, shading=shading, force_all_rays=True, **vars(self.opt))
350
+ pred_rgb = outputs['image'].reshape(B, H, W, 3).permute(0, 3, 1, 2).contiguous() # [1, 3, H, W]
351
+ # torch.cuda.synchronize(); print(f'[TIME] nerf render {time.time() - _t:.4f}s')
352
+
353
+ # print(shading)
354
+ # torch_vis_2d(pred_rgb[0])
355
+
356
+ # text embeddings
357
+ if self.opt.dir_text:
358
+ dirs = data['dir'] # [B,]
359
+ text_z = self.text_z[dirs]
360
+ else:
361
+ text_z = self.text_z
362
+
363
+ # encode pred_rgb to latents
364
+ # _t = time.time()
365
+ loss = self.guidance.train_step(text_z, pred_rgb)
366
+ # torch.cuda.synchronize(); print(f'[TIME] total guiding {time.time() - _t:.4f}s')
367
+
368
+ # occupancy loss
369
+ pred_ws = outputs['weights_sum'].reshape(B, 1, H, W)
370
+
371
+ if self.opt.lambda_opacity > 0:
372
+ loss_opacity = (pred_ws ** 2).mean()
373
+ loss = loss + self.opt.lambda_opacity * loss_opacity
374
+
375
+ if self.opt.lambda_entropy > 0:
376
+ alphas = (pred_ws).clamp(1e-5, 1 - 1e-5)
377
+ # alphas = alphas ** 2 # skewed entropy, favors 0 over 1
378
+ loss_entropy = (- alphas * torch.log2(alphas) - (1 - alphas) * torch.log2(1 - alphas)).mean()
379
+
380
+ loss = loss + self.opt.lambda_entropy * loss_entropy
381
+
382
+ if self.opt.lambda_orient > 0 and 'loss_orient' in outputs:
383
+ loss_orient = outputs['loss_orient']
384
+ loss = loss + self.opt.lambda_orient * loss_orient
385
+
386
+ return pred_rgb, pred_ws, loss
387
+
388
+ def eval_step(self, data):
389
+
390
+ rays_o = data['rays_o'] # [B, N, 3]
391
+ rays_d = data['rays_d'] # [B, N, 3]
392
+
393
+ B, N = rays_o.shape[:2]
394
+ H, W = data['H'], data['W']
395
+
396
+ shading = data['shading'] if 'shading' in data else 'albedo'
397
+ ambient_ratio = data['ambient_ratio'] if 'ambient_ratio' in data else 1.0
398
+ light_d = data['light_d'] if 'light_d' in data else None
399
+
400
+ outputs = self.model.render(rays_o, rays_d, staged=True, perturb=False, bg_color=None, light_d=light_d, ambient_ratio=ambient_ratio, shading=shading, force_all_rays=True, **vars(self.opt))
401
+ pred_rgb = outputs['image'].reshape(B, H, W, 3)
402
+ pred_depth = outputs['depth'].reshape(B, H, W)
403
+ pred_ws = outputs['weights_sum'].reshape(B, H, W)
404
+ # mask_ws = outputs['mask'].reshape(B, H, W) # near < far
405
+
406
+ # loss_ws = pred_ws.sum() / mask_ws.sum()
407
+ # loss_ws = pred_ws.mean()
408
+
409
+ alphas = (pred_ws).clamp(1e-5, 1 - 1e-5)
410
+ # alphas = alphas ** 2 # skewed entropy, favors 0 over 1
411
+ loss_entropy = (- alphas * torch.log2(alphas) - (1 - alphas) * torch.log2(1 - alphas)).mean()
412
+
413
+ loss = self.opt.lambda_entropy * loss_entropy
414
+
415
+ return pred_rgb, pred_depth, loss
416
+
417
+ def test_step(self, data, bg_color=None, perturb=False):
418
+ rays_o = data['rays_o'] # [B, N, 3]
419
+ rays_d = data['rays_d'] # [B, N, 3]
420
+
421
+ B, N = rays_o.shape[:2]
422
+ H, W = data['H'], data['W']
423
+
424
+ if bg_color is not None:
425
+ bg_color = bg_color.to(rays_o.device)
426
+ else:
427
+ bg_color = torch.ones(3, device=rays_o.device) # [3]
428
+
429
+ shading = data['shading'] if 'shading' in data else 'albedo'
430
+ ambient_ratio = data['ambient_ratio'] if 'ambient_ratio' in data else 1.0
431
+ light_d = data['light_d'] if 'light_d' in data else None
432
+
433
+ outputs = self.model.render(rays_o, rays_d, staged=True, perturb=perturb, light_d=light_d, ambient_ratio=ambient_ratio, shading=shading, force_all_rays=True, bg_color=bg_color, **vars(self.opt))
434
+
435
+ pred_rgb = outputs['image'].reshape(B, H, W, 3)
436
+ pred_depth = outputs['depth'].reshape(B, H, W)
437
+
438
+ return pred_rgb, pred_depth
439
+
440
+
441
+ def save_mesh(self, save_path=None, resolution=128):
442
+
443
+ if save_path is None:
444
+ save_path = os.path.join(self.workspace, 'mesh')
445
+
446
+ self.log(f"==> Saving mesh to {save_path}")
447
+
448
+ os.makedirs(save_path, exist_ok=True)
449
+
450
+ self.model.export_mesh(save_path, resolution=resolution)
451
+
452
+ self.log(f"==> Finished saving mesh.")
453
+
454
+ ### ------------------------------
455
+
456
+ def train(self, train_loader, valid_loader, max_epochs):
457
+
458
+ assert self.text_z is not None, 'Training must provide a text prompt!'
459
+
460
+ if self.use_tensorboardX and self.local_rank == 0:
461
+ self.writer = tensorboardX.SummaryWriter(os.path.join(self.workspace, "run", self.name))
462
+
463
+ start_t = time.time()
464
+
465
+ for epoch in range(self.epoch + 1, max_epochs + 1):
466
+ self.epoch = epoch
467
+
468
+ self.train_one_epoch(train_loader)
469
+
470
+ if self.workspace is not None and self.local_rank == 0:
471
+ self.save_checkpoint(full=True, best=False)
472
+
473
+ if self.epoch % self.eval_interval == 0:
474
+ self.evaluate_one_epoch(valid_loader)
475
+ self.save_checkpoint(full=False, best=True)
476
+
477
+ end_t = time.time()
478
+
479
+ self.log(f"[INFO] training takes {(end_t - start_t)/ 60:.4f} minutes.")
480
+
481
+ if self.use_tensorboardX and self.local_rank == 0:
482
+ self.writer.close()
483
+
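+ # Editorial construction sketch (hypothetical stand-ins: guidance=None skips
+ # text-embedding preparation and use_checkpoint='scratch' skips checkpoint
+ # loading, so this only exercises __init__, logging and workspace creation):
+ def _demo_trainer_init():
+     import argparse
+     import torch.nn as nn
+     opt = argparse.Namespace()   # stand-in for the real config object
+     net = nn.Linear(4, 4)        # stand-in for the NeRF network
+     return Trainer('demo', opt, net, guidance=None,
+                    workspace='workspace_demo', use_checkpoint='scratch')
+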
484
+ def evaluate(self, loader, name=None):
485
+ self.use_tensorboardX, use_tensorboardX = False, self.use_tensorboardX
486
+ self.evaluate_one_epoch(loader, name)
487
+ self.use_tensorboardX = use_tensorboardX
488
+
489
+ def test(self, loader, save_path=None, name=None, write_video=True):
490
+
491
+ if save_path is None:
492
+ save_path = os.path.join(self.workspace, 'results')
493
+
494
+ if name is None:
495
+ name = f'{self.name}_ep{self.epoch:04d}'
496
+
497
+ os.makedirs(save_path, exist_ok=True)
498
+
499
+ self.log(f"==> Start Test, save results to {save_path}")
500
+
501
+ pbar = tqdm.tqdm(total=len(loader) * loader.batch_size, bar_format='{percentage:3.0f}% {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]')
502
+ self.model.eval()
503
+
504
+ if write_video:
505
+ all_preds = []
506
+ all_preds_depth = []
507
+
508
+ with torch.no_grad():
509
+
510
+ for i, data in enumerate(loader):
511
+
512
+ with torch.cuda.amp.autocast(enabled=self.fp16):
513
+ preds, preds_depth = self.test_step(data)
514
+
515
+ pred = preds[0].detach().cpu().numpy()
516
+ pred = (pred * 255).astype(np.uint8)
517
+
518
+ pred_depth = preds_depth[0].detach().cpu().numpy()
519
+ pred_depth = (pred_depth * 255).astype(np.uint8)
520
+
521
+ if write_video:
522
+ all_preds.append(pred)
523
+ all_preds_depth.append(pred_depth)
524
+ else:
525
+ cv2.imwrite(os.path.join(save_path, f'{name}_{i:04d}_rgb.png'), cv2.cvtColor(pred, cv2.COLOR_RGB2BGR))
526
+ cv2.imwrite(os.path.join(save_path, f'{name}_{i:04d}_depth.png'), pred_depth)
527
+
528
+ pbar.update(loader.batch_size)
529
+
530
+ if write_video:
531
+ all_preds = np.stack(all_preds, axis=0)
532
+ all_preds_depth = np.stack(all_preds_depth, axis=0)
533
+
534
+ imageio.mimwrite(os.path.join(save_path, f'{name}_rgb.mp4'), all_preds, fps=25, quality=8, macro_block_size=1)
535
+ imageio.mimwrite(os.path.join(save_path, f'{name}_depth.mp4'), all_preds_depth, fps=25, quality=8, macro_block_size=1)
536
+
537
+ self.log(f"==> Finished Test.")
538
+
539
+ # [GUI] train text step.
540
+ def train_gui(self, train_loader, step=16):
541
+
542
+ self.model.train()
543
+
544
+ total_loss = torch.tensor([0], dtype=torch.float32, device=self.device)
545
+
546
+ loader = iter(train_loader)
547
+
548
+ for _ in range(step):
549
+
550
+ # mimic an infinite loop dataloader (in case the total dataset is smaller than step)
551
+ try:
552
+ data = next(loader)
553
+ except StopIteration:
554
+ loader = iter(train_loader)
555
+ data = next(loader)
556
+
557
+ # update grid every 16 steps
558
+ if self.model.cuda_ray and self.global_step % self.opt.update_extra_interval == 0:
559
+ with torch.cuda.amp.autocast(enabled=self.fp16):
560
+ self.model.update_extra_state()
561
+
562
+ self.global_step += 1
563
+
564
+ self.optimizer.zero_grad()
565
+
566
+ with torch.cuda.amp.autocast(enabled=self.fp16):
567
+ pred_rgbs, pred_ws, loss = self.train_step(data)
568
+
569
+ self.scaler.scale(loss).backward()
570
+ self.scaler.step(self.optimizer)
571
+ self.scaler.update()
572
+
573
+ if self.scheduler_update_every_step:
574
+ self.lr_scheduler.step()
575
+
576
+ total_loss += loss.detach()
577
+
578
+ if self.ema is not None:
579
+ self.ema.update()
580
+
581
+ average_loss = total_loss.item() / step
582
+
583
+ if not self.scheduler_update_every_step:
584
+ if isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
585
+ self.lr_scheduler.step(average_loss)
586
+ else:
587
+ self.lr_scheduler.step()
588
+
589
+ outputs = {
590
+ 'loss': average_loss,
591
+ 'lr': self.optimizer.param_groups[0]['lr'],
592
+ }
593
+
594
+ return outputs
595
+
596
+
597
+ # [GUI] test on a single image
598
+ def test_gui(self, pose, intrinsics, W, H, bg_color=None, spp=1, downscale=1, light_d=None, ambient_ratio=1.0, shading='albedo'):
599
+
600
+ # render resolution (may need to downscale for a better frame rate)
601
+ rH = int(H * downscale)
602
+ rW = int(W * downscale)
603
+ intrinsics = intrinsics * downscale
604
+
605
+ pose = torch.from_numpy(pose).unsqueeze(0).to(self.device)
606
+
607
+ rays = get_rays(pose, intrinsics, rH, rW, -1)
608
+
609
+ # from degree theta/phi to 3D normalized vec
610
+ light_d = np.deg2rad(light_d)
611
+ light_d = np.array([
612
+ np.sin(light_d[0]) * np.sin(light_d[1]),
613
+ np.cos(light_d[0]),
614
+ np.sin(light_d[0]) * np.cos(light_d[1]),
615
+ ], dtype=np.float32)
616
+ light_d = torch.from_numpy(light_d).to(self.device)
617
+
618
+ data = {
619
+ 'rays_o': rays['rays_o'],
620
+ 'rays_d': rays['rays_d'],
621
+ 'H': rH,
622
+ 'W': rW,
623
+ 'light_d': light_d,
624
+ 'ambient_ratio': ambient_ratio,
625
+ 'shading': shading,
626
+ }
627
+
628
+ self.model.eval()
629
+
630
+ if self.ema is not None:
631
+ self.ema.store()
632
+ self.ema.copy_to()
633
+
634
+ with torch.no_grad():
635
+ with torch.cuda.amp.autocast(enabled=self.fp16):
636
+ # here spp is used as perturb random seed!
637
+ preds, preds_depth = self.test_step(data, bg_color=bg_color, perturb=spp)
638
+
639
+ if self.ema is not None:
640
+ self.ema.restore()
641
+
642
+ # interpolation to the original resolution
643
+ if downscale != 1:
644
+ # have to permute twice with torch...
645
+ preds = F.interpolate(preds.permute(0, 3, 1, 2), size=(H, W), mode='nearest').permute(0, 2, 3, 1).contiguous()
646
+ preds_depth = F.interpolate(preds_depth.unsqueeze(1), size=(H, W), mode='nearest').squeeze(1)
647
+
648
+ outputs = {
649
+ 'image': preds[0].detach().cpu().numpy(),
650
+ 'depth': preds_depth[0].detach().cpu().numpy(),
651
+ }
652
+
653
+ return outputs
654
+
655
+ def train_one_epoch(self, loader):
656
+ self.log(f"==> Start Training {self.workspace} Epoch {self.epoch}, lr={self.optimizer.param_groups[0]['lr']:.6f} ...")
657
+
658
+ total_loss = 0
659
+ if self.local_rank == 0 and self.report_metric_at_train:
660
+ for metric in self.metrics:
661
+ metric.clear()
662
+
663
+ self.model.train()
664
+
665
+ # distributedSampler: must call set_epoch() to shuffle indices across multiple epochs
666
+ # ref: https://pytorch.org/docs/stable/data.html
667
+ if self.world_size > 1:
668
+ loader.sampler.set_epoch(self.epoch)
669
+
670
+ if self.local_rank == 0:
671
+ pbar = tqdm.tqdm(total=len(loader) * loader.batch_size, bar_format='{desc}: {percentage:3.0f}% {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]')
672
+
673
+ self.local_step = 0
674
+
675
+ for data in loader:
676
+
677
+ # update grid every 16 steps
678
+ if self.model.cuda_ray and self.global_step % self.opt.update_extra_interval == 0:
679
+ with torch.cuda.amp.autocast(enabled=self.fp16):
680
+ self.model.update_extra_state()
681
+
682
+ self.local_step += 1
683
+ self.global_step += 1
684
+
685
+ self.optimizer.zero_grad()
686
+
687
+ with torch.cuda.amp.autocast(enabled=self.fp16):
688
+ pred_rgbs, pred_ws, loss = self.train_step(data)
689
+
690
+ self.scaler.scale(loss).backward()
691
+ self.scaler.step(self.optimizer)
692
+ self.scaler.update()
693
+
694
+ if self.scheduler_update_every_step:
695
+ self.lr_scheduler.step()
696
+
697
+ loss_val = loss.item()
698
+ total_loss += loss_val
699
+
700
+ if self.local_rank == 0:
701
+ # if self.report_metric_at_train:
702
+ # for metric in self.metrics:
703
+ # metric.update(preds, truths)
704
+
705
+ if self.use_tensorboardX:
706
+ self.writer.add_scalar("train/loss", loss_val, self.global_step)
707
+ self.writer.add_scalar("train/lr", self.optimizer.param_groups[0]['lr'], self.global_step)
708
+
709
+ if self.scheduler_update_every_step:
710
+ pbar.set_description(f"loss={loss_val:.4f} ({total_loss/self.local_step:.4f}), lr={self.optimizer.param_groups[0]['lr']:.6f}")
711
+ else:
712
+ pbar.set_description(f"loss={loss_val:.4f} ({total_loss/self.local_step:.4f})")
713
+ pbar.update(loader.batch_size)
714
+
715
+ if self.ema is not None:
716
+ self.ema.update()
717
+
718
+ average_loss = total_loss / self.local_step
719
+ self.stats["loss"].append(average_loss)
720
+
721
+ if self.local_rank == 0:
722
+ pbar.close()
723
+ if self.report_metric_at_train:
724
+ for metric in self.metrics:
725
+ self.log(metric.report(), style="red")
726
+ if self.use_tensorboardX:
727
+ metric.write(self.writer, self.epoch, prefix="train")
728
+ metric.clear()
729
+
730
+ if not self.scheduler_update_every_step:
731
+ if isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
732
+ self.lr_scheduler.step(average_loss)
733
+ else:
734
+ self.lr_scheduler.step()
735
+
736
+ self.log(f"==> Finished Epoch {self.epoch}.")
737
+
738
+
739
+ def evaluate_one_epoch(self, loader, name=None):
740
+ self.log(f"++> Evaluate {self.workspace} at epoch {self.epoch} ...")
741
+
742
+ if name is None:
743
+ name = f'{self.name}_ep{self.epoch:04d}'
744
+
745
+ total_loss = 0
746
+ if self.local_rank == 0:
747
+ for metric in self.metrics:
748
+ metric.clear()
749
+
750
+ self.model.eval()
751
+
752
+ if self.ema is not None:
753
+ self.ema.store()
754
+ self.ema.copy_to()
755
+
756
+ if self.local_rank == 0:
757
+ pbar = tqdm.tqdm(total=len(loader) * loader.batch_size, bar_format='{desc}: {percentage:3.0f}% {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]')
758
+
759
+ with torch.no_grad():
760
+ self.local_step = 0
761
+
762
+ for data in loader:
763
+ self.local_step += 1
764
+
765
+ with torch.cuda.amp.autocast(enabled=self.fp16):
766
+ preds, preds_depth, loss = self.eval_step(data)
767
+
768
+ # all_gather/reduce the statistics (NCCL only supports all_*)
769
+ if self.world_size > 1:
770
+ dist.all_reduce(loss, op=dist.ReduceOp.SUM)
771
+ loss = loss / self.world_size
772
+
773
+ preds_list = [torch.zeros_like(preds).to(self.device) for _ in range(self.world_size)] # [[B, ...], [B, ...], ...]
774
+ dist.all_gather(preds_list, preds)
775
+ preds = torch.cat(preds_list, dim=0)
776
+
777
+ preds_depth_list = [torch.zeros_like(preds_depth).to(self.device) for _ in range(self.world_size)] # [[B, ...], [B, ...], ...]
778
+ dist.all_gather(preds_depth_list, preds_depth)
779
+ preds_depth = torch.cat(preds_depth_list, dim=0)
780
+
781
+ loss_val = loss.item()
782
+ total_loss += loss_val
783
+
784
+ # only rank = 0 will perform evaluation.
785
+ if self.local_rank == 0:
786
+
787
+ # save image
788
+ save_path = os.path.join(self.workspace, 'validation', f'{name}_{self.local_step:04d}_rgb.png')
789
+ save_path_depth = os.path.join(self.workspace, 'validation', f'{name}_{self.local_step:04d}_depth.png')
790
+
791
+ #self.log(f"==> Saving validation image to {save_path}")
792
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
793
+
794
+ pred = preds[0].detach().cpu().numpy()
795
+ pred = (pred * 255).astype(np.uint8)
796
+
797
+ pred_depth = preds_depth[0].detach().cpu().numpy()
798
+ pred_depth = (pred_depth * 255).astype(np.uint8)
799
+
800
+ cv2.imwrite(save_path, cv2.cvtColor(pred, cv2.COLOR_RGB2BGR))
801
+ cv2.imwrite(save_path_depth, pred_depth)
802
+
803
+ pbar.set_description(f"loss={loss_val:.4f} ({total_loss/self.local_step:.4f})")
804
+ pbar.update(loader.batch_size)
805
+
806
+
807
+ average_loss = total_loss / self.local_step
808
+ self.stats["valid_loss"].append(average_loss)
809
+
810
+ if self.local_rank == 0:
811
+ pbar.close()
812
+ if not self.use_loss_as_metric and len(self.metrics) > 0:
813
+ result = self.metrics[0].measure()
814
+ self.stats["results"].append(result if self.best_mode == 'min' else - result) # if max mode, use -result
815
+ else:
816
+ self.stats["results"].append(average_loss) # if no metric, choose best by min loss
817
+
818
+ for metric in self.metrics:
819
+ self.log(metric.report(), style="blue")
820
+ if self.use_tensorboardX:
821
+ metric.write(self.writer, self.epoch, prefix="evaluate")
822
+ metric.clear()
823
+
824
+ if self.ema is not None:
825
+ self.ema.restore()
826
+
827
+ self.log(f"++> Evaluate epoch {self.epoch} Finished.")
828
+
829
+ def save_checkpoint(self, name=None, full=False, best=False):
830
+
831
+ if name is None:
832
+ name = f'{self.name}_ep{self.epoch:04d}'
833
+
834
+ state = {
835
+ 'epoch': self.epoch,
836
+ 'global_step': self.global_step,
837
+ 'stats': self.stats,
838
+ }
839
+
840
+ if self.model.cuda_ray:
841
+ state['mean_count'] = self.model.mean_count
842
+ state['mean_density'] = self.model.mean_density
843
+
844
+ if full:
845
+ state['optimizer'] = self.optimizer.state_dict()
846
+ state['lr_scheduler'] = self.lr_scheduler.state_dict()
847
+ state['scaler'] = self.scaler.state_dict()
848
+ if self.ema is not None:
849
+ state['ema'] = self.ema.state_dict()
850
+
851
+ if not best:
852
+
853
+ state['model'] = self.model.state_dict()
854
+
855
+ file_path = f"{name}.pth"
856
+
857
+ self.stats["checkpoints"].append(file_path)
858
+
859
+ if len(self.stats["checkpoints"]) > self.max_keep_ckpt:
860
+ old_ckpt = os.path.join(self.ckpt_path, self.stats["checkpoints"].pop(0))
861
+ if os.path.exists(old_ckpt):
862
+ os.remove(old_ckpt)
863
+
864
+ torch.save(state, os.path.join(self.ckpt_path, file_path))
865
+
866
+ else:
867
+ if len(self.stats["results"]) > 0:
868
+ if self.stats["best_result"] is None or self.stats["results"][-1] < self.stats["best_result"]:
869
+ self.log(f"[INFO] New best result: {self.stats['best_result']} --> {self.stats['results'][-1]}")
870
+ self.stats["best_result"] = self.stats["results"][-1]
871
+
872
+ # save ema results
873
+ if self.ema is not None:
874
+ self.ema.store()
875
+ self.ema.copy_to()
876
+
877
+ state['model'] = self.model.state_dict()
878
+
879
+ if self.ema is not None:
880
+ self.ema.restore()
881
+
882
+ torch.save(state, self.best_path)
883
+ else:
884
+ self.log(f"[WARN] no evaluated results found, skip saving best checkpoint.")
885
+
886
+ def load_checkpoint(self, checkpoint=None, model_only=False):
887
+ if checkpoint is None:
888
+ checkpoint_list = sorted(glob.glob(f'{self.ckpt_path}/*.pth'))
889
+ if checkpoint_list:
890
+ checkpoint = checkpoint_list[-1]
891
+ self.log(f"[INFO] Latest checkpoint is {checkpoint}")
892
+ else:
893
+ self.log("[WARN] No checkpoint found, model randomly initialized.")
894
+ return
895
+
896
+ checkpoint_dict = torch.load(checkpoint, map_location=self.device)
897
+
898
+ if 'model' not in checkpoint_dict:
899
+ self.model.load_state_dict(checkpoint_dict)
900
+ self.log("[INFO] loaded model.")
901
+ return
902
+
903
+ missing_keys, unexpected_keys = self.model.load_state_dict(checkpoint_dict['model'], strict=False)
904
+ self.log("[INFO] loaded model.")
905
+ if len(missing_keys) > 0:
906
+ self.log(f"[WARN] missing keys: {missing_keys}")
907
+ if len(unexpected_keys) > 0:
908
+ self.log(f"[WARN] unexpected keys: {unexpected_keys}")
909
+
910
+ if self.ema is not None and 'ema' in checkpoint_dict:
911
+ try:
912
+ self.ema.load_state_dict(checkpoint_dict['ema'])
913
+ self.log("[INFO] loaded EMA.")
914
+ except:
915
+ self.log("[WARN] failed to load EMA.")
916
+
917
+ if self.model.cuda_ray:
918
+ if 'mean_count' in checkpoint_dict:
919
+ self.model.mean_count = checkpoint_dict['mean_count']
920
+ if 'mean_density' in checkpoint_dict:
921
+ self.model.mean_density = checkpoint_dict['mean_density']
922
+
923
+ if model_only:
924
+ return
925
+
926
+ self.stats = checkpoint_dict['stats']
927
+ self.epoch = checkpoint_dict['epoch']
928
+ self.global_step = checkpoint_dict['global_step']
929
+ self.log(f"[INFO] load at epoch {self.epoch}, global step {self.global_step}")
930
+
931
+ if self.optimizer and 'optimizer' in checkpoint_dict:
932
+ try:
933
+ self.optimizer.load_state_dict(checkpoint_dict['optimizer'])
934
+ self.log("[INFO] loaded optimizer.")
935
+ except:
936
+ self.log("[WARN] Failed to load optimizer.")
937
+
938
+ if self.lr_scheduler and 'lr_scheduler' in checkpoint_dict:
939
+ try:
940
+ self.lr_scheduler.load_state_dict(checkpoint_dict['lr_scheduler'])
941
+ self.log("[INFO] loaded scheduler.")
942
+ except:
943
+ self.log("[WARN] Failed to load scheduler.")
944
+
945
+ if self.scaler and 'scaler' in checkpoint_dict:
946
+ try:
947
+ self.scaler.load_state_dict(checkpoint_dict['scaler'])
948
+ self.log("[INFO] loaded scaler.")
949
+ except:
950
+ self.log("[WARN] Failed to load scaler.")
optimizer.py ADDED
@@ -0,0 +1,470 @@
1
+ import numpy as np
2
+ import torch
3
+ import enum
4
+ import itertools
5
+ from dataclasses import dataclass
6
+ import torch.optim as optim
7
+
8
+ @torch.no_grad()
9
+ def PowerIter(mat_g, error_tolerance=1e-6, num_iters=100):
10
+ """Power iteration.
11
+ Compute the maximum eigenvalue of mat, for scaling.
12
+ v is a random vector with values in (-1, 1)
13
+ Args:
14
+ mat_g: the symmetric PSD matrix.
15
+ error_tolerance: Iterative exit condition.
16
+ num_iters: Number of iterations.
17
+ Returns:
18
+ eigen vector, eigen value, num_iters
19
+ """
20
+ v = torch.rand(list(mat_g.shape)[0], device=mat_g.get_device()) * 2 - 1
21
+ error = 1
22
+ iters = 0
23
+ singular_val = 0
24
+ while error > error_tolerance and iters < num_iters:
25
+ v = v / torch.norm(v)
26
+ mat_v = torch.mv(mat_g, v)
27
+ s_v = torch.dot(v, mat_v)
28
+ error = torch.abs(s_v - singular_val)
29
+ v = mat_v
30
+ singular_val = s_v
31
+ iters += 1
32
+ return singular_val, v / torch.norm(v), iters
33
+
34
+
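+ # Editorial sanity check (CUDA-only, since these helpers allocate on
+ # `mat_g.get_device()`): the returned eigenvalue should match the largest
+ # eigenvalue reported by torch.linalg.eigvalsh on a small symmetric PSD matrix.
+ def _demo_power_iter():
+     import torch
+     if not torch.cuda.is_available():
+         return
+     a = torch.randn(8, 8, device='cuda')
+     mat_g = a @ a.t()  # symmetric PSD
+     max_ev, _, _ = PowerIter(mat_g)
+     assert torch.allclose(max_ev, torch.linalg.eigvalsh(mat_g).max(), rtol=1e-3)
+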
35
+ @torch.no_grad()
36
+ def MatPower(mat_m, p):
37
+ """Computes mat_m^p, for p a positive integer.
38
+ Args:
39
+ mat_m: a square matrix
40
+ p: a positive integer
41
+ Returns:
42
+ mat_m^p
43
+ """
44
+ if p in [1, 2, 4, 8, 16, 32]:
45
+ p_done = 1
46
+ res = mat_m
47
+ while p_done < p:
48
+ res = torch.matmul(res, res)
49
+ p_done *= 2
50
+ return res
51
+
52
+ power = None
53
+ while p > 0:
54
+ if p % 2 == 1:
55
+ power = torch.matmul(mat_m, power) if power is not None else mat_m
56
+ p //= 2
57
+ mat_m = torch.matmul(mat_m, mat_m)
58
+ return power
59
+
60
+
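+ # Editorial sanity check (pure torch, runs on CPU): both the repeated-squaring
+ # fast path (p a power of two) and the generic binary-exponentiation path
+ # should agree with torch.linalg.matrix_power.
+ def _demo_mat_power():
+     import torch
+     m = torch.randn(5, 5, dtype=torch.float64)
+     for p in (4, 7):  # fast path, then generic path
+         assert torch.allclose(MatPower(m, p), torch.linalg.matrix_power(m, p))
+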
61
+ @torch.no_grad()
62
+ def ComputePower(mat_g, p,
63
+ iter_count=100,
64
+ error_tolerance=1e-6,
65
+ ridge_epsilon=1e-6):
66
+ """A method to compute G^{-1/p} using a coupled Newton iteration.
67
+ See for example equation 3.2 on page 9 of:
68
+ A Schur-Newton Method for the Matrix p-th Root and its Inverse
69
+ by Chun-Hua Guo and Nicholas J. Higham
70
+ SIAM Journal on Matrix Analysis and Applications,
71
+ 2006, Vol. 28, No. 3 : pp. 788-804
72
+ https://pdfs.semanticscholar.org/0abe/7f77433cf5908bfe2b79aa91af881da83858.pdf
73
+ Args:
74
+ mat_g: A square positive semidefinite matrix
75
+ p: a positive integer
76
+ iter_count: Stop iterating after this many rounds.
77
+ error_tolerance: Threshold for stopping iteration
78
+ ridge_epsilon: We add this times I to G, to make is positive definite.
79
+ For scaling, we multiply it by the largest eigenvalue of G.
80
+ Returns:
81
+ (mat_g + rI)^{-1/p} (r = ridge_epsilon * max_eigenvalue of mat_g).
82
+ """
83
+ shape = list(mat_g.shape)
84
+ if len(shape) == 1:
85
+ return torch.pow(mat_g + ridge_epsilon, -1/p)
86
+ identity = torch.eye(shape[0], device=mat_g.get_device())
87
+ if shape[0] == 1:
88
+ return identity
89
+ alpha = -1.0/p
90
+ max_ev, _, _ = PowerIter(mat_g)
91
+ ridge_epsilon *= max_ev
92
+ mat_g += ridge_epsilon * identity
93
+ z = (1 + p) / (2 * torch.norm(mat_g))
94
+ # The best value for z is
95
+ # (1 + p) * (c_max^{1/p} - c_min^{1/p}) /
96
+ # (c_max^{1+1/p} - c_min^{1+1/p})
97
+ # where c_max and c_min are the largest and smallest singular values of
98
+ # mat_g.
99
+ # The above estimate assumes that c_max > c_min * 2^p
100
+ # Can replace above line by the one below, but it is less accurate,
101
+ # hence needs more iterations to converge.
102
+ # z = (1 + p) / tf.trace(mat_g)
103
+ # If we want the method to always converge, use z = 1 / norm(mat_g)
104
+ # or z = 1 / tf.trace(mat_g), but these can result in many
105
+ # extra iterations.
106
+
107
+ mat_root = identity * torch.pow(z, 1.0/p)
108
+ mat_m = mat_g * z
109
+ error = torch.max(torch.abs(mat_m - identity))
110
+ count = 0
111
+ while error > error_tolerance and count < iter_count:
112
+ tmp_mat_m = (1 - alpha) * identity + alpha * mat_m
113
+ new_mat_root = torch.matmul(mat_root, tmp_mat_m)
114
+ mat_m = torch.matmul(MatPower(tmp_mat_m, p), mat_m)
115
+ new_error = torch.max(torch.abs(mat_m - identity))
116
+ if new_error > error * 1.2:
117
+ break
118
+ mat_root = new_mat_root
119
+ error = new_error
120
+ count += 1
121
+ return mat_root
122
+
123
+
124
+
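+ # Editorial sanity check (CUDA-only, as above): ComputePower(G, p) should
+ # approximate (G + rI)^{-1/p}, so for p=2, root @ G @ root should be close to
+ # the identity on a well-conditioned PSD matrix. Note ComputePower adds the
+ # ridge in place, hence the clone.
+ def _demo_compute_power():
+     import torch
+     if not torch.cuda.is_available():
+         return
+     a = torch.randn(6, 6, device='cuda')
+     mat_g = a @ a.t() + torch.eye(6, device='cuda')  # well-conditioned PSD
+     root = ComputePower(mat_g.clone(), 2)  # ~ G^{-1/2}
+     assert torch.allclose(root @ mat_g @ root, torch.eye(6, device='cuda'), atol=1e-2)
+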
125
+ # Grafting is a technique to fix the layerwise scale of Shampoo optimizer.
126
+ # https://arxiv.org/pdf/2002.11803.pdf studies this in detail. This
127
+ # allows us to plugin the Shampoo optimizer into settings where SGD/AdaGrad
128
+ # is already well tuned. Grafting onto Shampoo means take the Shampoo direction,
129
+ # but use the step magnitude from the grafted optimizer such as Adagrad or SGD.
130
+ class LayerwiseGrafting(enum.IntEnum):
131
+ NONE = 0
132
+ SGD = 1
133
+ ADAGRAD = 2
134
+
135
+
136
+ @dataclass
137
+ class ShampooHyperParams:
138
+ """Shampoo hyper parameters."""
139
+ beta2: float = 0.9
140
+ diagonal_eps: float = 1e-6
141
+ matrix_eps: float = 1e-12
142
+ weight_decay: float = 0.0
143
+ inverse_exponent_override: int = 2 # fixed exponent for preconditioner, if >0
144
+ start_preconditioning_step: int = 1
145
+ # Performance tuning params for controlling memory and compute requirements.
146
+ # How often to compute preconditioner.
147
+ preconditioning_compute_steps: int = 1
148
+ # How often to compute statistics.
149
+ statistics_compute_steps: int = 1
150
+ # Block size for large layers (if > 0).
151
+ # Block size = 1 ==> Adagrad (Don't do this, extremely inefficient!)
152
+ # Block size should be as large as feasible under memory/time constraints.
153
+ block_size: int = 128
154
+ # Automatic shape interpretation (for eg: [4, 3, 1024, 512] would result in
155
+ # 12 x [1024, 512] L and R statistics. Disabled by default which results in
156
+ # Shampoo constructing statistics [4, 4], [3, 3], [1024, 1024], [512, 512].
157
+ best_effort_shape_interpretation: bool = True
158
+ # Type of grafting (SGD or AdaGrad).
159
+ # https://arxiv.org/pdf/2002.11803.pdf
160
+ graft_type: int = LayerwiseGrafting.ADAGRAD
161
+ # Nesterov momentum
162
+ nesterov: bool = True
163
+
164
+
165
+ class Graft:
166
+ """Base class to perform grafting onto Shampoo. This class does no grafting.
167
+ """
168
+
169
+ def __init__(self, hps, unused_var):
170
+ self.hps = hps
171
+
172
+ def add_statistics(self, grad):
173
+ pass
174
+
175
+ def precondition_gradient(self, grad):
176
+ return grad
177
+
178
+ def update_momentum(self, update, unused_beta1):
179
+ return update
180
+
181
+
182
+ class SGDGraft(Graft):
183
+ """Graft using SGD+momentum.
184
+ momentum maintains an exponentially weighted moving average of gradients.
185
+ """
186
+
187
+ def __init__(self, hps, var):
188
+ super(SGDGraft, self).__init__(hps, var)
189
+ self.momentum = torch.zeros_like(var.data, device=var.get_device())
190
+
191
+ def update_momentum(self, update, beta1):
192
+ self.momentum.mul_(beta1).add_(update)
193
+ return self.momentum
194
+
195
+
196
+ class AdagradGraft(SGDGraft):
197
+ """Graft using Adagrad.
198
+ Essentially an implementation of Adagrad with momentum.
199
+ """
200
+
201
+ def __init__(self, hps, var):
202
+ super(AdagradGraft, self).__init__(hps, var)
203
+ self.statistics = torch.zeros_like(var.data, device=var.get_device())
204
+
205
+ def add_statistics(self, grad):
206
+ self.statistics.add_(grad * grad)
207
+
208
+ def precondition_gradient(self, grad):
209
+ return grad / (torch.sqrt(self.statistics) + self.hps.diagonal_eps)
210
+
211
+
212
+ class BlockPartitioner:
213
+ """Partitions a tensor into smaller tensors for preconditioning.
214
+ For example, if a variable has shape (4096, 512), we might split the
215
+ 4096 into 4 blocks, so we effectively have 4 variables of size
216
+ (1024, 512) each.
217
+ """
218
+
219
+ def __init__(self, var, hps):
220
+ self._shape = var.shape
221
+ self._splits = []
222
+ self._split_sizes = []
223
+ split_sizes = []
224
+ # We split var into smaller blocks. Here we store the metadata to make
225
+ # that split.
226
+ for i, d in enumerate(var.shape):
227
+ if hps.block_size > 0 and d > hps.block_size:
228
+ # d-1, otherwise split appends a 0-size array.
229
+ nsplit = (d-1) // hps.block_size
230
+ indices = (np.arange(nsplit, dtype=np.int32) + 1) * hps.block_size
231
+ sizes = np.ones(nsplit + 1, dtype=np.int32) * hps.block_size
232
+ sizes[-1] = d - indices[-1]
233
+ self._splits.append((i, indices))
234
+ self._split_sizes.append((i, sizes))
235
+ split_sizes.append(sizes)
236
+ else:
237
+ split_sizes.append(np.array([d], dtype=np.int32))
238
+ self._num_splits = len(split_sizes)
239
+ self._preconditioner_shapes = []
240
+ for t in itertools.product(*split_sizes):
241
+ self._preconditioner_shapes.extend([[d, d] for d in t])
242
+
243
+ def shapes_for_preconditioners(self):
244
+ return self._preconditioner_shapes
245
+
246
+ def num_splits(self):
247
+ return self._num_splits
248
+
249
+ def partition(self, tensor):
250
+ """Partition tensor into blocks."""
251
+
252
+ assert tensor.shape == self._shape
253
+ tensors = [tensor]
254
+ for (i, sizes) in self._split_sizes:
255
+ tensors_local = []
256
+ for t in tensors:
257
+ tensors_local.extend(
258
+ torch.split(t, tuple(sizes), dim=i))
259
+ tensors = tensors_local
260
+ return tensors
261
+
262
+ def merge_partitions(self, partitions):
263
+ """Merge partitions back to original shape."""
264
+
265
+ for (i, indices) in reversed(self._splits):
266
+ n = len(indices) + 1
267
+ partial_merged_tensors = []
268
+ ind = 0
269
+ while ind < len(partitions):
270
+ partial_merged_tensors.append(
271
+ torch.cat(partitions[ind:ind + n], axis=i))
272
+ ind += n
273
+ partitions = partial_merged_tensors
274
+ assert len(partitions) == 1
275
+ return partitions[0]
276
+
277
+
278
+ def _merge_small_dims(shape_to_merge, max_dim):
279
+ """Merge small dimensions.
280
+ If there are some small dimensions, we collapse them:
281
+ e.g. [1, 2, 512, 1, 2048, 1, 3, 4] --> [1024, 2048, 12] if max_dim = 1024
282
+ [1, 2, 768, 1, 2048] --> [2, 768, 2048]
283
+ Args:
284
+ shape_to_merge: Shape to merge small dimensions.
285
+ max_dim: Maximal dimension of output shape used in merging.
286
+ Returns:
287
+ Merged shape.
288
+ """
289
+ resulting_shape = []
290
+ product = 1
291
+ for d in shape_to_merge:
292
+ if product * d <= max_dim:
293
+ product *= d
294
+ else:
295
+ if product > 1:
296
+ resulting_shape.append(product)
297
+ product = d
298
+ if product > 1:
299
+ resulting_shape.append(product)
300
+ return resulting_shape
301
+
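+ # Editorial check of the docstring examples above (pure Python, no torch):
+ def _demo_merge_small_dims():
+     assert _merge_small_dims([1, 2, 512, 1, 2048, 1, 3, 4], 1024) == [1024, 2048, 12]
+     assert _merge_small_dims([1, 2, 768, 1, 2048], 1024) == [2, 768, 2048]
+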
302
+
303
+ class Preconditioner:
304
+ """Compute statistics/shape from gradients for preconditioning."""
305
+
306
+ def __init__(self, var, hps):
307
+ self._hps = hps
308
+ self._original_shape = var.shape
309
+ self._transformed_shape = var.shape
310
+ if hps.best_effort_shape_interpretation:
311
+ self._transformed_shape = _merge_small_dims(
312
+ self._original_shape, hps.block_size)
313
+
314
+ reshaped_var = torch.reshape(var, self._transformed_shape)
315
+ self._partitioner = BlockPartitioner(reshaped_var, hps)
316
+ shapes = self._partitioner.shapes_for_preconditioners()
317
+ rank = len(self._transformed_shape)
318
+ device = var.get_device()
319
+ if rank <= 1:
320
+ self.statistics = []
321
+ self.preconditioners = []
322
+ else:
323
+ eps = self._hps.matrix_eps
324
+ self.statistics = [eps * torch.eye(s[0], device=device) for s in shapes]
325
+ self.preconditioners = [torch.eye(s[0], device=device) for s in shapes]
326
+
327
+ def add_statistics(self, grad):
328
+ """Compute statistics from gradients and add to the correct state entries.
329
+ Args:
330
+ grad: Gradient to compute statistics from.
331
+ """
332
+ if not self.statistics: return
333
+ reshaped_grad = torch.reshape(grad, self._transformed_shape)
334
+ partitioned_grads = self._partitioner.partition(reshaped_grad)
335
+ w1 = self._hps.beta2
336
+ w2 = 1.0 if w1 == 1.0 else (1.0 - w1)
337
+ rank = len(self._transformed_shape)
338
+ for j, grad in enumerate(partitioned_grads):
339
+ for i in range(rank):
340
+ axes = list(range(i)) + list(range(i + 1, rank))
341
+ stat = torch.tensordot(grad, grad, [axes, axes])
342
+ self.statistics[j*rank + i].mul_(w1).add_(stat, alpha=w2)
343
+
344
+ def exponent_for_preconditioner(self):
345
+ """Returns exponent to use for inverse-pth root M^{-1/p}."""
346
+ if self._hps.inverse_exponent_override > 0:
347
+ return self._hps.inverse_exponent_override
348
+ return 2 * len(self._transformed_shape)
349
+
350
+ def compute_preconditioners(self):
351
+ """Compute L^{-1/exp} for each stats matrix L."""
352
+ exp = self.exponent_for_preconditioner()
353
+ eps = self._hps.matrix_eps
354
+ for i, stat in enumerate(self.statistics):
355
+ self.preconditioners[i] = ComputePower(
356
+ stat, exp, ridge_epsilon=eps)
357
+
358
+ def preconditioned_grad(self, grad):
359
+ """Precondition the gradient.
360
+ Args:
361
+ grad: A gradient tensor to precondition.
362
+ Returns:
363
+ A preconditioned gradient.
364
+ """
365
+ if not self.preconditioners: return grad
366
+ reshaped_grad = torch.reshape(grad, self._transformed_shape)
367
+ partitioned_grads = self._partitioner.partition(reshaped_grad)
368
+ preconditioned_partitioned_grads = []
369
+ num_splits = self._partitioner.num_splits()
370
+ for i, grad in enumerate(partitioned_grads):
371
+ preconditioners_for_grad = self.preconditioners[i * num_splits:(i + 1) *
372
+ num_splits]
373
+ rank = len(grad.shape)
374
+ precond_grad = grad
375
+ for j in range(rank):
376
+ preconditioner = preconditioners_for_grad[j]
377
+ precond_grad = torch.tensordot(
378
+ precond_grad, preconditioner, [[0], [0]])
379
+ preconditioned_partitioned_grads.append(precond_grad)
380
+ merged_grad = self._partitioner.merge_partitions(
381
+ preconditioned_partitioned_grads)
382
+ return torch.reshape(merged_grad, self._original_shape)
383
+
384
+
385
+ STEP = 'step'
386
+ MOMENTUM = 'momentum'
387
+ PRECONDITIONER = 'preconditioner'
388
+ GRAFT = 'graft'
389
+
390
+
391
+ class Shampoo(optim.Optimizer):
392
+ """The Shampoo optimizer."""
393
+
394
+ def __init__(self,
395
+ params,
396
+ lr=1.0,
397
+ momentum=0.9,
398
+ hyperparams=ShampooHyperParams()):
399
+ defaults = dict(lr=lr, momentum=momentum)
400
+ self.hps = hyperparams
401
+ super(Shampoo, self).__init__(params, defaults)
402
+
403
+ def init_var_state(self, var, state):
404
+ """Initialize the PyTorch state for a single variable."""
405
+ state[STEP] = 0
406
+ state[MOMENTUM] = torch.zeros_like(var.data, device=var.get_device())
407
+ state[PRECONDITIONER] = Preconditioner(var, self.hps)
408
+ if self.hps.graft_type == LayerwiseGrafting.ADAGRAD:
409
+ state[GRAFT] = AdagradGraft(self.hps, var)
410
+ elif self.hps.graft_type == LayerwiseGrafting.SGD:
411
+ state[GRAFT] = SGDGraft(self.hps, var)
412
+ else:
413
+ state[GRAFT] = Graft(self.hps, var)
414
+
415
+ def step(self, closure=None):
416
+ hps = self.hps
417
+ for group in self.param_groups:
418
+ lr = group['lr']
419
+ for p in group['params']:
420
+ if p.grad is None: continue
421
+ grad = p.grad.data
422
+ if grad.is_sparse:
423
+ raise RuntimeError('Shampoo does not support sparse yet')
424
+ state = self.state[p]
425
+ if not state:
426
+ self.init_var_state(p, state)
427
+ state[STEP] += 1
428
+
429
+ preconditioner = state[PRECONDITIONER]
430
+ graft = state[GRAFT]
431
+
432
+ # Gather statistics, compute preconditioners
433
+ graft.add_statistics(grad)
434
+ if state[STEP] % hps.statistics_compute_steps == 0:
435
+ preconditioner.add_statistics(grad)
436
+ if state[STEP] % hps.preconditioning_compute_steps == 0:
437
+ preconditioner.compute_preconditioners()
438
+
439
+ # Precondition gradients
440
+ graft_grad = graft.precondition_gradient(grad)
441
+ shampoo_grad = grad
442
+ if state[STEP] >= self.hps.start_preconditioning_step:
443
+ shampoo_grad = preconditioner.preconditioned_grad(grad)
444
+
445
+ # Grafting
446
+ graft_norm = torch.norm(graft_grad)
447
+ shampoo_norm = torch.norm(shampoo_grad)
448
+ shampoo_grad.mul_(graft_norm / (shampoo_norm + 1e-16))
449
+
450
+ # Weight decay
451
+ if self.hps.weight_decay != 0.0:
452
+ shampoo_grad.add_(p.data, alpha=self.hps.weight_decay)
453
+ graft_grad.add_(p.data, alpha=self.hps.weight_decay)
454
+
455
+ # Momentum and Nesterov momentum, if needed
456
+ state[MOMENTUM].mul_(group['momentum']).add_(shampoo_grad)
457
+ graft_momentum = graft.update_momentum(grad, group['momentum'])
458
+
459
+ if state[STEP] >= self.hps.start_preconditioning_step:
460
+ momentum_update = state[MOMENTUM]
461
+ wd_update = shampoo_grad
462
+ else:
463
+ momentum_update = graft_momentum
464
+ wd_update = graft_grad
465
+
466
+ if hps.nesterov:
467
+ momentum_update.mul_(group['momentum']).add_(wd_update)
468
+
469
+ # Final update
470
+ p.data.add_(momentum_update, alpha=-lr)
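+
+ # Editorial usage sketch (CUDA-only, since the momentum/graft buffers are
+ # allocated on var.get_device()): Shampoo drops in like any torch optimizer;
+ # the tiny model and block size here are hypothetical.
+ def _demo_shampoo():
+     import torch
+     import torch.nn as nn
+     if not torch.cuda.is_available():
+         return
+     net = nn.Linear(16, 4).cuda()
+     optimizer = Shampoo(net.parameters(), lr=0.1,
+                         hyperparams=ShampooHyperParams(block_size=8))
+     loss = net(torch.randn(2, 16, device='cuda')).pow(2).mean()
+     loss.backward()
+     optimizer.step()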
raymarching/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .raymarching import *
raymarching/backend.py ADDED
@@ -0,0 +1,40 @@
1
+ import os
2
+ from torch.utils.cpp_extension import load
3
+
4
+ _src_path = os.path.dirname(os.path.abspath(__file__))
5
+
6
+ nvcc_flags = [
7
+ '-O3', '-std=c++14',
8
+ '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
9
+ ]
10
+
11
+ if os.name == "posix":
12
+ c_flags = ['-O3', '-std=c++14']
13
+ elif os.name == "nt":
14
+ c_flags = ['/O2', '/std:c++17']
15
+
16
+ # find cl.exe
17
+ def find_cl_path():
18
+ import glob
19
+ for edition in ["Enterprise", "Professional", "BuildTools", "Community"]:
20
+ paths = sorted(glob.glob(r"C:\\Program Files (x86)\\Microsoft Visual Studio\\*\\%s\\VC\\Tools\\MSVC\\*\\bin\\Hostx64\\x64" % edition), reverse=True)
21
+ if paths:
22
+ return paths[0]
23
+
24
+ # If cl.exe is not on path, try to find it.
25
+ if os.system("where cl.exe >nul 2>nul") != 0:
26
+ cl_path = find_cl_path()
27
+ if cl_path is None:
28
+ raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation")
29
+ os.environ["PATH"] += ";" + cl_path
30
+
31
+ _backend = load(name='_raymarching',
32
+ extra_cflags=c_flags,
33
+ extra_cuda_cflags=nvcc_flags,
34
+ sources=[os.path.join(_src_path, 'src', f) for f in [
35
+ 'raymarching.cu',
36
+ 'bindings.cpp',
37
+ ]],
38
+ )
39
+
40
+ __all__ = ['_backend']
raymarching/raymarching.py ADDED
@@ -0,0 +1,373 @@
1
+ import numpy as np
2
+ import time
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from torch.autograd import Function
7
+ from torch.cuda.amp import custom_bwd, custom_fwd
8
+
9
+ try:
10
+ import _raymarching as _backend
11
+ except ImportError:
12
+ from .backend import _backend
13
+
14
+
15
+ # ----------------------------------------
16
+ # utils
17
+ # ----------------------------------------
18
+
19
+ class _near_far_from_aabb(Function):
20
+ @staticmethod
21
+ @custom_fwd(cast_inputs=torch.float32)
22
+ def forward(ctx, rays_o, rays_d, aabb, min_near=0.2):
23
+ ''' near_far_from_aabb, CUDA implementation
24
+ Calculate rays' intersection time (near and far) with aabb
25
+ Args:
26
+ rays_o: float, [N, 3]
27
+ rays_d: float, [N, 3]
28
+ aabb: float, [6], (xmin, ymin, zmin, xmax, ymax, zmax)
29
+ min_near: float, scalar
30
+ Returns:
31
+ nears: float, [N]
32
+ fars: float, [N]
33
+ '''
34
+ if not rays_o.is_cuda: rays_o = rays_o.cuda()
35
+ if not rays_d.is_cuda: rays_d = rays_d.cuda()
36
+
37
+ rays_o = rays_o.contiguous().view(-1, 3)
38
+ rays_d = rays_d.contiguous().view(-1, 3)
39
+
40
+ N = rays_o.shape[0] # num rays
41
+
42
+ nears = torch.empty(N, dtype=rays_o.dtype, device=rays_o.device)
43
+ fars = torch.empty(N, dtype=rays_o.dtype, device=rays_o.device)
44
+
45
+ _backend.near_far_from_aabb(rays_o, rays_d, aabb, N, min_near, nears, fars)
46
+
47
+ return nears, fars
48
+
49
+ near_far_from_aabb = _near_far_from_aabb.apply
50
+
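+ # Editorial usage sketch (assumes the CUDA extension above built and a GPU is
+ # present; values are hypothetical): a ray shot from z=-3 towards +z through
+ # the [-1, 1]^3 box should enter at t=2 and leave at t=4.
+ def _demo_near_far():
+     import torch
+     if not torch.cuda.is_available():
+         return
+     rays_o = torch.tensor([[0.0, 0.0, -3.0]], device='cuda')
+     rays_d = torch.tensor([[0.0, 0.0, 1.0]], device='cuda')
+     aabb = torch.tensor([-1, -1, -1, 1, 1, 1], dtype=torch.float32, device='cuda')
+     nears, fars = near_far_from_aabb(rays_o, rays_d, aabb, 0.2)
+     assert abs(nears.item() - 2.0) < 1e-4 and abs(fars.item() - 4.0) < 1e-4
+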
51
+
52
+ class _sph_from_ray(Function):
53
+ @staticmethod
54
+ @custom_fwd(cast_inputs=torch.float32)
55
+ def forward(ctx, rays_o, rays_d, radius):
56
+ ''' sph_from_ray, CUDA implementation
57
+ get spherical coordinate on the background sphere from rays.
58
+ Assume rays_o are inside the Sphere(radius).
59
+ Args:
60
+ rays_o: [N, 3]
61
+ rays_d: [N, 3]
62
+ radius: scalar, float
63
+ Return:
64
+ coords: [N, 2], in [-1, 1], theta and phi on the sphere (the farther surface intersection).
65
+ '''
66
+ if not rays_o.is_cuda: rays_o = rays_o.cuda()
67
+ if not rays_d.is_cuda: rays_d = rays_d.cuda()
68
+
69
+ rays_o = rays_o.contiguous().view(-1, 3)
70
+ rays_d = rays_d.contiguous().view(-1, 3)
71
+
72
+ N = rays_o.shape[0] # num rays
73
+
74
+ coords = torch.empty(N, 2, dtype=rays_o.dtype, device=rays_o.device)
75
+
76
+ _backend.sph_from_ray(rays_o, rays_d, radius, N, coords)
77
+
78
+ return coords
79
+
80
+ sph_from_ray = _sph_from_ray.apply
81
+
82
+
83
+ class _morton3D(Function):
84
+ @staticmethod
85
+ def forward(ctx, coords):
86
+ ''' morton3D, CUDA implementation
87
+ Args:
88
+ coords: [N, 3], int32, in [0, 128) (for some reason there is no uint32 tensor in torch...)
89
+ TODO: check if the coord range is valid! (current 128 is safe)
90
+ Returns:
91
+ indices: [N], int32, in [0, 128^3)
92
+
93
+ '''
94
+ if not coords.is_cuda: coords = coords.cuda()
95
+
96
+ N = coords.shape[0]
97
+
98
+ indices = torch.empty(N, dtype=torch.int32, device=coords.device)
99
+
100
+ _backend.morton3D(coords.int(), N, indices)
101
+
102
+ return indices
103
+
104
+ morton3D = _morton3D.apply
105
+
106
+ class _morton3D_invert(Function):
107
+ @staticmethod
108
+ def forward(ctx, indices):
109
+ ''' morton3D_invert, CUDA implementation
110
+ Args:
111
+ indices: [N], int32, in [0, 128^3)
112
+ Returns:
113
+ coords: [N, 3], int32, in [0, 128)
114
+
115
+ '''
116
+ if not indices.is_cuda: indices = indices.cuda()
117
+
118
+ N = indices.shape[0]
119
+
120
+ coords = torch.empty(N, 3, dtype=torch.int32, device=indices.device)
121
+
122
+ _backend.morton3D_invert(indices.int(), N, coords)
123
+
124
+ return coords
125
+
126
+ morton3D_invert = _morton3D_invert.apply
127
+
128
+
129
+ class _packbits(Function):
130
+ @staticmethod
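+ # Editorial sanity check (same assumptions as above: built extension plus a
+ # GPU): morton3D and morton3D_invert should be inverses on the [0, 128)^3 grid.
+ def _demo_morton_roundtrip():
+     import torch
+     if not torch.cuda.is_available():
+         return
+     coords = torch.randint(0, 128, (64, 3), dtype=torch.int32, device='cuda')
+     assert (morton3D_invert(morton3D(coords)) == coords).all()
+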
131
+ @custom_fwd(cast_inputs=torch.float32)
132
+ def forward(ctx, grid, thresh, bitfield=None):
133
+ ''' packbits, CUDA implementation
134
+ Pack up the density grid into a bit field to accelerate ray marching.
135
+ Args:
136
+ grid: float, [C, H * H * H], assume H % 2 == 0
137
+ thresh: float, threshold
138
+ Returns:
139
+ bitfield: uint8, [C, H * H * H / 8]
140
+ '''
141
+ if not grid.is_cuda: grid = grid.cuda()
142
+ grid = grid.contiguous()
143
+
144
+ C = grid.shape[0]
145
+ H3 = grid.shape[1]
146
+ N = C * H3 // 8
147
+
148
+ if bitfield is None:
149
+ bitfield = torch.empty(N, dtype=torch.uint8, device=grid.device)
150
+
151
+ _backend.packbits(grid, N, thresh, bitfield)
152
+
153
+ return bitfield
154
+
155
+ packbits = _packbits.apply
156
+
157
+ # ----------------------------------------
158
+ # train functions
159
+ # ----------------------------------------
160
+
161
+ class _march_rays_train(Function):
162
+ @staticmethod
163
+ @custom_fwd(cast_inputs=torch.float32)
164
+ def forward(ctx, rays_o, rays_d, bound, density_bitfield, C, H, nears, fars, step_counter=None, mean_count=-1, perturb=False, align=-1, force_all_rays=False, dt_gamma=0, max_steps=1024):
165
+ ''' march rays to generate points (forward only)
166
+ Args:
167
+ rays_o/d: float, [N, 3]
168
+ bound: float, scalar
169
+ density_bitfield: uint8: [CHHH // 8]
170
+ C: int
171
+ H: int
172
+ nears/fars: float, [N]
173
+ step_counter: int32, (2), used to count the actual number of generated points.
174
+ mean_count: int32, estimated mean steps to accelerate training. (but will randomly drop rays if the actual point count exceeded this threshold.)
175
+ perturb: bool
176
+ align: int, pad output so its size is divisible by align; set to -1 to disable.
177
+ force_all_rays: bool, ignore step_counter and mean_count, always calculate all rays. Useful if rendering the whole image, instead of some rays.
178
+ dt_gamma: float, called cone_angle in instant-ngp, exponentially accelerates ray marching if > 0 (a very significant effect, but it generally leads to worse quality)
179
+ max_steps: int, max number of sampled points along each ray, also affect min_stepsize.
180
+ Returns:
181
+ xyzs: float, [M, 3], all generated points' coords (all rays concatenated; use `rays` to extract the points belonging to each ray)
182
+ dirs: float, [M, 3], all generated points' view dirs.
183
+ deltas: float, [M, 2], all generated points' deltas. (first for RGB, second for Depth)
184
+ rays: int32, [N, 3], all rays' (index, point_offset, point_count), e.g., xyzs[rays[i, 1]:rays[i, 2]] --> points belonging to rays[i, 0]
185
+ '''
186
+
187
+ if not rays_o.is_cuda: rays_o = rays_o.cuda()
188
+ if not rays_d.is_cuda: rays_d = rays_d.cuda()
189
+ if not density_bitfield.is_cuda: density_bitfield = density_bitfield.cuda()
190
+
191
+ rays_o = rays_o.contiguous().view(-1, 3)
192
+ rays_d = rays_d.contiguous().view(-1, 3)
193
+ density_bitfield = density_bitfield.contiguous()
194
+
195
+ N = rays_o.shape[0] # num rays
196
+ M = N * max_steps # init max points number in total
197
+
198
+ # running average based on the previous epoch (mimics `measured_batch_size_before_compaction` in instant-ngp)
199
+ # It estimates the max point count to enable faster training, but rays will be randomly dropped if it is underestimated.
200
+ if not force_all_rays and mean_count > 0:
201
+ if align > 0:
202
+ mean_count += align - mean_count % align
203
+ M = mean_count
204
+
205
+ xyzs = torch.zeros(M, 3, dtype=rays_o.dtype, device=rays_o.device)
206
+ dirs = torch.zeros(M, 3, dtype=rays_o.dtype, device=rays_o.device)
207
+ deltas = torch.zeros(M, 2, dtype=rays_o.dtype, device=rays_o.device)
208
+ rays = torch.empty(N, 3, dtype=torch.int32, device=rays_o.device) # id, offset, num_steps
209
+
210
+ if step_counter is None:
211
+ step_counter = torch.zeros(2, dtype=torch.int32, device=rays_o.device) # point counter, ray counter
212
+
213
+ if perturb:
214
+ noises = torch.rand(N, dtype=rays_o.dtype, device=rays_o.device)
215
+ else:
216
+ noises = torch.zeros(N, dtype=rays_o.dtype, device=rays_o.device)
217
+
218
+ _backend.march_rays_train(rays_o, rays_d, density_bitfield, bound, dt_gamma, max_steps, N, C, H, M, nears, fars, xyzs, dirs, deltas, rays, step_counter, noises) # m (step_counter[0]) is the number of points actually generated
219
+
220
+ #print(step_counter, M)
221
+
222
+ # only used at the first (few) epochs.
223
+ if force_all_rays or mean_count <= 0:
224
+ m = step_counter[0].item() # D2H copy
225
+ if align > 0:
226
+ m += align - m % align
227
+ xyzs = xyzs[:m]
228
+ dirs = dirs[:m]
229
+ deltas = deltas[:m]
230
+
231
+ torch.cuda.empty_cache()
232
+
233
+ return xyzs, dirs, deltas, rays
234
+
235
+ march_rays_train = _march_rays_train.apply
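# --- Editorial sketch (not part of the original file): minimal usage of
# march_rays_train, with illustrative shapes and values. Function.apply only
# accepts positional arguments, so the optional arguments are written out in
# order (step_counter, mean_count, perturb, align, force_all_rays, dt_gamma,
# max_steps). Assumes the extension is built and a CUDA device is available.
def _march_rays_train_example(device='cuda'):
    N, C, H, bound = 4096, 4, 128, 2.0
    rays_o = torch.zeros(N, 3, device=device)
    rays_d = torch.nn.functional.normalize(torch.randn(N, 3, device=device), dim=-1)
    # an all-ones bitfield marks every cell occupied, so points are generated
    density_bitfield = torch.full((C * H ** 3 // 8,), 255, dtype=torch.uint8, device=device)
    nears = torch.full((N,), 0.05, device=device)
    fars = torch.full((N,), 2.0, device=device)
    xyzs, dirs, deltas, rays = march_rays_train(
        rays_o, rays_d, bound, density_bitfield, C, H, nears, fars,
        None, -1, True, -1, True, 0, 1024)
    # xyzs/dirs: [M, 3], deltas: [M, 2], rays: [N, 3] = (ray index, offset, count)
    return xyzs, dirs, deltas, rays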
236
+
237
+
238
+ class _composite_rays_train(Function):
239
+ @staticmethod
240
+ @custom_fwd(cast_inputs=torch.float32)
241
+ def forward(ctx, sigmas, rgbs, deltas, rays, T_thresh=1e-4):
242
+ ''' composite rays' rgbs, according to the ray marching formula.
243
+ Args:
244
+ rgbs: float, [M, 3]
245
+ sigmas: float, [M,]
246
+ deltas: float, [M, 2]
247
+ rays: int32, [N, 3]
248
+ Returns:
249
+ weights_sum: float, [N,], the alpha channel
250
+ depth: float, [N,], the depth
251
+ image: float, [N, 3], the RGB channel (after multiplying alpha!)
252
+ '''
253
+
254
+ sigmas = sigmas.contiguous()
255
+ rgbs = rgbs.contiguous()
256
+
257
+ M = sigmas.shape[0]
258
+ N = rays.shape[0]
259
+
260
+ weights_sum = torch.empty(N, dtype=sigmas.dtype, device=sigmas.device)
261
+ depth = torch.empty(N, dtype=sigmas.dtype, device=sigmas.device)
262
+ image = torch.empty(N, 3, dtype=sigmas.dtype, device=sigmas.device)
263
+
264
+ _backend.composite_rays_train_forward(sigmas, rgbs, deltas, rays, M, N, T_thresh, weights_sum, depth, image)
265
+
266
+ ctx.save_for_backward(sigmas, rgbs, deltas, rays, weights_sum, depth, image)
267
+ ctx.dims = [M, N, T_thresh]
268
+
269
+ return weights_sum, depth, image
270
+
271
+ @staticmethod
272
+ @custom_bwd
273
+ def backward(ctx, grad_weights_sum, grad_depth, grad_image):
274
+
275
+ # NOTE: grad_depth is not used now! It won't be propagated to sigmas.
276
+
277
+ grad_weights_sum = grad_weights_sum.contiguous()
278
+ grad_image = grad_image.contiguous()
279
+
280
+ sigmas, rgbs, deltas, rays, weights_sum, depth, image = ctx.saved_tensors
281
+ M, N, T_thresh = ctx.dims
282
+
283
+ grad_sigmas = torch.zeros_like(sigmas)
284
+ grad_rgbs = torch.zeros_like(rgbs)
285
+
286
+ _backend.composite_rays_train_backward(grad_weights_sum, grad_image, sigmas, rgbs, deltas, rays, weights_sum, image, M, N, T_thresh, grad_sigmas, grad_rgbs)
287
+
288
+ return grad_sigmas, grad_rgbs, None, None, None
289
+
290
+
291
+ composite_rays_train = _composite_rays_train.apply
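# --- Editorial sketch (not part of the original file): the compositing that
# the CUDA kernel implements, written for a single ray in plain PyTorch as a
# readability reference. It omits the T_thresh early-termination check.
def _composite_one_ray_reference(sigmas, rgbs, deltas):
    # sigmas: [K], rgbs: [K, 3], deltas: [K, 2] for one ray's K samples
    alphas = 1 - torch.exp(-sigmas * deltas[:, 0])                      # [K]
    # transmittance before sample i: T_i = prod_{j < i} (1 - alpha_j)
    T = torch.cumprod(torch.cat([alphas.new_ones(1), 1 - alphas[:-1]]), dim=0)
    weights = alphas * T                                                # [K]
    ts = torch.cumsum(deltas[:, 1], dim=0)  # marched distance at each sample
    weights_sum = weights.sum()             # the alpha channel
    depth = (weights * ts).sum()
    image = (weights[:, None] * rgbs).sum(0)  # RGB, already alpha-multiplied
    return weights_sum, depth, image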
292
+
293
+ # ----------------------------------------
294
+ # infer functions
295
+ # ----------------------------------------
296
+
297
+ class _march_rays(Function):
298
+ @staticmethod
299
+ @custom_fwd(cast_inputs=torch.float32)
300
+ def forward(ctx, n_alive, n_step, rays_alive, rays_t, rays_o, rays_d, bound, density_bitfield, C, H, near, far, align=-1, perturb=False, dt_gamma=0, max_steps=1024):
301
+ ''' march rays to generate points (forward only, for inference)
302
+ Args:
303
+ n_alive: int, number of alive rays
304
+ n_step: int, how many steps we march
305
+ rays_alive: int, [N], the alive rays' IDs in N (N >= n_alive, but we only use the first n_alive)
306
+ rays_t: float, [N], the alive rays' time; we only use the first n_alive.
307
+ rays_o/d: float, [N, 3]
308
+ bound: float, scalar
309
+ density_bitfield: uint8, [C * H * H * H // 8]
310
+ C: int
311
+ H: int
312
+ near/far: float, [N]
313
+ align: int, pad output so its size is divisible by align; set to -1 to disable.
314
+ perturb: bool/int, int > 0 is used as the random seed.
315
+ dt_gamma: float, called cone_angle in instant-ngp; exponentially accelerates ray marching if > 0. (very significant effect, but generally leads to worse rendering quality)
316
+ max_steps: int, max number of sampled points along each ray; also affects min_stepsize.
317
+ Returns:
318
+ xyzs: float, [n_alive * n_step, 3], all generated points' coords
319
+ dirs: float, [n_alive * n_step, 3], all generated points' view dirs.
320
+ deltas: float, [n_alive * n_step, 2], all generated points' deltas (here we record two deltas, the first is for RGB, the second for depth).
321
+ '''
322
+
323
+ if not rays_o.is_cuda: rays_o = rays_o.cuda()
324
+ if not rays_d.is_cuda: rays_d = rays_d.cuda()
325
+
326
+ rays_o = rays_o.contiguous().view(-1, 3)
327
+ rays_d = rays_d.contiguous().view(-1, 3)
328
+
329
+ M = n_alive * n_step
330
+
331
+ if align > 0:
332
+ M += align - (M % align)
333
+
334
+ xyzs = torch.zeros(M, 3, dtype=rays_o.dtype, device=rays_o.device)
335
+ dirs = torch.zeros(M, 3, dtype=rays_o.dtype, device=rays_o.device)
336
+ deltas = torch.zeros(M, 2, dtype=rays_o.dtype, device=rays_o.device) # 2 vals, one for rgb, one for depth
337
+
338
+ if perturb:
339
+ # torch.manual_seed(perturb) # test_gui uses spp index as seed
340
+ noises = torch.rand(n_alive, dtype=rays_o.dtype, device=rays_o.device)
341
+ else:
342
+ noises = torch.zeros(n_alive, dtype=rays_o.dtype, device=rays_o.device)
343
+
344
+ _backend.march_rays(n_alive, n_step, rays_alive, rays_t, rays_o, rays_d, bound, dt_gamma, max_steps, C, H, density_bitfield, near, far, xyzs, dirs, deltas, noises)
345
+
346
+ return xyzs, dirs, deltas
347
+
348
+ march_rays = _march_rays.apply
349
+
350
+
351
+ class _composite_rays(Function):
352
+ @staticmethod
353
+ @custom_fwd(cast_inputs=torch.float32) # need to cast sigmas & rgbs to float
354
+ def forward(ctx, n_alive, n_step, rays_alive, rays_t, sigmas, rgbs, deltas, weights_sum, depth, image, T_thresh=1e-2):
355
+ ''' composite rays' rgbs, according to the ray marching formula. (for inference)
356
+ Args:
357
+ n_alive: int, number of alive rays
358
+ n_step: int, how many steps we march
359
+ rays_alive: int, [n_alive], the alive rays' IDs in N (N >= n_alive)
360
+ rays_t: float, [N], the alive rays' time
361
+ sigmas: float, [n_alive * n_step,]
362
+ rgbs: float, [n_alive * n_step, 3]
363
+ deltas: float, [n_alive * n_step, 2], all generated points' deltas (here we record two deltas, the first is for RGB, the second for depth).
364
+ In-place Outputs:
365
+ weights_sum: float, [N,], the alpha channel
366
+ depth: float, [N,], the depth value
367
+ image: float, [N, 3], the RGB channel (after multiplying alpha!)
368
+ '''
369
+ _backend.composite_rays(n_alive, n_step, T_thresh, rays_alive, rays_t, sigmas, rgbs, deltas, weights_sum, depth, image)
370
+ return tuple()
371
+
372
+
373
+ composite_rays = _composite_rays.apply
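# --- Editorial sketch (not part of the original file): how march_rays and
# composite_rays cooperate at inference time in an alive-ray loop. `model` is
# an assumed callable mapping (xyzs, dirs) -> (sigmas, rgbs); composite_rays
# updates its output tensors in place and marks terminated rays with -1.
def _render_rays_sketch(model, rays_o, rays_d, nears, fars,
                        bound, density_bitfield, C, H, max_steps=1024):
    N, device = rays_o.shape[0], rays_o.device
    rays_alive = torch.arange(N, dtype=torch.int32, device=device)
    rays_t = nears.clone()
    weights_sum = torch.zeros(N, device=device)
    depth = torch.zeros(N, device=device)
    image = torch.zeros(N, 3, device=device)
    step = 0
    while step < max_steps:
        n_alive = rays_alive.shape[0]
        if n_alive == 0:
            break
        n_step = max(min(N // n_alive, 8), 1)  # march further as rays die off
        xyzs, dirs, deltas = march_rays(
            n_alive, n_step, rays_alive, rays_t, rays_o, rays_d,
            bound, density_bitfield, C, H, nears, fars)
        sigmas, rgbs = model(xyzs, dirs)
        composite_rays(n_alive, n_step, rays_alive, rays_t,
                       sigmas, rgbs, deltas, weights_sum, depth, image)
        rays_alive = rays_alive[rays_alive >= 0]  # compact terminated rays
        step += n_step
    return weights_sum, depth, image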
raymarching/setup.py ADDED
@@ -0,0 +1,62 @@
1
+ import os
2
+ from setuptools import setup
3
+ from torch.utils.cpp_extension import BuildExtension, CUDAExtension
4
+
5
+ _src_path = os.path.dirname(os.path.abspath(__file__))
6
+
7
+ nvcc_flags = [
8
+ '-O3', '-std=c++14',
9
+ '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
10
+ ]
11
+
12
+ if os.name == "posix":
13
+ c_flags = ['-O3', '-std=c++14']
14
+ elif os.name == "nt":
15
+ c_flags = ['/O2', '/std:c++17']
16
+
17
+ # find cl.exe
18
+ def find_cl_path():
19
+ import glob
20
+ for edition in ["Enterprise", "Professional", "BuildTools", "Community"]:
21
+ paths = sorted(glob.glob(r"C:\\Program Files (x86)\\Microsoft Visual Studio\\*\\%s\\VC\\Tools\\MSVC\\*\\bin\\Hostx64\\x64" % edition), reverse=True)
22
+ if paths:
23
+ return paths[0]
24
+
25
+ # If cl.exe is not on path, try to find it.
26
+ if os.system("where cl.exe >nul 2>nul") != 0:
27
+ cl_path = find_cl_path()
28
+ if cl_path is None:
29
+ raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation")
30
+ os.environ["PATH"] += ";" + cl_path
31
+
32
+ '''
33
+ Usage:
34
+
35
+ python setup.py build_ext --inplace # build extensions locally, do not install (only can be used from the parent directory)
36
+
37
+ python setup.py install # build extensions and install (copy) to PATH.
38
+ pip install . # ditto but better (e.g., dependency & metadata handling)
39
+
40
+ python setup.py develop # build extensions and install (symbolic) to PATH.
41
+ pip install -e . # ditto but better (e.g., dependency & metadata handling)
42
+
43
+ '''
44
+ setup(
45
+ name='raymarching', # package name, import this to use python API
46
+ ext_modules=[
47
+ CUDAExtension(
48
+ name='_raymarching', # extension name, import this to use CUDA API
49
+ sources=[os.path.join(_src_path, 'src', f) for f in [
50
+ 'raymarching.cu',
51
+ 'bindings.cpp',
52
+ ]],
53
+ extra_compile_args={
54
+ 'cxx': c_flags,
55
+ 'nvcc': nvcc_flags,
56
+ }
57
+ ),
58
+ ],
59
+ cmdclass={
60
+ 'build_ext': BuildExtension,
61
+ }
62
+ )
raymarching/src/bindings.cpp ADDED
@@ -0,0 +1,19 @@
1
+ #include <torch/extension.h>
2
+
3
+ #include "raymarching.h"
4
+
5
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
6
+ // utils
7
+ m.def("packbits", &packbits, "packbits (CUDA)");
8
+ m.def("near_far_from_aabb", &near_far_from_aabb, "near_far_from_aabb (CUDA)");
9
+ m.def("sph_from_ray", &sph_from_ray, "sph_from_ray (CUDA)");
10
+ m.def("morton3D", &morton3D, "morton3D (CUDA)");
11
+ m.def("morton3D_invert", &morton3D_invert, "morton3D_invert (CUDA)");
12
+ // train
13
+ m.def("march_rays_train", &march_rays_train, "march_rays_train (CUDA)");
14
+ m.def("composite_rays_train_forward", &composite_rays_train_forward, "composite_rays_train_forward (CUDA)");
15
+ m.def("composite_rays_train_backward", &composite_rays_train_backward, "composite_rays_train_backward (CUDA)");
16
+ // infer
17
+ m.def("march_rays", &march_rays, "march rays (CUDA)");
18
+ m.def("composite_rays", &composite_rays, "composite rays (CUDA)");
19
+ }
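# --- Editorial sketch (not part of the commit): exercising the compiled
# bindings directly, here to sanity-check the Morton encode/decode round
# trip. Assumes the extension was built via `pip install ./raymarching`;
# the raw bindings fill pre-allocated output tensors in place.
import torch
import _raymarching as _backend

n = 1024
coords = torch.randint(0, 128, (n, 3), dtype=torch.int32, device='cuda')
indices = torch.empty(n, dtype=torch.int32, device='cuda')
_backend.morton3D(coords, n, indices)            # encode: interleave xyz bits

recovered = torch.empty(n, 3, dtype=torch.int32, device='cuda')
_backend.morton3D_invert(indices, n, recovered)  # decode back to coordinates
assert torch.equal(coords, recovered)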
raymarching/src/raymarching.cu ADDED
@@ -0,0 +1,914 @@
1
+ #include <cuda.h>
2
+ #include <cuda_fp16.h>
3
+ #include <cuda_runtime.h>
4
+
5
+ #include <ATen/cuda/CUDAContext.h>
6
+ #include <torch/torch.h>
7
+
8
+ #include <cstdio>
9
+ #include <stdint.h>
10
+ #include <stdexcept>
11
+ #include <limits>
12
+
13
+ #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
14
+ #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be a contiguous tensor")
15
+ #define CHECK_IS_INT(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Int, #x " must be an int tensor")
16
+ #define CHECK_IS_FLOATING(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Float || x.scalar_type() == at::ScalarType::Half || x.scalar_type() == at::ScalarType::Double, #x " must be a floating tensor")
17
+
18
+
19
+ inline constexpr __device__ float SQRT3() { return 1.7320508075688772f; }
20
+ inline constexpr __device__ float RSQRT3() { return 0.5773502691896258f; }
21
+ inline constexpr __device__ float PI() { return 3.141592653589793f; }
22
+ inline constexpr __device__ float RPI() { return 0.3183098861837907f; }
23
+
24
+
25
+ template <typename T>
26
+ inline __host__ __device__ T div_round_up(T val, T divisor) {
27
+ return (val + divisor - 1) / divisor;
28
+ }
29
+
30
+ inline __host__ __device__ float signf(const float x) {
31
+ return copysignf(1.0, x);
32
+ }
33
+
34
+ inline __host__ __device__ float clamp(const float x, const float min, const float max) {
35
+ return fminf(max, fmaxf(min, x));
36
+ }
37
+
38
+ inline __host__ __device__ void swapf(float& a, float& b) {
39
+ float c = a; a = b; b = c;
40
+ }
41
+
42
+ inline __device__ int mip_from_pos(const float x, const float y, const float z, const float max_cascade) {
43
+ const float mx = fmaxf(fabsf(x), fmaxf(fabsf(y), fabsf(z)));
44
+ int exponent;
45
+ frexpf(mx, &exponent); // [0, 0.5) --> -1, [0.5, 1) --> 0, [1, 2) --> 1, [2, 4) --> 2, ...
46
+ return fminf(max_cascade - 1, fmaxf(0, exponent));
47
+ }
48
+
49
+ inline __device__ int mip_from_dt(const float dt, const float H, const float max_cascade) {
50
+ const float mx = dt * H * 0.5;
51
+ int exponent;
52
+ frexpf(mx, &exponent);
53
+ return fminf(max_cascade - 1, fmaxf(0, exponent));
54
+ }
55
+
56
+ inline __host__ __device__ uint32_t __expand_bits(uint32_t v)
57
+ {
58
+ v = (v * 0x00010001u) & 0xFF0000FFu;
59
+ v = (v * 0x00000101u) & 0x0F00F00Fu;
60
+ v = (v * 0x00000011u) & 0xC30C30C3u;
61
+ v = (v * 0x00000005u) & 0x49249249u;
62
+ return v;
63
+ }
64
+
65
+ inline __host__ __device__ uint32_t __morton3D(uint32_t x, uint32_t y, uint32_t z)
66
+ {
67
+ uint32_t xx = __expand_bits(x);
68
+ uint32_t yy = __expand_bits(y);
69
+ uint32_t zz = __expand_bits(z);
70
+ return xx | (yy << 1) | (zz << 2);
71
+ }
72
+
73
+ inline __host__ __device__ uint32_t __morton3D_invert(uint32_t x)
74
+ {
75
+ x = x & 0x49249249;
76
+ x = (x | (x >> 2)) & 0xc30c30c3;
77
+ x = (x | (x >> 4)) & 0x0f00f00f;
78
+ x = (x | (x >> 8)) & 0xff0000ff;
79
+ x = (x | (x >> 16)) & 0x0000ffff;
80
+ return x;
81
+ }
82
+
83
+
84
+ ////////////////////////////////////////////////////
85
+ ///////////// utils /////////////
86
+ ////////////////////////////////////////////////////
87
+
88
+ // rays_o/d: [N, 3]
89
+ // nears/fars: [N]
90
+ // scalar_t should always be float in use.
91
+ template <typename scalar_t>
92
+ __global__ void kernel_near_far_from_aabb(
93
+ const scalar_t * __restrict__ rays_o,
94
+ const scalar_t * __restrict__ rays_d,
95
+ const scalar_t * __restrict__ aabb,
96
+ const uint32_t N,
97
+ const float min_near,
98
+ scalar_t * nears, scalar_t * fars
99
+ ) {
100
+ // parallel per ray
101
+ const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
102
+ if (n >= N) return;
103
+
104
+ // locate
105
+ rays_o += n * 3;
106
+ rays_d += n * 3;
107
+
108
+ const float ox = rays_o[0], oy = rays_o[1], oz = rays_o[2];
109
+ const float dx = rays_d[0], dy = rays_d[1], dz = rays_d[2];
110
+ const float rdx = 1 / dx, rdy = 1 / dy, rdz = 1 / dz;
111
+
112
+ // get near far (assume cube scene)
113
+ float near = (aabb[0] - ox) * rdx;
114
+ float far = (aabb[3] - ox) * rdx;
115
+ if (near > far) swapf(near, far);
116
+
117
+ float near_y = (aabb[1] - oy) * rdy;
118
+ float far_y = (aabb[4] - oy) * rdy;
119
+ if (near_y > far_y) swapf(near_y, far_y);
120
+
121
+ if (near > far_y || near_y > far) {
122
+ nears[n] = fars[n] = std::numeric_limits<scalar_t>::max();
123
+ return;
124
+ }
125
+
126
+ if (near_y > near) near = near_y;
127
+ if (far_y < far) far = far_y;
128
+
129
+ float near_z = (aabb[2] - oz) * rdz;
130
+ float far_z = (aabb[5] - oz) * rdz;
131
+ if (near_z > far_z) swapf(near_z, far_z);
132
+
133
+ if (near > far_z || near_z > far) {
134
+ nears[n] = fars[n] = std::numeric_limits<scalar_t>::max();
135
+ return;
136
+ }
137
+
138
+ if (near_z > near) near = near_z;
139
+ if (far_z < far) far = far_z;
140
+
141
+ if (near < min_near) near = min_near;
142
+
143
+ nears[n] = near;
144
+ fars[n] = far;
145
+ }
146
+
147
+
148
+ void near_far_from_aabb(const at::Tensor rays_o, const at::Tensor rays_d, const at::Tensor aabb, const uint32_t N, const float min_near, at::Tensor nears, at::Tensor fars) {
149
+
150
+ static constexpr uint32_t N_THREAD = 128;
151
+
152
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
153
+ rays_o.scalar_type(), "near_far_from_aabb", ([&] {
154
+ kernel_near_far_from_aabb<<<div_round_up(N, N_THREAD), N_THREAD>>>(rays_o.data_ptr<scalar_t>(), rays_d.data_ptr<scalar_t>(), aabb.data_ptr<scalar_t>(), N, min_near, nears.data_ptr<scalar_t>(), fars.data_ptr<scalar_t>());
155
+ }));
156
+ }
157
+
158
+
159
+ // rays_o/d: [N, 3]
160
+ // radius: float
161
+ // coords: [N, 2]
162
+ template <typename scalar_t>
163
+ __global__ void kernel_sph_from_ray(
164
+ const scalar_t * __restrict__ rays_o,
165
+ const scalar_t * __restrict__ rays_d,
166
+ const float radius,
167
+ const uint32_t N,
168
+ scalar_t * coords
169
+ ) {
170
+ // parallel per ray
171
+ const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
172
+ if (n >= N) return;
173
+
174
+ // locate
175
+ rays_o += n * 3;
176
+ rays_d += n * 3;
177
+ coords += n * 2;
178
+
179
+ const float ox = rays_o[0], oy = rays_o[1], oz = rays_o[2];
180
+ const float dx = rays_d[0], dy = rays_d[1], dz = rays_d[2];
181
+ const float rdx = 1 / dx, rdy = 1 / dy, rdz = 1 / dz;
182
+
183
+ // solve t from || o + td || = radius
184
+ const float A = dx * dx + dy * dy + dz * dz;
185
+ const float B = ox * dx + oy * dy + oz * dz; // in fact B / 2
186
+ const float C = ox * ox + oy * oy + oz * oz - radius * radius;
187
+
188
+ const float t = (- B + sqrtf(B * B - A * C)) / A; // always use the larger solution (positive)
189
+
190
+ // solve theta, phi (assume y is the up axis)
191
+ const float x = ox + t * dx, y = oy + t * dy, z = oz + t * dz;
192
+ const float theta = atan2(sqrtf(x * x + z * z), y); // [0, PI)
193
+ const float phi = atan2(z, x); // [-PI, PI)
194
+
195
+ // normalize to [-1, 1]
196
+ coords[0] = 2 * theta * RPI() - 1;
197
+ coords[1] = phi * RPI();
198
+ }
199
+
200
+
201
+ void sph_from_ray(const at::Tensor rays_o, const at::Tensor rays_d, const float radius, const uint32_t N, at::Tensor coords) {
202
+
203
+ static constexpr uint32_t N_THREAD = 128;
204
+
205
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
206
+ rays_o.scalar_type(), "sph_from_ray", ([&] {
207
+ kernel_sph_from_ray<<<div_round_up(N, N_THREAD), N_THREAD>>>(rays_o.data_ptr<scalar_t>(), rays_d.data_ptr<scalar_t>(), radius, N, coords.data_ptr<scalar_t>());
208
+ }));
209
+ }
210
+
211
+
212
+ // coords: int32, [N, 3]
213
+ // indices: int32, [N]
214
+ __global__ void kernel_morton3D(
215
+ const int * __restrict__ coords,
216
+ const uint32_t N,
217
+ int * indices
218
+ ) {
219
+ // parallel
220
+ const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
221
+ if (n >= N) return;
222
+
223
+ // locate
224
+ coords += n * 3;
225
+ indices[n] = __morton3D(coords[0], coords[1], coords[2]);
226
+ }
227
+
228
+
229
+ void morton3D(const at::Tensor coords, const uint32_t N, at::Tensor indices) {
230
+ static constexpr uint32_t N_THREAD = 128;
231
+ kernel_morton3D<<<div_round_up(N, N_THREAD), N_THREAD>>>(coords.data_ptr<int>(), N, indices.data_ptr<int>());
232
+ }
233
+
234
+
235
+ // indices: int32, [N]
236
+ // coords: int32, [N, 3]
237
+ __global__ void kernel_morton3D_invert(
238
+ const int * __restrict__ indices,
239
+ const uint32_t N,
240
+ int * coords
241
+ ) {
242
+ // parallel
243
+ const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
244
+ if (n >= N) return;
245
+
246
+ // locate
247
+ coords += n * 3;
248
+
249
+ const int ind = indices[n];
250
+
251
+ coords[0] = __morton3D_invert(ind >> 0);
252
+ coords[1] = __morton3D_invert(ind >> 1);
253
+ coords[2] = __morton3D_invert(ind >> 2);
254
+ }
255
+
256
+
257
+ void morton3D_invert(const at::Tensor indices, const uint32_t N, at::Tensor coords) {
258
+ static constexpr uint32_t N_THREAD = 128;
259
+ kernel_morton3D_invert<<<div_round_up(N, N_THREAD), N_THREAD>>>(indices.data_ptr<int>(), N, coords.data_ptr<int>());
260
+ }
261
+
262
+
263
+ // grid: float, [C, H, H, H]
264
+ // N: int, C * H * H * H / 8
265
+ // density_thresh: float
266
+ // bitfield: uint8, [N]
267
+ template <typename scalar_t>
268
+ __global__ void kernel_packbits(
269
+ const scalar_t * __restrict__ grid,
270
+ const uint32_t N,
271
+ const float density_thresh,
272
+ uint8_t * bitfield
273
+ ) {
274
+ // parallel per byte
275
+ const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
276
+ if (n >= N) return;
277
+
278
+ // locate
279
+ grid += n * 8;
280
+
281
+ uint8_t bits = 0;
282
+
283
+ #pragma unroll
284
+ for (uint8_t i = 0; i < 8; i++) {
285
+ bits |= (grid[i] > density_thresh) ? ((uint8_t)1 << i) : 0;
286
+ }
287
+
288
+ bitfield[n] = bits;
289
+ }
290
+
291
+
292
+ void packbits(const at::Tensor grid, const uint32_t N, const float density_thresh, at::Tensor bitfield) {
293
+
294
+ static constexpr uint32_t N_THREAD = 128;
295
+
296
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
297
+ grid.scalar_type(), "packbits", ([&] {
298
+ kernel_packbits<<<div_round_up(N, N_THREAD), N_THREAD>>>(grid.data_ptr<scalar_t>(), N, density_thresh, bitfield.data_ptr<uint8_t>());
299
+ }));
300
+ }
301
+
302
+ ////////////////////////////////////////////////////
303
+ ///////////// training /////////////
304
+ ////////////////////////////////////////////////////
305
+
306
+ // rays_o/d: [N, 3]
307
+ // grid: [CHHH / 8]
308
+ // xyzs, dirs, deltas: [M, 3], [M, 3], [M, 2]
309
+ // dirs: [M, 3]
310
+ // rays: [N, 3], idx, offset, num_steps
311
+ template <typename scalar_t>
312
+ __global__ void kernel_march_rays_train(
313
+ const scalar_t * __restrict__ rays_o,
314
+ const scalar_t * __restrict__ rays_d,
315
+ const uint8_t * __restrict__ grid,
316
+ const float bound,
317
+ const float dt_gamma, const uint32_t max_steps,
318
+ const uint32_t N, const uint32_t C, const uint32_t H, const uint32_t M,
319
+ const scalar_t* __restrict__ nears,
320
+ const scalar_t* __restrict__ fars,
321
+ scalar_t * xyzs, scalar_t * dirs, scalar_t * deltas,
322
+ int * rays,
323
+ int * counter,
324
+ const scalar_t* __restrict__ noises
325
+ ) {
326
+ // parallel per ray
327
+ const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
328
+ if (n >= N) return;
329
+
330
+ // locate
331
+ rays_o += n * 3;
332
+ rays_d += n * 3;
333
+
334
+ // ray marching
335
+ const float ox = rays_o[0], oy = rays_o[1], oz = rays_o[2];
336
+ const float dx = rays_d[0], dy = rays_d[1], dz = rays_d[2];
337
+ const float rdx = 1 / dx, rdy = 1 / dy, rdz = 1 / dz;
338
+ const float rH = 1 / (float)H;
339
+ const float H3 = H * H * H;
340
+
341
+ const float near = nears[n];
342
+ const float far = fars[n];
343
+ const float noise = noises[n];
344
+
345
+ const float dt_min = 2 * SQRT3() / max_steps;
346
+ const float dt_max = 2 * SQRT3() * (1 << (C - 1)) / H;
347
+
348
+ float t0 = near;
349
+
350
+ // perturb
351
+ t0 += clamp(t0 * dt_gamma, dt_min, dt_max) * noise;
352
+
353
+ // first pass: estimation of num_steps
354
+ float t = t0;
355
+ uint32_t num_steps = 0;
356
+
357
+ //if (t < far) printf("valid ray %d t=%f near=%f far=%f \n", n, t, near, far);
358
+
359
+ while (t < far && num_steps < max_steps) {
360
+ // current point
361
+ const float x = clamp(ox + t * dx, -bound, bound);
362
+ const float y = clamp(oy + t * dy, -bound, bound);
363
+ const float z = clamp(oz + t * dz, -bound, bound);
364
+
365
+ const float dt = clamp(t * dt_gamma, dt_min, dt_max);
366
+
367
+ // get mip level
368
+ const int level = max(mip_from_pos(x, y, z, C), mip_from_dt(dt, H, C)); // range in [0, C - 1]
369
+
370
+ const float mip_bound = fminf(scalbnf(1.0f, level), bound);
371
+ const float mip_rbound = 1 / mip_bound;
372
+
373
+ // convert to nearest grid position
374
+ const int nx = clamp(0.5 * (x * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
375
+ const int ny = clamp(0.5 * (y * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
376
+ const int nz = clamp(0.5 * (z * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
377
+
378
+ const uint32_t index = level * H3 + __morton3D(nx, ny, nz);
379
+ const bool occ = grid[index / 8] & (1 << (index % 8));
380
+
381
+ // if occupied, advance a small step, and write to output
382
+ //if (n == 0) printf("t=%f density=%f vs thresh=%f step=%d\n", t, density, density_thresh, num_steps);
383
+
384
+ if (occ) {
385
+ num_steps++;
386
+ t += dt;
387
+ // else, skip a large step (basically skip a voxel grid)
388
+ } else {
389
+ // calc distance to next voxel
390
+ const float tx = (((nx + 0.5f + 0.5f * signf(dx)) * rH * 2 - 1) * mip_bound - x) * rdx;
391
+ const float ty = (((ny + 0.5f + 0.5f * signf(dy)) * rH * 2 - 1) * mip_bound - y) * rdy;
392
+ const float tz = (((nz + 0.5f + 0.5f * signf(dz)) * rH * 2 - 1) * mip_bound - z) * rdz;
393
+
394
+ const float tt = t + fmaxf(0.0f, fminf(tx, fminf(ty, tz)));
395
+ // step until next voxel
396
+ do {
397
+ t += clamp(t * dt_gamma, dt_min, dt_max);
398
+ } while (t < tt);
399
+ }
400
+ }
401
+
402
+ //printf("[n=%d] num_steps=%d, near=%f, far=%f, dt=%f, max_steps=%f\n", n, num_steps, near, far, dt_min, (far - near) / dt_min);
403
+
404
+ // second pass: really locate and write points & dirs
405
+ uint32_t point_index = atomicAdd(counter, num_steps);
406
+ uint32_t ray_index = atomicAdd(counter + 1, 1);
407
+
408
+ //printf("[n=%d] num_steps=%d, point_index=%d, ray_index=%d\n", n, num_steps, point_index, ray_index);
409
+
410
+ // write rays
411
+ rays[ray_index * 3] = n;
412
+ rays[ray_index * 3 + 1] = point_index;
413
+ rays[ray_index * 3 + 2] = num_steps;
414
+
415
+ if (num_steps == 0) return;
416
+ if (point_index + num_steps > M) return;
417
+
418
+ xyzs += point_index * 3;
419
+ dirs += point_index * 3;
420
+ deltas += point_index * 2;
421
+
422
+ t = t0;
423
+ uint32_t step = 0;
424
+
425
+ float last_t = t;
426
+
427
+ while (t < far && step < num_steps) {
428
+ // current point
429
+ const float x = clamp(ox + t * dx, -bound, bound);
430
+ const float y = clamp(oy + t * dy, -bound, bound);
431
+ const float z = clamp(oz + t * dz, -bound, bound);
432
+
433
+ const float dt = clamp(t * dt_gamma, dt_min, dt_max);
434
+
435
+ // get mip level
436
+ const int level = max(mip_from_pos(x, y, z, C), mip_from_dt(dt, H, C)); // range in [0, C - 1]
437
+
438
+ const float mip_bound = fminf(scalbnf(1.0f, level), bound);
439
+ const float mip_rbound = 1 / mip_bound;
440
+
441
+ // convert to nearest grid position
442
+ const int nx = clamp(0.5 * (x * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
443
+ const int ny = clamp(0.5 * (y * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
444
+ const int nz = clamp(0.5 * (z * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
445
+
446
+ // query grid
447
+ const uint32_t index = level * H3 + __morton3D(nx, ny, nz);
448
+ const bool occ = grid[index / 8] & (1 << (index % 8));
449
+
450
+ // if occupied, advance a small step, and write to output
451
+ if (occ) {
452
+ // write step
453
+ xyzs[0] = x;
454
+ xyzs[1] = y;
455
+ xyzs[2] = z;
456
+ dirs[0] = dx;
457
+ dirs[1] = dy;
458
+ dirs[2] = dz;
459
+ t += dt;
460
+ deltas[0] = dt;
461
+ deltas[1] = t - last_t; // used to calc depth
462
+ last_t = t;
463
+ xyzs += 3;
464
+ dirs += 3;
465
+ deltas += 2;
466
+ step++;
467
+ // else, skip a large step (basically skip a voxel grid)
468
+ } else {
469
+ // calc distance to next voxel
470
+ const float tx = (((nx + 0.5f + 0.5f * signf(dx)) * rH * 2 - 1) * mip_bound - x) * rdx;
471
+ const float ty = (((ny + 0.5f + 0.5f * signf(dy)) * rH * 2 - 1) * mip_bound - y) * rdy;
472
+ const float tz = (((nz + 0.5f + 0.5f * signf(dz)) * rH * 2 - 1) * mip_bound - z) * rdz;
473
+ const float tt = t + fmaxf(0.0f, fminf(tx, fminf(ty, tz)));
474
+ // step until next voxel
475
+ do {
476
+ t += clamp(t * dt_gamma, dt_min, dt_max);
477
+ } while (t < tt);
478
+ }
479
+ }
480
+ }
481
+
482
+ void march_rays_train(const at::Tensor rays_o, const at::Tensor rays_d, const at::Tensor grid, const float bound, const float dt_gamma, const uint32_t max_steps, const uint32_t N, const uint32_t C, const uint32_t H, const uint32_t M, const at::Tensor nears, const at::Tensor fars, at::Tensor xyzs, at::Tensor dirs, at::Tensor deltas, at::Tensor rays, at::Tensor counter, at::Tensor noises) {
483
+
484
+ static constexpr uint32_t N_THREAD = 128;
485
+
486
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
487
+ rays_o.scalar_type(), "march_rays_train", ([&] {
488
+ kernel_march_rays_train<<<div_round_up(N, N_THREAD), N_THREAD>>>(rays_o.data_ptr<scalar_t>(), rays_d.data_ptr<scalar_t>(), grid.data_ptr<uint8_t>(), bound, dt_gamma, max_steps, N, C, H, M, nears.data_ptr<scalar_t>(), fars.data_ptr<scalar_t>(), xyzs.data_ptr<scalar_t>(), dirs.data_ptr<scalar_t>(), deltas.data_ptr<scalar_t>(), rays.data_ptr<int>(), counter.data_ptr<int>(), noises.data_ptr<scalar_t>());
489
+ }));
490
+ }
491
+
492
+
493
+ // sigmas: [M]
494
+ // rgbs: [M, 3]
495
+ // deltas: [M, 2]
496
+ // rays: [N, 3], idx, offset, num_steps
497
+ // weights_sum: [N], final pixel alpha
498
+ // depth: [N,]
499
+ // image: [N, 3]
500
+ template <typename scalar_t>
501
+ __global__ void kernel_composite_rays_train_forward(
502
+ const scalar_t * __restrict__ sigmas,
503
+ const scalar_t * __restrict__ rgbs,
504
+ const scalar_t * __restrict__ deltas,
505
+ const int * __restrict__ rays,
506
+ const uint32_t M, const uint32_t N, const float T_thresh,
507
+ scalar_t * weights_sum,
508
+ scalar_t * depth,
509
+ scalar_t * image
510
+ ) {
511
+ // parallel per ray
512
+ const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
513
+ if (n >= N) return;
514
+
515
+ // locate
516
+ uint32_t index = rays[n * 3];
517
+ uint32_t offset = rays[n * 3 + 1];
518
+ uint32_t num_steps = rays[n * 3 + 2];
519
+
520
+ // empty ray, or a ray that exceeds the max step count.
521
+ if (num_steps == 0 || offset + num_steps > M) {
522
+ weights_sum[index] = 0;
523
+ depth[index] = 0;
524
+ image[index * 3] = 0;
525
+ image[index * 3 + 1] = 0;
526
+ image[index * 3 + 2] = 0;
527
+ return;
528
+ }
529
+
530
+ sigmas += offset;
531
+ rgbs += offset * 3;
532
+ deltas += offset * 2;
533
+
534
+ // accumulate
535
+ uint32_t step = 0;
536
+
537
+ scalar_t T = 1.0f;
538
+ scalar_t r = 0, g = 0, b = 0, ws = 0, t = 0, d = 0;
539
+
540
+ while (step < num_steps) {
541
+
542
+ const scalar_t alpha = 1.0f - __expf(- sigmas[0] * deltas[0]);
543
+ const scalar_t weight = alpha * T;
544
+
545
+ r += weight * rgbs[0];
546
+ g += weight * rgbs[1];
547
+ b += weight * rgbs[2];
548
+
549
+ t += deltas[1]; // real delta
550
+ d += weight * t;
551
+
552
+ ws += weight;
553
+
554
+ T *= 1.0f - alpha;
555
+
556
+ // minimal remaining transmittance
557
+ if (T < T_thresh) break;
558
+
559
+ //printf("[n=%d] num_steps=%d, alpha=%f, w=%f, T=%f, sum_dt=%f, d=%f\n", n, step, alpha, weight, T, sum_delta, d);
560
+
561
+ // locate
562
+ sigmas++;
563
+ rgbs += 3;
564
+ deltas += 2;
565
+
566
+ step++;
567
+ }
568
+
569
+ //printf("[n=%d] rgb=(%f, %f, %f), d=%f\n", n, r, g, b, d);
570
+
571
+ // write
572
+ weights_sum[index] = ws; // weights_sum
573
+ depth[index] = d;
574
+ image[index * 3] = r;
575
+ image[index * 3 + 1] = g;
576
+ image[index * 3 + 2] = b;
577
+ }
578
+
579
+
580
+ void composite_rays_train_forward(const at::Tensor sigmas, const at::Tensor rgbs, const at::Tensor deltas, const at::Tensor rays, const uint32_t M, const uint32_t N, const float T_thresh, at::Tensor weights_sum, at::Tensor depth, at::Tensor image) {
581
+
582
+ static constexpr uint32_t N_THREAD = 128;
583
+
584
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
585
+ sigmas.scalar_type(), "composite_rays_train_forward", ([&] {
586
+ kernel_composite_rays_train_forward<<<div_round_up(N, N_THREAD), N_THREAD>>>(sigmas.data_ptr<scalar_t>(), rgbs.data_ptr<scalar_t>(), deltas.data_ptr<scalar_t>(), rays.data_ptr<int>(), M, N, T_thresh, weights_sum.data_ptr<scalar_t>(), depth.data_ptr<scalar_t>(), image.data_ptr<scalar_t>());
587
+ }));
588
+ }
589
+
590
+
591
+ // grad_weights_sum: [N,]
592
+ // grad_image: [N, 3]
593
+ // sigmas: [M]
594
+ // rgbs: [M, 3]
595
+ // deltas: [M, 2]
596
+ // rays: [N, 3], idx, offset, num_steps
597
+ // weights_sum: [N,], weights_sum here
598
+ // image: [N, 3]
599
+ // grad_sigmas: [M]
600
+ // grad_rgbs: [M, 3]
601
+ template <typename scalar_t>
602
+ __global__ void kernel_composite_rays_train_backward(
603
+ const scalar_t * __restrict__ grad_weights_sum,
604
+ const scalar_t * __restrict__ grad_image,
605
+ const scalar_t * __restrict__ sigmas,
606
+ const scalar_t * __restrict__ rgbs,
607
+ const scalar_t * __restrict__ deltas,
608
+ const int * __restrict__ rays,
609
+ const scalar_t * __restrict__ weights_sum,
610
+ const scalar_t * __restrict__ image,
611
+ const uint32_t M, const uint32_t N, const float T_thresh,
612
+ scalar_t * grad_sigmas,
613
+ scalar_t * grad_rgbs
614
+ ) {
615
+ // parallel per ray
616
+ const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
617
+ if (n >= N) return;
618
+
619
+ // locate
620
+ uint32_t index = rays[n * 3];
621
+ uint32_t offset = rays[n * 3 + 1];
622
+ uint32_t num_steps = rays[n * 3 + 2];
623
+
624
+ if (num_steps == 0 || offset + num_steps > M) return;
625
+
626
+ grad_weights_sum += index;
627
+ grad_image += index * 3;
628
+ weights_sum += index;
629
+ image += index * 3;
630
+ sigmas += offset;
631
+ rgbs += offset * 3;
632
+ deltas += offset * 2;
633
+ grad_sigmas += offset;
634
+ grad_rgbs += offset * 3;
635
+
636
+ // accumulate
637
+ uint32_t step = 0;
638
+
639
+ scalar_t T = 1.0f;
640
+ const scalar_t r_final = image[0], g_final = image[1], b_final = image[2], ws_final = weights_sum[0];
641
+ scalar_t r = 0, g = 0, b = 0, ws = 0;
642
+
643
+ while (step < num_steps) {
644
+
645
+ const scalar_t alpha = 1.0f - __expf(- sigmas[0] * deltas[0]);
646
+ const scalar_t weight = alpha * T;
647
+
648
+ r += weight * rgbs[0];
649
+ g += weight * rgbs[1];
650
+ b += weight * rgbs[2];
651
+ ws += weight;
652
+
653
+ T *= 1.0f - alpha;
654
+
655
+ // check https://note.kiui.moe/others/nerf_gradient/ for the gradient calculation.
656
+ // write grad_rgbs
657
+ grad_rgbs[0] = grad_image[0] * weight;
658
+ grad_rgbs[1] = grad_image[1] * weight;
659
+ grad_rgbs[2] = grad_image[2] * weight;
660
+
661
+ // write grad_sigmas
662
+ grad_sigmas[0] = deltas[0] * (
663
+ grad_image[0] * (T * rgbs[0] - (r_final - r)) +
664
+ grad_image[1] * (T * rgbs[1] - (g_final - g)) +
665
+ grad_image[2] * (T * rgbs[2] - (b_final - b)) +
666
+ grad_weights_sum[0] * (1 - ws_final)
667
+ );
668
+
669
+ //printf("[n=%d] num_steps=%d, T=%f, grad_sigmas=%f, r_final=%f, r=%f\n", n, step, T, grad_sigmas[0], r_final, r);
670
+ // minimal remaining transmittance
671
+ if (T < T_thresh) break;
672
+
673
+ // locate
674
+ sigmas++;
675
+ rgbs += 3;
676
+ deltas += 2;
677
+ grad_sigmas++;
678
+ grad_rgbs += 3;
679
+
680
+ step++;
681
+ }
682
+ }
683
+
684
+
685
+ void composite_rays_train_backward(const at::Tensor grad_weights_sum, const at::Tensor grad_image, const at::Tensor sigmas, const at::Tensor rgbs, const at::Tensor deltas, const at::Tensor rays, const at::Tensor weights_sum, const at::Tensor image, const uint32_t M, const uint32_t N, const float T_thresh, at::Tensor grad_sigmas, at::Tensor grad_rgbs) {
686
+
687
+ static constexpr uint32_t N_THREAD = 128;
688
+
689
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
690
+ grad_image.scalar_type(), "composite_rays_train_backward", ([&] {
691
+ kernel_composite_rays_train_backward<<<div_round_up(N, N_THREAD), N_THREAD>>>(grad_weights_sum.data_ptr<scalar_t>(), grad_image.data_ptr<scalar_t>(), sigmas.data_ptr<scalar_t>(), rgbs.data_ptr<scalar_t>(), deltas.data_ptr<scalar_t>(), rays.data_ptr<int>(), weights_sum.data_ptr<scalar_t>(), image.data_ptr<scalar_t>(), M, N, T_thresh, grad_sigmas.data_ptr<scalar_t>(), grad_rgbs.data_ptr<scalar_t>());
692
+ }));
693
+ }
694
+
695
+
696
+ ////////////////////////////////////////////////////
697
+ ///////////// inference /////////////
698
+ ////////////////////////////////////////////////////
699
+
700
+ template <typename scalar_t>
701
+ __global__ void kernel_march_rays(
702
+ const uint32_t n_alive,
703
+ const uint32_t n_step,
704
+ const int* __restrict__ rays_alive,
705
+ const scalar_t* __restrict__ rays_t,
706
+ const scalar_t* __restrict__ rays_o,
707
+ const scalar_t* __restrict__ rays_d,
708
+ const float bound,
709
+ const float dt_gamma, const uint32_t max_steps,
710
+ const uint32_t C, const uint32_t H,
711
+ const uint8_t * __restrict__ grid,
712
+ const scalar_t* __restrict__ nears,
713
+ const scalar_t* __restrict__ fars,
714
+ scalar_t* xyzs, scalar_t* dirs, scalar_t* deltas,
715
+ const scalar_t* __restrict__ noises
716
+ ) {
717
+ const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
718
+ if (n >= n_alive) return;
719
+
720
+ const int index = rays_alive[n]; // ray id
721
+ const float noise = noises[n];
722
+
723
+ // locate
724
+ rays_o += index * 3;
725
+ rays_d += index * 3;
726
+ xyzs += n * n_step * 3;
727
+ dirs += n * n_step * 3;
728
+ deltas += n * n_step * 2;
729
+
730
+ const float ox = rays_o[0], oy = rays_o[1], oz = rays_o[2];
731
+ const float dx = rays_d[0], dy = rays_d[1], dz = rays_d[2];
732
+ const float rdx = 1 / dx, rdy = 1 / dy, rdz = 1 / dz;
733
+ const float rH = 1 / (float)H;
734
+ const float H3 = H * H * H;
735
+
736
+ float t = rays_t[index]; // current ray's t
737
+ const float near = nears[index], far = fars[index];
738
+
739
+ const float dt_min = 2 * SQRT3() / max_steps;
740
+ const float dt_max = 2 * SQRT3() * (1 << (C - 1)) / H;
741
+
742
+ // march for n_step steps, record points
743
+ uint32_t step = 0;
744
+
745
+ // introduce some randomness
746
+ t += clamp(t * dt_gamma, dt_min, dt_max) * noise;
747
+
748
+ float last_t = t;
749
+
750
+ while (t < far && step < n_step) {
751
+ // current point
752
+ const float x = clamp(ox + t * dx, -bound, bound);
753
+ const float y = clamp(oy + t * dy, -bound, bound);
754
+ const float z = clamp(oz + t * dz, -bound, bound);
755
+
756
+ const float dt = clamp(t * dt_gamma, dt_min, dt_max);
757
+
758
+ // get mip level
759
+ const int level = max(mip_from_pos(x, y, z, C), mip_from_dt(dt, H, C)); // range in [0, C - 1]
760
+
761
+ const float mip_bound = fminf(scalbnf(1, level), bound);
762
+ const float mip_rbound = 1 / mip_bound;
763
+
764
+ // convert to nearest grid position
765
+ const int nx = clamp(0.5 * (x * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
766
+ const int ny = clamp(0.5 * (y * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
767
+ const int nz = clamp(0.5 * (z * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
768
+
769
+ const uint32_t index = level * H3 + __morton3D(nx, ny, nz);
770
+ const bool occ = grid[index / 8] & (1 << (index % 8));
771
+
772
+ // if occupied, advance a small step, and write to output
773
+ if (occ) {
774
+ // write step
775
+ xyzs[0] = x;
776
+ xyzs[1] = y;
777
+ xyzs[2] = z;
778
+ dirs[0] = dx;
779
+ dirs[1] = dy;
780
+ dirs[2] = dz;
781
+ // calc dt
782
+ t += dt;
783
+ deltas[0] = dt;
784
+ deltas[1] = t - last_t; // used to calc depth
785
+ last_t = t;
786
+ // step
787
+ xyzs += 3;
788
+ dirs += 3;
789
+ deltas += 2;
790
+ step++;
791
+
792
+ // else, skip a large step (basically skip a voxel grid)
793
+ } else {
794
+ // calc distance to next voxel
795
+ const float tx = (((nx + 0.5f + 0.5f * signf(dx)) * rH * 2 - 1) * mip_bound - x) * rdx;
796
+ const float ty = (((ny + 0.5f + 0.5f * signf(dy)) * rH * 2 - 1) * mip_bound - y) * rdy;
797
+ const float tz = (((nz + 0.5f + 0.5f * signf(dz)) * rH * 2 - 1) * mip_bound - z) * rdz;
798
+ const float tt = t + fmaxf(0.0f, fminf(tx, fminf(ty, tz)));
799
+ // step until next voxel
800
+ do {
801
+ t += clamp(t * dt_gamma, dt_min, dt_max);
802
+ } while (t < tt);
803
+ }
804
+ }
805
+ }
806
+
807
+
808
+ void march_rays(const uint32_t n_alive, const uint32_t n_step, const at::Tensor rays_alive, const at::Tensor rays_t, const at::Tensor rays_o, const at::Tensor rays_d, const float bound, const float dt_gamma, const uint32_t max_steps, const uint32_t C, const uint32_t H, const at::Tensor grid, const at::Tensor near, const at::Tensor far, at::Tensor xyzs, at::Tensor dirs, at::Tensor deltas, at::Tensor noises) {
809
+ static constexpr uint32_t N_THREAD = 128;
810
+
811
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
812
+ rays_o.scalar_type(), "march_rays", ([&] {
813
+ kernel_march_rays<<<div_round_up(n_alive, N_THREAD), N_THREAD>>>(n_alive, n_step, rays_alive.data_ptr<int>(), rays_t.data_ptr<scalar_t>(), rays_o.data_ptr<scalar_t>(), rays_d.data_ptr<scalar_t>(), bound, dt_gamma, max_steps, C, H, grid.data_ptr<uint8_t>(), near.data_ptr<scalar_t>(), far.data_ptr<scalar_t>(), xyzs.data_ptr<scalar_t>(), dirs.data_ptr<scalar_t>(), deltas.data_ptr<scalar_t>(), noises.data_ptr<scalar_t>());
814
+ }));
815
+ }
816
+
817
+
818
+ template <typename scalar_t>
819
+ __global__ void kernel_composite_rays(
820
+ const uint32_t n_alive,
821
+ const uint32_t n_step,
822
+ const float T_thresh,
823
+ int* rays_alive,
824
+ scalar_t* rays_t,
825
+ const scalar_t* __restrict__ sigmas,
826
+ const scalar_t* __restrict__ rgbs,
827
+ const scalar_t* __restrict__ deltas,
828
+ scalar_t* weights_sum, scalar_t* depth, scalar_t* image
829
+ ) {
830
+ const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
831
+ if (n >= n_alive) return;
832
+
833
+ const int index = rays_alive[n]; // ray id
834
+
835
+ // locate
836
+ sigmas += n * n_step;
837
+ rgbs += n * n_step * 3;
838
+ deltas += n * n_step * 2;
839
+
840
+ rays_t += index;
841
+ weights_sum += index;
842
+ depth += index;
843
+ image += index * 3;
844
+
845
+ scalar_t t = rays_t[0]; // current ray's t
846
+
847
+ scalar_t weight_sum = weights_sum[0];
848
+ scalar_t d = depth[0];
849
+ scalar_t r = image[0];
850
+ scalar_t g = image[1];
851
+ scalar_t b = image[2];
852
+
853
+ // accumulate
854
+ uint32_t step = 0;
855
+ while (step < n_step) {
856
+
857
+ // ray is terminated if delta == 0
858
+ if (deltas[0] == 0) break;
859
+
860
+ const scalar_t alpha = 1.0f - __expf(- sigmas[0] * deltas[0]);
861
+
862
+ /*
863
+ T_0 = 1; T_i = \prod_{j=0}^{i-1} (1 - alpha_j)
864
+ w_i = alpha_i * T_i
865
+ -->
866
+ T_i = 1 - \sum_{j=0}^{i-1} w_j
867
+ */
868
+ const scalar_t T = 1 - weight_sum;
869
+ const scalar_t weight = alpha * T;
870
+ weight_sum += weight;
871
+
872
+ t += deltas[1]; // real delta
873
+ d += weight * t;
874
+ r += weight * rgbs[0];
875
+ g += weight * rgbs[1];
876
+ b += weight * rgbs[2];
877
+
878
+ //printf("[n=%d] num_steps=%d, alpha=%f, w=%f, T=%f, sum_dt=%f, d=%f\n", n, step, alpha, weight, T, sum_delta, d);
879
+
880
+ // ray is terminated if T is too small
881
+ // use a larger bound to further accelerate inference
882
+ if (T < T_thresh) break;
883
+
884
+ // locate
885
+ sigmas++;
886
+ rgbs += 3;
887
+ deltas += 2;
888
+ step++;
889
+ }
890
+
891
+ //printf("[n=%d] rgb=(%f, %f, %f), d=%f\n", n, r, g, b, d);
892
+
893
+ // rays_alive = -1 means ray is terminated early.
894
+ if (step < n_step) {
895
+ rays_alive[n] = -1;
896
+ } else {
897
+ rays_t[0] = t;
898
+ }
899
+
900
+ weights_sum[0] = weight_sum; // write back the accumulated alpha
901
+ depth[0] = d;
902
+ image[0] = r;
903
+ image[1] = g;
904
+ image[2] = b;
905
+ }
906
+
907
+
908
+ void composite_rays(const uint32_t n_alive, const uint32_t n_step, const float T_thresh, at::Tensor rays_alive, at::Tensor rays_t, at::Tensor sigmas, at::Tensor rgbs, at::Tensor deltas, at::Tensor weights, at::Tensor depth, at::Tensor image) {
909
+ static constexpr uint32_t N_THREAD = 128;
910
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
911
+ image.scalar_type(), "composite_rays", ([&] {
912
+ kernel_composite_rays<<<div_round_up(n_alive, N_THREAD), N_THREAD>>>(n_alive, n_step, T_thresh, rays_alive.data_ptr<int>(), rays_t.data_ptr<scalar_t>(), sigmas.data_ptr<scalar_t>(), rgbs.data_ptr<scalar_t>(), deltas.data_ptr<scalar_t>(), weights.data_ptr<scalar_t>(), depth.data_ptr<scalar_t>(), image.data_ptr<scalar_t>());
913
+ }));
914
+ }
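# --- Editorial sketch (not part of the commit): a pure-Python mirror of the
# __expand_bits / __morton3D bit tricks above, to make the interleaving easy
# to see (x lands in bit 0, y in bit 1, z in bit 2 of each 3-bit group).
def expand_bits(v: int) -> int:
    # spread the low 10 bits of v three positions apart, exactly as the
    # multiply-and-mask sequence in __expand_bits does
    v = (v * 0x00010001) & 0xFF0000FF
    v = (v * 0x00000101) & 0x0F00F00F
    v = (v * 0x00000011) & 0xC30C30C3
    v = (v * 0x00000005) & 0x49249249
    return v

def morton3D(x: int, y: int, z: int) -> int:
    return expand_bits(x) | (expand_bits(y) << 1) | (expand_bits(z) << 2)

assert morton3D(1, 0, 0) == 0b001
assert morton3D(0, 1, 0) == 0b010
assert morton3D(3, 0, 0) == 0b001001  # x bits land at positions 0, 3, 6, ...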
raymarching/src/raymarching.h ADDED
@@ -0,0 +1,18 @@
1
+ #pragma once
2
+
3
+ #include <stdint.h>
4
+ #include <torch/torch.h>
5
+
6
+
7
+ void near_far_from_aabb(const at::Tensor rays_o, const at::Tensor rays_d, const at::Tensor aabb, const uint32_t N, const float min_near, at::Tensor nears, at::Tensor fars);
8
+ void sph_from_ray(const at::Tensor rays_o, const at::Tensor rays_d, const float radius, const uint32_t N, at::Tensor coords);
9
+ void morton3D(const at::Tensor coords, const uint32_t N, at::Tensor indices);
10
+ void morton3D_invert(const at::Tensor indices, const uint32_t N, at::Tensor coords);
11
+ void packbits(const at::Tensor grid, const uint32_t N, const float density_thresh, at::Tensor bitfield);
12
+
13
+ void march_rays_train(const at::Tensor rays_o, const at::Tensor rays_d, const at::Tensor grid, const float bound, const float dt_gamma, const uint32_t max_steps, const uint32_t N, const uint32_t C, const uint32_t H, const uint32_t M, const at::Tensor nears, const at::Tensor fars, at::Tensor xyzs, at::Tensor dirs, at::Tensor deltas, at::Tensor rays, at::Tensor counter, at::Tensor noises);
14
+ void composite_rays_train_forward(const at::Tensor sigmas, const at::Tensor rgbs, const at::Tensor deltas, const at::Tensor rays, const uint32_t M, const uint32_t N, const float T_thresh, at::Tensor weights_sum, at::Tensor depth, at::Tensor image);
15
+ void composite_rays_train_backward(const at::Tensor grad_weights_sum, const at::Tensor grad_image, const at::Tensor sigmas, const at::Tensor rgbs, const at::Tensor deltas, const at::Tensor rays, const at::Tensor weights_sum, const at::Tensor image, const uint32_t M, const uint32_t N, const float T_thresh, at::Tensor grad_sigmas, at::Tensor grad_rgbs);
16
+
17
+ void march_rays(const uint32_t n_alive, const uint32_t n_step, const at::Tensor rays_alive, const at::Tensor rays_t, const at::Tensor rays_o, const at::Tensor rays_d, const float bound, const float dt_gamma, const uint32_t max_steps, const uint32_t C, const uint32_t H, const at::Tensor grid, const at::Tensor nears, const at::Tensor fars, at::Tensor xyzs, at::Tensor dirs, at::Tensor deltas, at::Tensor noises);
18
+ void composite_rays(const uint32_t n_alive, const uint32_t n_step, const float T_thresh, at::Tensor rays_alive, at::Tensor rays_t, at::Tensor sigmas, at::Tensor rgbs, at::Tensor deltas, at::Tensor weights_sum, at::Tensor depth, at::Tensor image);
requirements.txt ADDED
@@ -0,0 +1,21 @@
1
+ torch-ema
2
+ ninja
3
+ trimesh
4
+ opencv-python
5
+ tensorboardX
6
+ torch
7
+ numpy
8
+ pandas
9
+ tqdm
10
+ matplotlib
11
+ PyMCubes
12
+ rich
13
+ dearpygui
14
+ scipy
15
+ huggingface_hub
16
+ diffusers
17
+ transformers
18
+ xatlas
19
+ scikit-learn
20
+ imageio
21
+ imageio-ffmpeg
scripts/install_ext.sh ADDED
@@ -0,0 +1,4 @@
1
+ pip install ./raymarching
2
+ pip install ./shencoder
3
+ pip install ./freqencoder
4
+ pip install ./gridencoder
scripts/run.sh ADDED
@@ -0,0 +1,5 @@
1
+ #! /bin/bash
2
+
3
+ CUDA_VISIBLE_DEVICES=1 python main.py -O --text "a DSLR photo of cthulhu" --workspace trial_cthulhu
4
+ CUDA_VISIBLE_DEVICES=1 python main.py -O --text "a DSLR photo of a squirrel" --workspace trial_squirrel
5
+ CUDA_VISIBLE_DEVICES=1 python main.py -O --text "a DSLR photo of a cat lying on its side batting at a ball of yarn" --workspace trial_cat_lying
shencoder/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .sphere_harmonics import SHEncoder
shencoder/backend.py ADDED
1
+ import os
2
+ from torch.utils.cpp_extension import load
3
+
4
+ _src_path = os.path.dirname(os.path.abspath(__file__))
5
+
6
+ nvcc_flags = [
7
+ '-O3', '-std=c++14',
8
+ '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
9
+ ]
10
+
11
+ if os.name == "posix":
12
+ c_flags = ['-O3', '-std=c++14']
13
+ elif os.name == "nt":
14
+ c_flags = ['/O2', '/std:c++17']
15
+
16
+ # find cl.exe
17
+ def find_cl_path():
18
+ import glob
19
+ for edition in ["Enterprise", "Professional", "BuildTools", "Community"]:
20
+ paths = sorted(glob.glob(r"C:\\Program Files (x86)\\Microsoft Visual Studio\\*\\%s\\VC\\Tools\\MSVC\\*\\bin\\Hostx64\\x64" % edition), reverse=True)
21
+ if paths:
22
+ return paths[0]
23
+
24
+ # If cl.exe is not on path, try to find it.
25
+ if os.system("where cl.exe >nul 2>nul") != 0:
26
+ cl_path = find_cl_path()
27
+ if cl_path is None:
28
+ raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation")
29
+ os.environ["PATH"] += ";" + cl_path
30
+
31
+ _backend = load(name='_sh_encoder',
32
+ extra_cflags=c_flags,
33
+ extra_cuda_cflags=nvcc_flags,
34
+ sources=[os.path.join(_src_path, 'src', f) for f in [
35
+ 'shencoder.cu',
36
+ 'bindings.cpp',
37
+ ]],
38
+ )
39
+
40
+ __all__ = ['_backend']
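# --- Editorial note (not part of the commit): this JIT-compiled backend is
# the fallback path; the pattern used by sphere_harmonics.py below prefers
# the prebuilt extension and only compiles on demand if it is missing:
#
#   try:
#       import _shencoder as _backend   # prebuilt via setup.py / pip
#   except ImportError:
#       from .backend import _backend   # torch JIT-compiles the sources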
shencoder/setup.py ADDED
@@ -0,0 +1,50 @@
1
+ import os
2
+ from setuptools import setup
3
+ from torch.utils.cpp_extension import BuildExtension, CUDAExtension
4
+
5
+ _src_path = os.path.dirname(os.path.abspath(__file__))
6
+
7
+ nvcc_flags = [
8
+ '-O3', '-std=c++14',
9
+ '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
10
+ ]
11
+
12
+ if os.name == "posix":
13
+ c_flags = ['-O3', '-std=c++14']
14
+ elif os.name == "nt":
15
+ c_flags = ['/O2', '/std:c++17']
16
+
17
+ # find cl.exe
18
+ def find_cl_path():
19
+ import glob
20
+ for edition in ["Enterprise", "Professional", "BuildTools", "Community"]:
21
+ paths = sorted(glob.glob(r"C:\\Program Files (x86)\\Microsoft Visual Studio\\*\\%s\\VC\\Tools\\MSVC\\*\\bin\\Hostx64\\x64" % edition), reverse=True)
22
+ if paths:
23
+ return paths[0]
24
+
25
+ # If cl.exe is not on path, try to find it.
26
+ if os.system("where cl.exe >nul 2>nul") != 0:
27
+ cl_path = find_cl_path()
28
+ if cl_path is None:
29
+ raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation")
30
+ os.environ["PATH"] += ";" + cl_path
31
+
32
+ setup(
33
+ name='shencoder', # package name, import this to use python API
34
+ ext_modules=[
35
+ CUDAExtension(
36
+ name='_shencoder', # extension name, import this to use CUDA API
37
+ sources=[os.path.join(_src_path, 'src', f) for f in [
38
+ 'shencoder.cu',
39
+ 'bindings.cpp',
40
+ ]],
41
+ extra_compile_args={
42
+ 'cxx': c_flags,
43
+ 'nvcc': nvcc_flags,
44
+ }
45
+ ),
46
+ ],
47
+ cmdclass={
48
+ 'build_ext': BuildExtension,
49
+ }
50
+ )
shencoder/sphere_harmonics.py ADDED
@@ -0,0 +1,87 @@
1
+ import numpy as np
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.autograd import Function
6
+ from torch.autograd.function import once_differentiable
7
+ from torch.cuda.amp import custom_bwd, custom_fwd
8
+
9
+ try:
10
+ import _shencoder as _backend
11
+ except ImportError:
12
+ from .backend import _backend
13
+
14
+ class _sh_encoder(Function):
15
+ @staticmethod
16
+ @custom_fwd(cast_inputs=torch.float32) # force float32 for better precision
17
+ def forward(ctx, inputs, degree, calc_grad_inputs=False):
18
+ # inputs: [B, input_dim], float in [-1, 1]
19
+ # RETURN: [B, F], float
20
+
21
+ inputs = inputs.contiguous()
22
+ B, input_dim = inputs.shape # batch size, coord dim
23
+ output_dim = degree ** 2
24
+
25
+ outputs = torch.empty(B, output_dim, dtype=inputs.dtype, device=inputs.device)
26
+
27
+ if calc_grad_inputs:
28
+ dy_dx = torch.empty(B, input_dim * output_dim, dtype=inputs.dtype, device=inputs.device)
29
+ else:
30
+ dy_dx = None
31
+
32
+ _backend.sh_encode_forward(inputs, outputs, B, input_dim, degree, dy_dx)
33
+
34
+ ctx.save_for_backward(inputs, dy_dx)
35
+ ctx.dims = [B, input_dim, degree]
36
+
37
+ return outputs
38
+
39
+ @staticmethod
40
+ #@once_differentiable
41
+ @custom_bwd
42
+ def backward(ctx, grad):
43
+ # grad: [B, C * C]
44
+
45
+ inputs, dy_dx = ctx.saved_tensors
46
+
47
+ if dy_dx is not None:
48
+ grad = grad.contiguous()
49
+ B, input_dim, degree = ctx.dims
50
+ grad_inputs = torch.zeros_like(inputs)
51
+ _backend.sh_encode_backward(grad, inputs, B, input_dim, degree, dy_dx, grad_inputs)
52
+ return grad_inputs, None, None
53
+ else:
54
+ return None, None, None
55
+
56
+
57
+
58
+ sh_encode = _sh_encoder.apply
59
+
60
+
61
+ class SHEncoder(nn.Module):
62
+ def __init__(self, input_dim=3, degree=4):
63
+ super().__init__()
64
+
65
+ self.input_dim = input_dim # coord dims, must be 3
66
+ self.degree = degree # 1 ~ 8
67
+ self.output_dim = degree ** 2
68
+
69
+ assert self.input_dim == 3, "SH encoder only supports input dim == 3"
70
+ assert self.degree > 0 and self.degree <= 8, "SH encoder only supports degree in [1, 8]"
71
+
72
+ def __repr__(self):
73
+ return f"SHEncoder: input_dim={self.input_dim} degree={self.degree}"
74
+
75
+ def forward(self, inputs, size=1):
76
+ # inputs: [..., input_dim], normalized real world positions in [-size, size]
77
+ # return: [..., degree^2]
78
+
79
+ inputs = inputs / size # [-1, 1]
80
+
81
+ prefix_shape = list(inputs.shape[:-1])
82
+ inputs = inputs.reshape(-1, self.input_dim)
83
+
84
+ outputs = sh_encode(inputs, self.degree, inputs.requires_grad)
85
+ outputs = outputs.reshape(prefix_shape + [self.output_dim])
86
+
87
+ return outputs
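# --- Editorial sketch (not part of the original file): minimal SHEncoder
# usage with illustrative shapes. Inputs must live on a CUDA device (the
# kernel is CUDA-only) and should be unit-norm directions.
def _sh_encoder_example(device='cuda'):
    encoder = SHEncoder(input_dim=3, degree=4)
    dirs = torch.nn.functional.normalize(torch.randn(4096, 3, device=device), dim=-1)
    feats = encoder(dirs)  # [4096, 16], since output_dim == degree ** 2
    return feats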
shencoder/src/bindings.cpp ADDED
1
+ #include <torch/extension.h>
2
+
3
+ #include "shencoder.h"
4
+
5
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
6
+ m.def("sh_encode_forward", &sh_encode_forward, "SH encode forward (CUDA)");
7
+ m.def("sh_encode_backward", &sh_encode_backward, "SH encode backward (CUDA)");
8
+ }
shencoder/src/shencoder.cu ADDED
@@ -0,0 +1,439 @@
1
+ #include <stdint.h>
2
+
3
+ #include <cuda.h>
4
+ #include <cuda_fp16.h>
5
+ #include <cuda_runtime.h>
6
+
7
+ #include <ATen/cuda/CUDAContext.h>
8
+ #include <torch/torch.h>
9
+
10
+ #include <algorithm>
11
+ #include <stdexcept>
12
+
13
+ #include <cstdio>
14
+
15
+
16
+ #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
17
+ #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be a contiguous tensor")
18
+ #define CHECK_IS_INT(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Int, #x " must be an int tensor")
19
+ #define CHECK_IS_FLOATING(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Float || x.scalar_type() == at::ScalarType::Half || x.scalar_type() == at::ScalarType::Double, #x " must be a floating tensor")
20
+
21
+
22
+ template <typename T>
23
+ __host__ __device__ T div_round_up(T val, T divisor) {
24
+ return (val + divisor - 1) / divisor;
25
+ }
26
+
27
+ template <typename scalar_t>
28
+ __global__ void kernel_sh(
29
+ const scalar_t * __restrict__ inputs,
30
+ scalar_t * outputs,
31
+ uint32_t B, uint32_t D, uint32_t C,
32
+ scalar_t * dy_dx
33
+ ) {
34
+ const uint32_t b = threadIdx.x + blockIdx.x * blockDim.x;
35
+ if (b >= B) return;
36
+
37
+ const uint32_t C2 = C * C;
38
+
39
+ // locate
40
+ inputs += b * D;
41
+ outputs += b * C2;
42
+
43
+ scalar_t x = inputs[0], y = inputs[1], z = inputs[2];
44
+
45
+ scalar_t xy=x*y, xz=x*z, yz=y*z, x2=x*x, y2=y*y, z2=z*z, xyz=xy*z;
46
+ scalar_t x4=x2*x2, y4=y2*y2, z4=z2*z2;
47
+ scalar_t x6=x4*x2, y6=y4*y2, z6=z4*z2;
48
+
49
+ auto write_sh = [&]() {
50
+ outputs[0] = 0.28209479177387814f ; // 1/(2*sqrt(pi))
51
+ if (C <= 1) { return; }
52
+ outputs[1] = -0.48860251190291987f*y ; // -sqrt(3)*y/(2*sqrt(pi))
53
+ outputs[2] = 0.48860251190291987f*z ; // sqrt(3)*z/(2*sqrt(pi))
54
+ outputs[3] = -0.48860251190291987f*x ; // -sqrt(3)*x/(2*sqrt(pi))
55
+ if (C <= 2) { return; }
56
+ outputs[4] = 1.0925484305920792f*xy ; // sqrt(15)*xy/(2*sqrt(pi))
57
+ outputs[5] = -1.0925484305920792f*yz ; // -sqrt(15)*yz/(2*sqrt(pi))
58
+ outputs[6] = 0.94617469575755997f*z2 - 0.31539156525251999f ; // sqrt(5)*(3*z2 - 1)/(4*sqrt(pi))
59
+ outputs[7] = -1.0925484305920792f*xz ; // -sqrt(15)*xz/(2*sqrt(pi))
60
+ outputs[8] = 0.54627421529603959f*x2 - 0.54627421529603959f*y2 ; // sqrt(15)*(x2 - y2)/(4*sqrt(pi))
61
+ if (C <= 3) { return; }
62
+ outputs[9] = 0.59004358992664352f*y*(-3.0f*x2 + y2) ; // sqrt(70)*y*(-3*x2 + y2)/(8*sqrt(pi))
63
+ outputs[10] = 2.8906114426405538f*xy*z ; // sqrt(105)*xy*z/(2*sqrt(pi))
64
+ outputs[11] = 0.45704579946446572f*y*(1.0f - 5.0f*z2) ; // sqrt(42)*y*(1 - 5*z2)/(8*sqrt(pi))
65
+ outputs[12] = 0.3731763325901154f*z*(5.0f*z2 - 3.0f) ; // sqrt(7)*z*(5*z2 - 3)/(4*sqrt(pi))
66
+ outputs[13] = 0.45704579946446572f*x*(1.0f - 5.0f*z2) ; // sqrt(42)*x*(1 - 5*z2)/(8*sqrt(pi))
67
+ outputs[14] = 1.4453057213202769f*z*(x2 - y2) ; // sqrt(105)*z*(x2 - y2)/(4*sqrt(pi))
68
+ outputs[15] = 0.59004358992664352f*x*(-x2 + 3.0f*y2) ; // sqrt(70)*x*(-x2 + 3*y2)/(8*sqrt(pi))
69
+ if (C <= 4) { return; }
70
+ outputs[16] = 2.5033429417967046f*xy*(x2 - y2) ; // 3*sqrt(35)*xy*(x2 - y2)/(4*sqrt(pi))
71
+ outputs[17] = 1.7701307697799304f*yz*(-3.0f*x2 + y2) ; // 3*sqrt(70)*yz*(-3*x2 + y2)/(8*sqrt(pi))
72
+ outputs[18] = 0.94617469575756008f*xy*(7.0f*z2 - 1.0f) ; // 3*sqrt(5)*xy*(7*z2 - 1)/(4*sqrt(pi))
73
+ outputs[19] = 0.66904654355728921f*yz*(3.0f - 7.0f*z2) ; // 3*sqrt(10)*yz*(3 - 7*z2)/(8*sqrt(pi))
74
+ outputs[20] = -3.1735664074561294f*z2 + 3.7024941420321507f*z4 + 0.31735664074561293f ; // 3*(-30*z2 + 35*z4 + 3)/(16*sqrt(pi))
75
+ outputs[21] = 0.66904654355728921f*xz*(3.0f - 7.0f*z2) ; // 3*sqrt(10)*xz*(3 - 7*z2)/(8*sqrt(pi))
76
+ outputs[22] = 0.47308734787878004f*(x2 - y2)*(7.0f*z2 - 1.0f) ; // 3*sqrt(5)*(x2 - y2)*(7*z2 - 1)/(8*sqrt(pi))
77
+ outputs[23] = 1.7701307697799304f*xz*(-x2 + 3.0f*y2) ; // 3*sqrt(70)*xz*(-x2 + 3*y2)/(8*sqrt(pi))
78
+ outputs[24] = -3.7550144126950569f*x2*y2 + 0.62583573544917614f*x4 + 0.62583573544917614f*y4 ; // 3*sqrt(35)*(-6*x2*y2 + x4 + y4)/(16*sqrt(pi))
79
+ if (C <= 5) { return; }
80
+ outputs[25] = 0.65638205684017015f*y*(10.0f*x2*y2 - 5.0f*x4 - y4) ; // 3*sqrt(154)*y*(10*x2*y2 - 5*x4 - y4)/(32*sqrt(pi))
81
+ outputs[26] = 8.3026492595241645f*xy*z*(x2 - y2) ; // 3*sqrt(385)*xy*z*(x2 - y2)/(4*sqrt(pi))
82
+ outputs[27] = -0.48923829943525038f*y*(3.0f*x2 - y2)*(9.0f*z2 - 1.0f) ; // -sqrt(770)*y*(3*x2 - y2)*(9*z2 - 1)/(32*sqrt(pi))
83
+ outputs[28] = 4.7935367849733241f*xy*z*(3.0f*z2 - 1.0f) ; // sqrt(1155)*xy*z*(3*z2 - 1)/(4*sqrt(pi))
84
+ outputs[29] = 0.45294665119569694f*y*(14.0f*z2 - 21.0f*z4 - 1.0f) ; // sqrt(165)*y*(14*z2 - 21*z4 - 1)/(16*sqrt(pi))
85
+ outputs[30] = 0.1169503224534236f*z*(-70.0f*z2 + 63.0f*z4 + 15.0f) ; // sqrt(11)*z*(-70*z2 + 63*z4 + 15)/(16*sqrt(pi))
86
+ outputs[31] = 0.45294665119569694f*x*(14.0f*z2 - 21.0f*z4 - 1.0f) ; // sqrt(165)*x*(14*z2 - 21*z4 - 1)/(16*sqrt(pi))
87
+ outputs[32] = 2.3967683924866621f*z*(x2 - y2)*(3.0f*z2 - 1.0f) ; // sqrt(1155)*z*(x2 - y2)*(3*z2 - 1)/(8*sqrt(pi))
88
+ outputs[33] = -0.48923829943525038f*x*(x2 - 3.0f*y2)*(9.0f*z2 - 1.0f) ; // -sqrt(770)*x*(x2 - 3*y2)*(9*z2 - 1)/(32*sqrt(pi))
89
+ outputs[34] = 2.0756623148810411f*z*(-6.0f*x2*y2 + x4 + y4) ; // 3*sqrt(385)*z*(-6*x2*y2 + x4 + y4)/(16*sqrt(pi))
90
+ outputs[35] = 0.65638205684017015f*x*(10.0f*x2*y2 - x4 - 5.0f*y4) ; // 3*sqrt(154)*x*(10*x2*y2 - x4 - 5*y4)/(32*sqrt(pi))
91
+ if (C <= 6) { return; }
92
+ outputs[36] = 1.3663682103838286f*xy*(-10.0f*x2*y2 + 3.0f*x4 + 3.0f*y4) ; // sqrt(6006)*xy*(-10*x2*y2 + 3*x4 + 3*y4)/(32*sqrt(pi))
93
+ outputs[37] = 2.3666191622317521f*yz*(10.0f*x2*y2 - 5.0f*x4 - y4) ; // 3*sqrt(2002)*yz*(10*x2*y2 - 5*x4 - y4)/(32*sqrt(pi))
94
+ outputs[38] = 2.0182596029148963f*xy*(x2 - y2)*(11.0f*z2 - 1.0f) ; // 3*sqrt(91)*xy*(x2 - y2)*(11*z2 - 1)/(8*sqrt(pi))
95
+ outputs[39] = -0.92120525951492349f*yz*(3.0f*x2 - y2)*(11.0f*z2 - 3.0f) ; // -sqrt(2730)*yz*(3*x2 - y2)*(11*z2 - 3)/(32*sqrt(pi))
96
+ outputs[40] = 0.92120525951492349f*xy*(-18.0f*z2 + 33.0f*z4 + 1.0f) ; // sqrt(2730)*xy*(-18*z2 + 33*z4 + 1)/(32*sqrt(pi))
97
+ outputs[41] = 0.58262136251873131f*yz*(30.0f*z2 - 33.0f*z4 - 5.0f) ; // sqrt(273)*yz*(30*z2 - 33*z4 - 5)/(16*sqrt(pi))
98
+ outputs[42] = 6.6747662381009842f*z2 - 20.024298714302954f*z4 + 14.684485723822165f*z6 - 0.31784601133814211f ; // sqrt(13)*(105*z2 - 315*z4 + 231*z6 - 5)/(32*sqrt(pi))
99
+ outputs[43] = 0.58262136251873131f*xz*(30.0f*z2 - 33.0f*z4 - 5.0f) ; // sqrt(273)*xz*(30*z2 - 33*z4 - 5)/(16*sqrt(pi))
100
+ outputs[44] = 0.46060262975746175f*(x2 - y2)*(11.0f*z2*(3.0f*z2 - 1.0f) - 7.0f*z2 + 1.0f) ; // sqrt(2730)*(x2 - y2)*(11*z2*(3*z2 - 1) - 7*z2 + 1)/(64*sqrt(pi))
101
+ outputs[45] = -0.92120525951492349f*xz*(x2 - 3.0f*y2)*(11.0f*z2 - 3.0f) ; // -sqrt(2730)*xz*(x2 - 3*y2)*(11*z2 - 3)/(32*sqrt(pi))
102
+ outputs[46] = 0.50456490072872406f*(11.0f*z2 - 1.0f)*(-6.0f*x2*y2 + x4 + y4) ; // 3*sqrt(91)*(11*z2 - 1)*(-6*x2*y2 + x4 + y4)/(32*sqrt(pi))
103
+ outputs[47] = 2.3666191622317521f*xz*(10.0f*x2*y2 - x4 - 5.0f*y4) ; // 3*sqrt(2002)*xz*(10*x2*y2 - x4 - 5*y4)/(32*sqrt(pi))
104
+ outputs[48] = 10.247761577878714f*x2*y4 - 10.247761577878714f*x4*y2 + 0.6831841051919143f*x6 - 0.6831841051919143f*y6 ; // sqrt(6006)*(15*x2*y4 - 15*x4*y2 + x6 - y6)/(64*sqrt(pi))
105
+ if (C <= 7) { return; }
106
+ outputs[49] = 0.70716273252459627f*y*(-21.0f*x2*y4 + 35.0f*x4*y2 - 7.0f*x6 + y6) ; // 3*sqrt(715)*y*(-21*x2*y4 + 35*x4*y2 - 7*x6 + y6)/(64*sqrt(pi))
107
+ outputs[50] = 5.2919213236038001f*xy*z*(-10.0f*x2*y2 + 3.0f*x4 + 3.0f*y4) ; // 3*sqrt(10010)*xy*z*(-10*x2*y2 + 3*x4 + 3*y4)/(32*sqrt(pi))
108
+ outputs[51] = -0.51891557872026028f*y*(13.0f*z2 - 1.0f)*(-10.0f*x2*y2 + 5.0f*x4 + y4) ; // -3*sqrt(385)*y*(13*z2 - 1)*(-10*x2*y2 + 5*x4 + y4)/(64*sqrt(pi))
109
+ outputs[52] = 4.1513246297620823f*xy*z*(x2 - y2)*(13.0f*z2 - 3.0f) ; // 3*sqrt(385)*xy*z*(x2 - y2)*(13*z2 - 3)/(8*sqrt(pi))
110
+ outputs[53] = -0.15645893386229404f*y*(3.0f*x2 - y2)*(13.0f*z2*(11.0f*z2 - 3.0f) - 27.0f*z2 + 3.0f) ; // -3*sqrt(35)*y*(3*x2 - y2)*(13*z2*(11*z2 - 3) - 27*z2 + 3)/(64*sqrt(pi))
111
+ outputs[54] = 0.44253269244498261f*xy*z*(-110.0f*z2 + 143.0f*z4 + 15.0f) ; // 3*sqrt(70)*xy*z*(-110*z2 + 143*z4 + 15)/(32*sqrt(pi))
112
+ outputs[55] = 0.090331607582517306f*y*(-135.0f*z2 + 495.0f*z4 - 429.0f*z6 + 5.0f) ; // sqrt(105)*y*(-135*z2 + 495*z4 - 429*z6 + 5)/(64*sqrt(pi))
113
+ outputs[56] = 0.068284276912004949f*z*(315.0f*z2 - 693.0f*z4 + 429.0f*z6 - 35.0f) ; // sqrt(15)*z*(315*z2 - 693*z4 + 429*z6 - 35)/(32*sqrt(pi))
114
+ outputs[57] = 0.090331607582517306f*x*(-135.0f*z2 + 495.0f*z4 - 429.0f*z6 + 5.0f) ; // sqrt(105)*x*(-135*z2 + 495*z4 - 429*z6 + 5)/(64*sqrt(pi))
115
+ outputs[58] = 0.07375544874083044f*z*(x2 - y2)*(143.0f*z2*(3.0f*z2 - 1.0f) - 187.0f*z2 + 45.0f) ; // sqrt(70)*z*(x2 - y2)*(143*z2*(3*z2 - 1) - 187*z2 + 45)/(64*sqrt(pi))
116
+ outputs[59] = -0.15645893386229404f*x*(x2 - 3.0f*y2)*(13.0f*z2*(11.0f*z2 - 3.0f) - 27.0f*z2 + 3.0f) ; // -3*sqrt(35)*x*(x2 - 3*y2)*(13*z2*(11*z2 - 3) - 27*z2 + 3)/(64*sqrt(pi))
117
+ outputs[60] = 1.0378311574405206f*z*(13.0f*z2 - 3.0f)*(-6.0f*x2*y2 + x4 + y4) ; // 3*sqrt(385)*z*(13*z2 - 3)*(-6*x2*y2 + x4 + y4)/(32*sqrt(pi))
118
+ outputs[61] = -0.51891557872026028f*x*(13.0f*z2 - 1.0f)*(-10.0f*x2*y2 + x4 + 5.0f*y4) ; // -3*sqrt(385)*x*(13*z2 - 1)*(-10*x2*y2 + x4 + 5*y4)/(64*sqrt(pi))
119
+ outputs[62] = 2.6459606618019f*z*(15.0f*x2*y4 - 15.0f*x4*y2 + x6 - y6) ; // 3*sqrt(10010)*z*(15*x2*y4 - 15*x4*y2 + x6 - y6)/(64*sqrt(pi))
120
+ outputs[63] = 0.70716273252459627f*x*(-35.0f*x2*y4 + 21.0f*x4*y2 - x6 + 7.0f*y6) ; // 3*sqrt(715)*x*(-35*x2*y4 + 21*x4*y2 - x6 + 7*y6)/(64*sqrt(pi))
121
+ };
122
+
123
+ write_sh();
124
+
125
+ if (dy_dx) {
126
+ scalar_t *dx = dy_dx + b * D * C2;
127
+ scalar_t *dy = dx + C2;
128
+ scalar_t *dz = dy + C2;
129
+
130
+ auto write_sh_dx = [&]() {
131
+ dx[0] = 0.0f ; // 0
132
+ if (C <= 1) { return; }
133
+ dx[1] = 0.0f ; // 0
134
+ dx[2] = 0.0f ; // 0
135
+ dx[3] = -0.48860251190291992f ; // -sqrt(3)/(2*sqrt(pi))
136
+ if (C <= 2) { return; }
137
+ dx[4] = 1.0925484305920792f*y ; // sqrt(15)*y/(2*sqrt(pi))
138
+ dx[5] = 0.0f ; // 0
139
+ dx[6] = 0.0f ; // 0
140
+ dx[7] = -1.0925484305920792f*z ; // -sqrt(15)*z/(2*sqrt(pi))
141
+ dx[8] = 1.0925484305920792f*x ; // sqrt(15)*x/(2*sqrt(pi))
142
+ if (C <= 3) { return; }
143
+ dx[9] = -3.5402615395598609f*xy ; // -3*sqrt(70)*xy/(4*sqrt(pi))
144
+ dx[10] = 2.8906114426405538f*yz ; // sqrt(105)*yz/(2*sqrt(pi))
145
+ dx[11] = 0.0f ; // 0
146
+ dx[12] = 0.0f ; // 0
147
+ dx[13] = 0.45704579946446572f - 2.2852289973223288f*z2 ; // sqrt(42)*(1 - 5*z2)/(8*sqrt(pi))
148
+ dx[14] = 2.8906114426405538f*xz ; // sqrt(105)*xz/(2*sqrt(pi))
149
+ dx[15] = -1.7701307697799304f*x2 + 1.7701307697799304f*y2 ; // 3*sqrt(70)*(-x2 + y2)/(8*sqrt(pi))
150
+ if (C <= 4) { return; }
151
+ dx[16] = 2.5033429417967046f*y*(3.0f*x2 - y2) ; // 3*sqrt(35)*y*(3*x2 - y2)/(4*sqrt(pi))
152
+ dx[17] = -10.620784618679583f*xy*z ; // -9*sqrt(70)*xy*z/(4*sqrt(pi))
153
+ dx[18] = 0.94617469575756008f*y*(7.0f*z2 - 1.0f) ; // 3*sqrt(5)*y*(7*z2 - 1)/(4*sqrt(pi))
154
+ dx[19] = 0.0f ; // 0
155
+ dx[20] = 0.0f ; // 0
156
+ dx[21] = 0.66904654355728921f*z*(3.0f - 7.0f*z2) ; // 3*sqrt(10)*z*(3 - 7*z2)/(8*sqrt(pi))
157
+ dx[22] = 0.94617469575756008f*x*(7.0f*z2 - 1.0f) ; // 3*sqrt(5)*x*(7*z2 - 1)/(4*sqrt(pi))
158
+ dx[23] = 5.3103923093397913f*z*(-x2 + y2) ; // 9*sqrt(70)*z*(-x2 + y2)/(8*sqrt(pi))
159
+ dx[24] = 2.5033429417967046f*x*(x2 - 3.0f*y2) ; // 3*sqrt(35)*x*(x2 - 3*y2)/(4*sqrt(pi))
160
+ if (C <= 5) { return; }
161
+ dx[25] = 13.127641136803401f*xy*(-x2 + y2) ; // 15*sqrt(154)*xy*(-x2 + y2)/(8*sqrt(pi))
162
+ dx[26] = 8.3026492595241645f*yz*(3.0f*x2 - y2) ; // 3*sqrt(385)*yz*(3*x2 - y2)/(4*sqrt(pi))
163
+ dx[27] = 2.9354297966115022f*xy*(1.0f - 9.0f*z2) ; // 3*sqrt(770)*xy*(1 - 9*z2)/(16*sqrt(pi))
164
+ dx[28] = 4.7935367849733241f*yz*(3.0f*z2 - 1.0f) ; // sqrt(1155)*yz*(3*z2 - 1)/(4*sqrt(pi))
165
+ dx[29] = 0.0f ; // 0
166
+ dx[30] = 0.0f ; // 0
167
+ dx[31] = 6.3412531167397574f*z2 - 9.5118796751096362f*z4 - 0.45294665119569694f ; // sqrt(165)*(14*z2 - 21*z4 - 1)/(16*sqrt(pi))
168
+ dx[32] = 4.7935367849733241f*xz*(3.0f*z2 - 1.0f) ; // sqrt(1155)*xz*(3*z2 - 1)/(4*sqrt(pi))
169
+ dx[33] = -13.209434084751759f*x2*z2 + 1.4677148983057511f*x2 + 13.209434084751759f*y2*z2 - 1.4677148983057511f*y2 ; // 3*sqrt(770)*(-9*x2*z2 + x2 + 9*y2*z2 - y2)/(32*sqrt(pi))
170
+ dx[34] = 8.3026492595241645f*xz*(x2 - 3.0f*y2) ; // 3*sqrt(385)*xz*(x2 - 3*y2)/(4*sqrt(pi))
171
+ dx[35] = 19.6914617052051f*x2*y2 - 3.2819102842008503f*x4 - 3.2819102842008503f*y4 ; // 15*sqrt(154)*(6*x2*y2 - x4 - y4)/(32*sqrt(pi))
172
+ if (C <= 6) { return; }
173
+ dx[36] = 4.0991046311514854f*y*(-10.0f*x2*y2 + 5.0f*x4 + y4) ; // 3*sqrt(6006)*y*(-10*x2*y2 + 5*x4 + y4)/(32*sqrt(pi))
174
+ dx[37] = 47.332383244635047f*xy*z*(-x2 + y2) ; // 15*sqrt(2002)*xy*z*(-x2 + y2)/(8*sqrt(pi))
175
+ dx[38] = 2.0182596029148963f*y*(3.0f*x2 - y2)*(11.0f*z2 - 1.0f) ; // 3*sqrt(91)*y*(3*x2 - y2)*(11*z2 - 1)/(8*sqrt(pi))
176
+ dx[39] = 5.5272315570895412f*xy*z*(3.0f - 11.0f*z2) ; // 3*sqrt(2730)*xy*z*(3 - 11*z2)/(16*sqrt(pi))
177
+ dx[40] = 0.92120525951492349f*y*(-18.0f*z2 + 33.0f*z4 + 1.0f) ; // sqrt(2730)*y*(-18*z2 + 33*z4 + 1)/(32*sqrt(pi))
178
+ dx[41] = 0.0f ; // 0
179
+ dx[42] = 0.0f ; // 0
180
+ dx[43] = 0.58262136251873131f*z*(30.0f*z2 - 33.0f*z4 - 5.0f) ; // sqrt(273)*z*(30*z2 - 33*z4 - 5)/(16*sqrt(pi))
181
+ dx[44] = 0.92120525951492349f*x*(-18.0f*z2 + 33.0f*z4 + 1.0f) ; // sqrt(2730)*x*(-18*z2 + 33*z4 + 1)/(32*sqrt(pi))
182
+ dx[45] = -2.7636157785447706f*z*(x2 - y2)*(11.0f*z2 - 3.0f) ; // -3*sqrt(2730)*z*(x2 - y2)*(11*z2 - 3)/(32*sqrt(pi))
183
+ dx[46] = 2.0182596029148963f*x*(x2 - 3.0f*y2)*(11.0f*z2 - 1.0f) ; // 3*sqrt(91)*x*(x2 - 3*y2)*(11*z2 - 1)/(8*sqrt(pi))
184
+ dx[47] = 11.833095811158762f*z*(6.0f*x2*y2 - x4 - y4) ; // 15*sqrt(2002)*z*(6*x2*y2 - x4 - y4)/(32*sqrt(pi))
185
+ dx[48] = 4.0991046311514854f*x*(-10.0f*x2*y2 + x4 + 5.0f*y4) ; // 3*sqrt(6006)*x*(-10*x2*y2 + x4 + 5*y4)/(32*sqrt(pi))
186
+ if (C <= 7) { return; }
187
+ dx[49] = 9.9002782553443485f*xy*(10.0f*x2*y2 - 3.0f*x4 - 3.0f*y4) ; // 21*sqrt(715)*xy*(10*x2*y2 - 3*x4 - 3*y4)/(32*sqrt(pi))
188
+ dx[50] = 15.875763970811402f*yz*(-10.0f*x2*y2 + 5.0f*x4 + y4) ; // 9*sqrt(10010)*yz*(-10*x2*y2 + 5*x4 + y4)/(32*sqrt(pi))
189
+ dx[51] = -10.378311574405206f*xy*(x2 - y2)*(13.0f*z2 - 1.0f) ; // -15*sqrt(385)*xy*(x2 - y2)*(13*z2 - 1)/(16*sqrt(pi))
190
+ dx[52] = 4.1513246297620823f*yz*(3.0f*x2 - y2)*(13.0f*z2 - 3.0f) ; // 3*sqrt(385)*yz*(3*x2 - y2)*(13*z2 - 3)/(8*sqrt(pi))
191
+ dx[53] = 0.93875360317376422f*xy*(66.0f*z2 - 143.0f*z4 - 3.0f) ; // 9*sqrt(35)*xy*(66*z2 - 143*z4 - 3)/(32*sqrt(pi))
192
+ dx[54] = 0.44253269244498261f*yz*(-110.0f*z2 + 143.0f*z4 + 15.0f) ; // 3*sqrt(70)*yz*(-110*z2 + 143*z4 + 15)/(32*sqrt(pi))
193
+ dx[55] = 0.0f ; // 0
194
+ dx[56] = 0.0f ; // 0
195
+ dx[57] = -12.194767023639836f*z2 + 44.714145753346067f*z4 - 38.752259652899923f*z6 + 0.45165803791258652f ; // sqrt(105)*(-135*z2 + 495*z4 - 429*z6 + 5)/(64*sqrt(pi))
196
+ dx[58] = 0.44253269244498261f*xz*(-110.0f*z2 + 143.0f*z4 + 15.0f) ; // 3*sqrt(70)*xz*(-110*z2 + 143*z4 + 15)/(32*sqrt(pi))
197
+ dx[59] = 30.97886890473422f*x2*z2 - 67.120882626924143f*x2*z4 - 1.4081304047606462f*x2 - 30.97886890473422f*y2*z2 + 67.120882626924143f*y2*z4 + 1.4081304047606462f*y2 ; // 9*sqrt(35)*(66*x2*z2 - 143*x2*z4 - 3*x2 - 66*y2*z2 + 143*y2*z4 + 3*y2)/(64*sqrt(pi))
198
+ dx[60] = 4.1513246297620823f*xz*(x2 - 3.0f*y2)*(13.0f*z2 - 3.0f) ; // 3*sqrt(385)*xz*(x2 - 3*y2)*(13*z2 - 3)/(8*sqrt(pi))
199
+ dx[61] = -0.51891557872026028f*(13.0f*z2 - 1.0f)*(-10.0f*x2*y2 + 4.0f*x2*(x2 - 5.0f*y2) + x4 + 5.0f*y4) ; // -3*sqrt(385)*(13*z2 - 1)*(-10*x2*y2 + 4*x2*(x2 - 5*y2) + x4 + 5*y4)/(64*sqrt(pi))
200
+ dx[62] = 15.875763970811402f*xz*(-10.0f*x2*y2 + x4 + 5.0f*y4) ; // 9*sqrt(10010)*xz*(-10*x2*y2 + x4 + 5*y4)/(32*sqrt(pi))
201
+ dx[63] = -74.252086915082614f*x2*y4 + 74.252086915082614f*x4*y2 - 4.9501391276721742f*x6 + 4.9501391276721742f*y6 ; // 21*sqrt(715)*(-15*x2*y4 + 15*x4*y2 - x6 + y6)/(64*sqrt(pi))
202
+ };
203
+
204
+ auto write_sh_dy = [&]() {
205
+ dy[0] = 0.0f ; // 0
206
+ if (C <= 1) { return; }
207
+ dy[1] = -0.48860251190291992f ; // -sqrt(3)/(2*sqrt(pi))
208
+ dy[2] = 0.0f ; // 0
209
+ dy[3] = 0.0f ; // 0
210
+ if (C <= 2) { return; }
211
+ dy[4] = 1.0925484305920792f*x ; // sqrt(15)*x/(2*sqrt(pi))
212
+ dy[5] = -1.0925484305920792f*z ; // -sqrt(15)*z/(2*sqrt(pi))
213
+ dy[6] = 0.0f ; // 0
214
+ dy[7] = 0.0f ; // 0
215
+ dy[8] = -1.0925484305920792f*y ; // -sqrt(15)*y/(2*sqrt(pi))
216
+ if (C <= 3) { return; }
217
+ dy[9] = -1.7701307697799304f*x2 + 1.7701307697799304f*y2 ; // 3*sqrt(70)*(-x2 + y2)/(8*sqrt(pi))
218
+ dy[10] = 2.8906114426405538f*xz ; // sqrt(105)*xz/(2*sqrt(pi))
219
+ dy[11] = 0.45704579946446572f - 2.2852289973223288f*z2 ; // sqrt(42)*(1 - 5*z2)/(8*sqrt(pi))
220
+ dy[12] = 0.0f ; // 0
221
+ dy[13] = 0.0f ; // 0
222
+ dy[14] = -2.8906114426405538f*yz ; // -sqrt(105)*yz/(2*sqrt(pi))
223
+ dy[15] = 3.5402615395598609f*xy ; // 3*sqrt(70)*xy/(4*sqrt(pi))
224
+ if (C <= 4) { return; }
225
+ dy[16] = 2.5033429417967046f*x*(x2 - 3.0f*y2) ; // 3*sqrt(35)*x*(x2 - 3*y2)/(4*sqrt(pi))
226
+ dy[17] = 5.3103923093397913f*z*(-x2 + y2) ; // 9*sqrt(70)*z*(-x2 + y2)/(8*sqrt(pi))
227
+ dy[18] = 0.94617469575756008f*x*(7.0f*z2 - 1.0f) ; // 3*sqrt(5)*x*(7*z2 - 1)/(4*sqrt(pi))
228
+ dy[19] = 0.66904654355728921f*z*(3.0f - 7.0f*z2) ; // 3*sqrt(10)*z*(3 - 7*z2)/(8*sqrt(pi))
229
+ dy[20] = 0.0f ; // 0
230
+ dy[21] = 0.0f ; // 0
231
+ dy[22] = 0.94617469575756008f*y*(1.0f - 7.0f*z2) ; // 3*sqrt(5)*y*(1 - 7*z2)/(4*sqrt(pi))
232
+ dy[23] = 10.620784618679583f*xy*z ; // 9*sqrt(70)*xy*z/(4*sqrt(pi))
233
+ dy[24] = 2.5033429417967046f*y*(-3.0f*x2 + y2) ; // 3*sqrt(35)*y*(-3*x2 + y2)/(4*sqrt(pi))
234
+ if (C <= 5) { return; }
235
+ dy[25] = 19.6914617052051f*x2*y2 - 3.2819102842008503f*x4 - 3.2819102842008503f*y4 ; // 15*sqrt(154)*(6*x2*y2 - x4 - y4)/(32*sqrt(pi))
236
+ dy[26] = 8.3026492595241645f*xz*(x2 - 3.0f*y2) ; // 3*sqrt(385)*xz*(x2 - 3*y2)/(4*sqrt(pi))
237
+ dy[27] = -1.4677148983057511f*(x2 - y2)*(9.0f*z2 - 1.0f) ; // -3*sqrt(770)*(x2 - y2)*(9*z2 - 1)/(32*sqrt(pi))
238
+ dy[28] = 4.7935367849733241f*xz*(3.0f*z2 - 1.0f) ; // sqrt(1155)*xz*(3*z2 - 1)/(4*sqrt(pi))
239
+ dy[29] = 6.3412531167397574f*z2 - 9.5118796751096362f*z4 - 0.45294665119569694f ; // sqrt(165)*(14*z2 - 21*z4 - 1)/(16*sqrt(pi))
240
+ dy[30] = 0.0f ; // 0
241
+ dy[31] = 0.0f ; // 0
242
+ dy[32] = 4.7935367849733241f*yz*(1.0f - 3.0f*z2) ; // sqrt(1155)*yz*(1 - 3*z2)/(4*sqrt(pi))
243
+ dy[33] = 2.9354297966115022f*xy*(9.0f*z2 - 1.0f) ; // 3*sqrt(770)*xy*(9*z2 - 1)/(16*sqrt(pi))
244
+ dy[34] = 8.3026492595241645f*yz*(-3.0f*x2 + y2) ; // 3*sqrt(385)*yz*(-3*x2 + y2)/(4*sqrt(pi))
245
+ dy[35] = 13.127641136803401f*xy*(x2 - y2) ; // 15*sqrt(154)*xy*(x2 - y2)/(8*sqrt(pi))
246
+ if (C <= 6) { return; }
247
+ dy[36] = 4.0991046311514854f*x*(-10.0f*x2*y2 + x4 + 5.0f*y4) ; // 3*sqrt(6006)*x*(-10*x2*y2 + x4 + 5*y4)/(32*sqrt(pi))
248
+ dy[37] = 11.833095811158762f*z*(6.0f*x2*y2 - x4 - y4) ; // 15*sqrt(2002)*z*(6*x2*y2 - x4 - y4)/(32*sqrt(pi))
249
+ dy[38] = 2.0182596029148963f*x*(x2 - 3.0f*y2)*(11.0f*z2 - 1.0f) ; // 3*sqrt(91)*x*(x2 - 3*y2)*(11*z2 - 1)/(8*sqrt(pi))
250
+ dy[39] = -2.7636157785447706f*z*(x2 - y2)*(11.0f*z2 - 3.0f) ; // -3*sqrt(2730)*z*(x2 - y2)*(11*z2 - 3)/(32*sqrt(pi))
251
+ dy[40] = 0.92120525951492349f*x*(-18.0f*z2 + 33.0f*z4 + 1.0f) ; // sqrt(2730)*x*(-18*z2 + 33*z4 + 1)/(32*sqrt(pi))
252
+ dy[41] = 0.58262136251873131f*z*(30.0f*z2 - 33.0f*z4 - 5.0f) ; // sqrt(273)*z*(30*z2 - 33*z4 - 5)/(16*sqrt(pi))
253
+ dy[42] = 0.0f ; // 0
254
+ dy[43] = 0.0f ; // 0
255
+ dy[44] = 0.92120525951492349f*y*(18.0f*z2 - 33.0f*z4 - 1.0f) ; // sqrt(2730)*y*(18*z2 - 33*z4 - 1)/(32*sqrt(pi))
256
+ dy[45] = 5.5272315570895412f*xy*z*(11.0f*z2 - 3.0f) ; // 3*sqrt(2730)*xy*z*(11*z2 - 3)/(16*sqrt(pi))
257
+ dy[46] = -2.0182596029148963f*y*(3.0f*x2 - y2)*(11.0f*z2 - 1.0f) ; // -3*sqrt(91)*y*(3*x2 - y2)*(11*z2 - 1)/(8*sqrt(pi))
258
+ dy[47] = 47.332383244635047f*xy*z*(x2 - y2) ; // 15*sqrt(2002)*xy*z*(x2 - y2)/(8*sqrt(pi))
259
+ dy[48] = 4.0991046311514854f*y*(10.0f*x2*y2 - 5.0f*x4 - y4) ; // 3*sqrt(6006)*y*(10*x2*y2 - 5*x4 - y4)/(32*sqrt(pi))
260
+ if (C <= 7) { return; }
261
+ dy[49] = -74.252086915082614f*x2*y4 + 74.252086915082614f*x4*y2 - 4.9501391276721742f*x6 + 4.9501391276721742f*y6 ; // 21*sqrt(715)*(-15*x2*y4 + 15*x4*y2 - x6 + y6)/(64*sqrt(pi))
262
+ dy[50] = 15.875763970811402f*xz*(-10.0f*x2*y2 + x4 + 5.0f*y4) ; // 9*sqrt(10010)*xz*(-10*x2*y2 + x4 + 5*y4)/(32*sqrt(pi))
263
+ dy[51] = 0.51891557872026028f*(13.0f*z2 - 1.0f)*(10.0f*x2*y2 - 5.0f*x4 + 4.0f*y2*(5.0f*x2 - y2) - y4) ; // 3*sqrt(385)*(13*z2 - 1)*(10*x2*y2 - 5*x4 + 4*y2*(5*x2 - y2) - y4)/(64*sqrt(pi))
264
+ dy[52] = 4.1513246297620823f*xz*(x2 - 3.0f*y2)*(13.0f*z2 - 3.0f) ; // 3*sqrt(385)*xz*(x2 - 3*y2)*(13*z2 - 3)/(8*sqrt(pi))
265
+ dy[53] = -0.46937680158688211f*(x2 - y2)*(13.0f*z2*(11.0f*z2 - 3.0f) - 27.0f*z2 + 3.0f) ; // -9*sqrt(35)*(x2 - y2)*(13*z2*(11*z2 - 3) - 27*z2 + 3)/(64*sqrt(pi))
266
+ dy[54] = 0.44253269244498261f*xz*(-110.0f*z2 + 143.0f*z4 + 15.0f) ; // 3*sqrt(70)*xz*(-110*z2 + 143*z4 + 15)/(32*sqrt(pi))
267
+ dy[55] = -12.194767023639836f*z2 + 44.714145753346067f*z4 - 38.752259652899923f*z6 + 0.45165803791258652f ; // sqrt(105)*(-135*z2 + 495*z4 - 429*z6 + 5)/(64*sqrt(pi))
268
+ dy[56] = 0.0f ; // 0
269
+ dy[57] = 0.0f ; // 0
270
+ dy[58] = 0.44253269244498261f*yz*(110.0f*z2 - 143.0f*z4 - 15.0f) ; // 3*sqrt(70)*yz*(110*z2 - 143*z4 - 15)/(32*sqrt(pi))
271
+ dy[59] = 0.93875360317376422f*xy*(-66.0f*z2 + 143.0f*z4 + 3.0f) ; // 9*sqrt(35)*xy*(-66*z2 + 143*z4 + 3)/(32*sqrt(pi))
272
+ dy[60] = -4.1513246297620823f*yz*(3.0f*x2 - y2)*(13.0f*z2 - 3.0f) ; // -3*sqrt(385)*yz*(3*x2 - y2)*(13*z2 - 3)/(8*sqrt(pi))
273
+ dy[61] = 10.378311574405206f*xy*(x2 - y2)*(13.0f*z2 - 1.0f) ; // 15*sqrt(385)*xy*(x2 - y2)*(13*z2 - 1)/(16*sqrt(pi))
274
+ dy[62] = 15.875763970811402f*yz*(10.0f*x2*y2 - 5.0f*x4 - y4) ; // 9*sqrt(10010)*yz*(10*x2*y2 - 5*x4 - y4)/(32*sqrt(pi))
275
+ dy[63] = 9.9002782553443485f*xy*(-10.0f*x2*y2 + 3.0f*x4 + 3.0f*y4) ; // 21*sqrt(715)*xy*(-10*x2*y2 + 3*x4 + 3*y4)/(32*sqrt(pi))
276
+ };
277
+
278
+ auto write_sh_dz = [&]() {
279
+ dz[0] = 0.0f ; // 0
280
+ if (C <= 1) { return; }
281
+ dz[1] = 0.0f ; // 0
282
+ dz[2] = 0.48860251190291992f ; // sqrt(3)/(2*sqrt(pi))
283
+ dz[3] = 0.0f ; // 0
284
+ if (C <= 2) { return; }
285
+ dz[4] = 0.0f ; // 0
286
+ dz[5] = -1.0925484305920792f*y ; // -sqrt(15)*y/(2*sqrt(pi))
287
+ dz[6] = 1.8923493915151202f*z ; // 3*sqrt(5)*z/(2*sqrt(pi))
288
+ dz[7] = -1.0925484305920792f*x ; // -sqrt(15)*x/(2*sqrt(pi))
289
+ dz[8] = 0.0f ; // 0
290
+ if (C <= 3) { return; }
291
+ dz[9] = 0.0f ; // 0
292
+ dz[10] = 2.8906114426405538f*xy ; // sqrt(105)*xy/(2*sqrt(pi))
293
+ dz[11] = -4.5704579946446566f*yz ; // -5*sqrt(42)*yz/(4*sqrt(pi))
294
+ dz[12] = 5.597644988851731f*z2 - 1.1195289977703462f ; // 3*sqrt(7)*(5*z2 - 1)/(4*sqrt(pi))
295
+ dz[13] = -4.5704579946446566f*xz ; // -5*sqrt(42)*xz/(4*sqrt(pi))
296
+ dz[14] = 1.4453057213202769f*x2 - 1.4453057213202769f*y2 ; // sqrt(105)*(x2 - y2)/(4*sqrt(pi))
297
+ dz[15] = 0.0f ; // 0
298
+ if (C <= 4) { return; }
299
+ dz[16] = 0.0f ; // 0
300
+ dz[17] = 1.7701307697799304f*y*(-3.0f*x2 + y2) ; // 3*sqrt(70)*y*(-3*x2 + y2)/(8*sqrt(pi))
301
+ dz[18] = 13.246445740605839f*xy*z ; // 21*sqrt(5)*xy*z/(2*sqrt(pi))
302
+ dz[19] = 2.0071396306718676f*y*(1.0f - 7.0f*z2) ; // 9*sqrt(10)*y*(1 - 7*z2)/(8*sqrt(pi))
303
+ dz[20] = 14.809976568128603f*z2*z - 6.3471328149122579f*z ; // (105*z**3 - 45*z)/(4*sqrt(pi)); z2*z keeps the table pow()-free, consistent with the other entries
304
+ dz[21] = 2.0071396306718676f*x*(1.0f - 7.0f*z2) ; // 9*sqrt(10)*x*(1 - 7*z2)/(8*sqrt(pi))
305
+ dz[22] = 6.6232228703029197f*z*(x2 - y2) ; // 21*sqrt(5)*z*(x2 - y2)/(4*sqrt(pi))
306
+ dz[23] = 1.7701307697799304f*x*(-x2 + 3.0f*y2) ; // 3*sqrt(70)*x*(-x2 + 3*y2)/(8*sqrt(pi))
307
+ dz[24] = 0.0f ; // 0
308
+ if (C <= 5) { return; }
309
+ dz[25] = 0.0f ; // 0
310
+ dz[26] = 8.3026492595241645f*xy*(x2 - y2) ; // 3*sqrt(385)*xy*(x2 - y2)/(4*sqrt(pi))
311
+ dz[27] = 8.8062893898345074f*yz*(-3.0f*x2 + y2) ; // 9*sqrt(770)*yz*(-3*x2 + y2)/(16*sqrt(pi))
312
+ dz[28] = 4.7935367849733241f*xy*(9.0f*z2 - 1.0f) ; // sqrt(1155)*xy*(9*z2 - 1)/(4*sqrt(pi))
313
+ dz[29] = 12.682506233479513f*yz*(1.0f - 3.0f*z2) ; // 7*sqrt(165)*yz*(1 - 3*z2)/(4*sqrt(pi))
314
+ dz[30] = -24.559567715218954f*z2 + 36.839351572828434f*z4 + 1.754254836801354f ; // 15*sqrt(11)*(-14*z2 + 21*z4 + 1)/(16*sqrt(pi))
315
+ dz[31] = 12.682506233479513f*xz*(1.0f - 3.0f*z2) ; // 7*sqrt(165)*xz*(1 - 3*z2)/(4*sqrt(pi))
316
+ dz[32] = 2.3967683924866621f*(x2 - y2)*(9.0f*z2 - 1.0f) ; // sqrt(1155)*(x2 - y2)*(9*z2 - 1)/(8*sqrt(pi))
317
+ dz[33] = 8.8062893898345074f*xz*(-x2 + 3.0f*y2) ; // 9*sqrt(770)*xz*(-x2 + 3*y2)/(16*sqrt(pi))
318
+ dz[34] = -12.453973889286246f*x2*y2 + 2.0756623148810411f*x4 + 2.0756623148810411f*y4 ; // 3*sqrt(385)*(-6*x2*y2 + x4 + y4)/(16*sqrt(pi))
319
+ dz[35] = 0.0f ; // 0
320
+ if (C <= 6) { return; }
321
+ dz[36] = 0.0f ; // 0
322
+ dz[37] = 2.3666191622317521f*y*(10.0f*x2*y2 - 5.0f*x4 - y4) ; // 3*sqrt(2002)*y*(10*x2*y2 - 5*x4 - y4)/(32*sqrt(pi))
323
+ dz[38] = 44.401711264127719f*xy*z*(x2 - y2) ; // 33*sqrt(91)*xy*z*(x2 - y2)/(4*sqrt(pi))
324
+ dz[39] = -2.7636157785447706f*y*(3.0f*x2 - y2)*(11.0f*z2 - 1.0f) ; // -3*sqrt(2730)*y*(3*x2 - y2)*(11*z2 - 1)/(32*sqrt(pi))
325
+ dz[40] = 11.054463114179082f*xy*z*(11.0f*z2 - 3.0f) ; // 3*sqrt(2730)*xy*z*(11*z2 - 3)/(8*sqrt(pi))
326
+ dz[41] = 2.9131068125936568f*y*(18.0f*z2 - 33.0f*z4 - 1.0f) ; // 5*sqrt(273)*y*(18*z2 - 33*z4 - 1)/(16*sqrt(pi))
327
+ dz[42] = 2.6699064952403937f*z*(-30.0f*z2 + 33.0f*z4 + 5.0f) ; // 21*sqrt(13)*z*(-30*z2 + 33*z4 + 5)/(16*sqrt(pi))
328
+ dz[43] = 2.9131068125936568f*x*(18.0f*z2 - 33.0f*z4 - 1.0f) ; // 5*sqrt(273)*x*(18*z2 - 33*z4 - 1)/(16*sqrt(pi))
329
+ dz[44] = 5.5272315570895412f*z*(x2 - y2)*(11.0f*z2 - 3.0f) ; // 3*sqrt(2730)*z*(x2 - y2)*(11*z2 - 3)/(16*sqrt(pi))
330
+ dz[45] = -2.7636157785447706f*x*(x2 - 3.0f*y2)*(11.0f*z2 - 1.0f) ; // -3*sqrt(2730)*x*(x2 - 3*y2)*(11*z2 - 1)/(32*sqrt(pi))
331
+ dz[46] = 11.10042781603193f*z*(-6.0f*x2*y2 + x4 + y4) ; // 33*sqrt(91)*z*(-6*x2*y2 + x4 + y4)/(16*sqrt(pi))
332
+ dz[47] = 2.3666191622317521f*x*(10.0f*x2*y2 - x4 - 5.0f*y4) ; // 3*sqrt(2002)*x*(10*x2*y2 - x4 - 5*y4)/(32*sqrt(pi))
333
+ dz[48] = 0.0f ; // 0
334
+ if (C <= 7) { return; }
335
+ dz[49] = 0.0f ; // 0
336
+ dz[50] = 5.2919213236038001f*xy*(-10.0f*x2*y2 + 3.0f*x4 + 3.0f*y4) ; // 3*sqrt(10010)*xy*(-10*x2*y2 + 3*x4 + 3*y4)/(32*sqrt(pi))
337
+ dz[51] = 13.491805046726766f*yz*(10.0f*x2*y2 - 5.0f*x4 - y4) ; // 39*sqrt(385)*yz*(10*x2*y2 - 5*x4 - y4)/(32*sqrt(pi))
338
+ dz[52] = 12.453973889286248f*xy*(x2 - y2)*(13.0f*z2 - 1.0f) ; // 9*sqrt(385)*xy*(x2 - y2)*(13*z2 - 1)/(8*sqrt(pi))
339
+ dz[53] = -6.8841930899409371f*yz*(3.0f*x2 - y2)*(13.0f*z2 - 3.0f) ; // -33*sqrt(35)*yz*(3*x2 - y2)*(13*z2 - 3)/(16*sqrt(pi))
340
+ dz[54] = 2.2126634622249131f*xy*(-66.0f*z2 + 143.0f*z4 + 3.0f) ; // 15*sqrt(70)*xy*(-66*z2 + 143*z4 + 3)/(32*sqrt(pi))
341
+ dz[55] = 1.6259689364853116f*yz*(110.0f*z2 - 143.0f*z4 - 15.0f) ; // 9*sqrt(105)*yz*(110*z2 - 143*z4 - 15)/(32*sqrt(pi))
342
+ dz[56] = 64.528641681844675f*z2 - 236.60501950009714f*z4 + 205.05768356675085f*z6 - 2.3899496919201733f ; // 7*sqrt(15)*(135*z2 - 495*z4 + 429*z6 - 5)/(32*sqrt(pi))
343
+ dz[57] = 1.6259689364853116f*xz*(110.0f*z2 - 143.0f*z4 - 15.0f) ; // 9*sqrt(105)*xz*(110*z2 - 143*z4 - 15)/(32*sqrt(pi))
344
+ dz[58] = 0.07375544874083044f*(x2 - y2)*(143.0f*z2*(3.0f*z2 - 1.0f) + 132.0f*z2*(13.0f*z2 - 5.0f) - 187.0f*z2 + 45.0f) ; // sqrt(70)*(x2 - y2)*(143*z2*(3*z2 - 1) + 132*z2*(13*z2 - 5) - 187*z2 + 45)/(64*sqrt(pi))
345
+ dz[59] = -6.8841930899409371f*xz*(x2 - 3.0f*y2)*(13.0f*z2 - 3.0f) ; // -33*sqrt(35)*xz*(x2 - 3*y2)*(13*z2 - 3)/(16*sqrt(pi))
346
+ dz[60] = 3.1134934723215619f*(13.0f*z2 - 1.0f)*(-6.0f*x2*y2 + x4 + y4) ; // 9*sqrt(385)*(13*z2 - 1)*(-6*x2*y2 + x4 + y4)/(32*sqrt(pi))
347
+ dz[61] = 13.491805046726766f*xz*(10.0f*x2*y2 - x4 - 5.0f*y4) ; // 39*sqrt(385)*xz*(10*x2*y2 - x4 - 5*y4)/(32*sqrt(pi))
348
+ dz[62] = 39.6894099270285f*x2*y4 - 39.6894099270285f*x4*y2 + 2.6459606618019f*x6 - 2.6459606618019f*y6 ; // 3*sqrt(10010)*(15*x2*y4 - 15*x4*y2 + x6 - y6)/(64*sqrt(pi))
349
+ dz[63] = 0.0f ; // 0
350
+ };
351
+ write_sh_dx();
352
+ write_sh_dy();
353
+ write_sh_dz();
354
+ }
355
+ }
356
+
357
+
358
+ template <typename scalar_t>
359
+ __global__ void kernel_sh_backward(
360
+ const scalar_t * __restrict__ grad,
361
+ const scalar_t * __restrict__ inputs,
362
+ uint32_t B, uint32_t D, uint32_t C,
363
+ const scalar_t * __restrict__ dy_dx,
364
+ scalar_t * grad_inputs
365
+ ) {
366
+ const uint32_t t = threadIdx.x + blockIdx.x * blockDim.x;
367
+ const uint32_t b = t / D;
368
+ if (b >= B) return;
369
+
370
+ const uint32_t d = t - b * D;
371
+ const uint32_t C2 = C * C;
372
+
373
+ // locate
374
+ grad += b * C2;
375
+ dy_dx += b * D * C2 + d * C2;
376
+
377
+ for (uint32_t ch = 0; ch < C2; ch++) { // C2 is unsigned; avoid a signed/unsigned comparison
378
+ grad_inputs[t] += grad[ch] * dy_dx[ch];
379
+ //printf("t=%d, b=%d, d=%d, ch=%d, grad=%f (+= %f * %f)\n", t, b, d, ch, grad_inputs[t], grad[ch], dy_dx[ch]);
380
+ }
381
+
382
+ }
383
+
384
+ // inputs: [B, D], float, in [0, 1]
385
+ // outputs: [B, L * C], float
386
+ template <typename scalar_t>
387
+ void sh_encode_forward_cuda(const scalar_t *inputs, scalar_t *outputs, const uint32_t B, const uint32_t D, const uint32_t C, scalar_t *dy_dx) {
388
+ static constexpr uint32_t N_THREADS = 256;
389
+ kernel_sh<scalar_t><<<div_round_up(B, N_THREADS), N_THREADS>>>(inputs, outputs, B, D, C, dy_dx);
390
+ }
391
+
392
+
393
+ template <typename scalar_t>
394
+ void sh_encode_backward_cuda(const scalar_t *grad, const scalar_t *inputs, const uint32_t B, const uint32_t D, const uint32_t C, scalar_t *dy_dx, scalar_t *grad_inputs) {
395
+ static constexpr uint32_t N_THREADS = 256;
396
+ kernel_sh_backward<scalar_t><<<div_round_up(B * D, N_THREADS), N_THREADS>>>(grad, inputs, B, D, C, dy_dx, grad_inputs);
397
+ }
398
+
399
+
400
+ void sh_encode_forward(at::Tensor inputs, at::Tensor outputs, const uint32_t B, const uint32_t D, const uint32_t C, at::optional<at::Tensor> dy_dx) {
401
+ CHECK_CUDA(inputs);
402
+ CHECK_CUDA(outputs);
403
+ // CHECK_CUDA(dy_dx);
404
+
405
+ CHECK_CONTIGUOUS(inputs);
406
+ CHECK_CONTIGUOUS(outputs);
407
+ // CHECK_CONTIGUOUS(dy_dx);
408
+
409
+ CHECK_IS_FLOATING(inputs);
410
+ CHECK_IS_FLOATING(outputs);
411
+ // CHECK_IS_FLOATING(dy_dx);
412
+
413
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
414
+ inputs.scalar_type(), "sh_encode_forward_cuda", ([&] {
415
+ sh_encode_forward_cuda<scalar_t>(inputs.data_ptr<scalar_t>(), outputs.data_ptr<scalar_t>(), B, D, C, dy_dx.has_value() ? dy_dx.value().data_ptr<scalar_t>() : nullptr);
416
+ }));
417
+ }
418
+
419
+ void sh_encode_backward(at::Tensor grad, at::Tensor inputs, const uint32_t B, const uint32_t D, const uint32_t C, at::Tensor dy_dx, at::Tensor grad_inputs) {
420
+ CHECK_CUDA(grad);
421
+ CHECK_CUDA(inputs);
422
+ CHECK_CUDA(dy_dx);
423
+ CHECK_CUDA(grad_inputs);
424
+
425
+ CHECK_CONTIGUOUS(grad);
426
+ CHECK_CONTIGUOUS(inputs);
427
+ CHECK_CONTIGUOUS(dy_dx);
428
+ CHECK_CONTIGUOUS(grad_inputs);
429
+
430
+ CHECK_IS_FLOATING(grad);
431
+ CHECK_IS_FLOATING(inputs);
432
+ CHECK_IS_FLOATING(dy_dx);
433
+ CHECK_IS_FLOATING(grad_inputs);
434
+
435
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
436
+ grad.scalar_type(), "sh_encode_backward_cuda", ([&] {
437
+ sh_encode_backward_cuda<scalar_t>(grad.data_ptr<scalar_t>(), inputs.data_ptr<scalar_t>(), B, D, C, dy_dx.data_ptr<scalar_t>(), grad_inputs.data_ptr<scalar_t>());
438
+ }));
439
+ }
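
As a quick sanity check on the hard-coded tables in kernel_sh, the first two SH bands (degree 2, four output channels) can be reproduced in plain PyTorch and compared against the CUDA path. This is an editor's verification sketch, not part of the repo; it reuses the constants from outputs[0..3] above and again assumes SHEncoder is importable:

    import torch
    import torch.nn.functional as F
    from shencoder import SHEncoder

    def sh_degree2_reference(d):
        # Same constants as outputs[0..3] in kernel_sh above.
        x, y, z = d.unbind(-1)
        return torch.stack([
            torch.full_like(x, 0.28209479177387814),  # 1/(2*sqrt(pi))
            -0.48860251190291987 * y,                 # l=1, m=-1
             0.48860251190291987 * z,                 # l=1, m=0
            -0.48860251190291987 * x,                 # l=1, m=1
        ], dim=-1)

    dirs = F.normalize(torch.randn(256, 3, device='cuda'), dim=-1)
    enc = SHEncoder(input_dim=3, degree=2)
    print(torch.allclose(enc(dirs), sh_degree2_reference(dirs), atol=1e-5))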