JianyuanWang committed
Commit febf487 · 1 Parent(s): 68f369a
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/** filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,174 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Ruff stuff:
171
+ .ruff_cache/
172
+
173
+ # PyPI configuration file
174
+ .pypirc
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Vggt
+ title: vggt
3
- emoji: 👁
+ emoji: 🏆
4
- colorFrom: green
+ colorFrom: indigo
5
- colorTo: purple
+ colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 5.18.0
+ sdk_version: 5.17.1
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-nc-4.0
app.py ADDED
@@ -0,0 +1,272 @@
1
+ import os
2
+ import cv2
3
+ import torch
4
+ import numpy as np
5
+ import gradio as gr
6
+ import spaces
7
+ import sys
8
+ import os
9
+ import socket
10
+ import webbrowser
11
+ sys.path.append('vggt/')
12
+ import shutil
13
+ from datetime import datetime
14
+ from demo_hf import demo_fn
15
+ from omegaconf import DictConfig, OmegaConf
16
+ import glob
17
+ import gc
18
+ import time
19
+ from viser_fn import viser_wrapper
20
+
21
+
22
+ def get_free_port():
23
+ """Get a free port using socket."""
24
+ # return 80
25
+ # return 8080
26
+ # return 10088 # for debugging
27
+ # return 7860
28
+ # return 7888
29
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
30
+ s.bind(('', 0))
31
+ port = s.getsockname()[1]
32
+ return port
33
+
34
+
35
+
36
+
37
+ @spaces.GPU(duration=240)
38
+ def vggt_demo(
39
+ input_video,
40
+ input_image,
41
+ ):
42
+ start_time = time.time()
43
+ gc.collect()
44
+ torch.cuda.empty_cache()
45
+
46
+
47
+ debug = False
48
+
49
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
50
+ target_dir = f"input_images_{timestamp}"
51
+ if os.path.exists(target_dir):
52
+ shutil.rmtree(target_dir)
53
+
54
+ os.makedirs(target_dir)
55
+ target_dir_images = target_dir + "/images"
56
+ os.makedirs(target_dir_images)
57
+
58
+
59
+ if input_video is not None:
60
+ if not isinstance(input_video, str):
61
+ input_video = input_video["video"]["path"]
62
+
63
+ cfg_file = "config/base.yaml"
64
+ cfg = OmegaConf.load(cfg_file)
65
+
66
+ if input_image is not None:
67
+ input_image = sorted(input_image)
68
+ # recon_num = len(input_image)
69
+
70
+ # Copy files to the new directory
71
+ for file_name in input_image:
72
+ shutil.copy(file_name, target_dir_images)
73
+ elif input_video is not None:
74
+ vs = cv2.VideoCapture(input_video)
75
+
76
+ fps = vs.get(cv2.CAP_PROP_FPS)
77
+
78
+ frame_rate = 1
79
+ frame_interval = int(fps * frame_rate)
80
+
81
+ video_frame_num = 0
82
+ count = 0
83
+
84
+ while True:
85
+ (gotit, frame) = vs.read()
86
+ count +=1
87
+
88
+ if not gotit:
89
+ break
90
+
91
+ if count % frame_interval == 0:
92
+ cv2.imwrite(target_dir_images+"/"+f"{video_frame_num:06}.png", frame)
93
+ video_frame_num+=1
94
+
95
+ # recon_num = video_frame_num
96
+ # if recon_num<3:
97
+ # return None, "Please input at least three frames"
98
+ else:
99
+ return None, "Uploading not finished or Incorrect input format"
100
+
101
+
102
+ print(f"Files have been copied to {target_dir_images}")
103
+ cfg.SCENE_DIR = target_dir
104
+
105
+ predictions = demo_fn(cfg)
106
+
107
+ # Get a free port for viser
108
+ viser_port = get_free_port()
109
+
110
+ # Start viser visualization in a separate thread/process
111
+ viser_wrapper(predictions, port=viser_port)
112
+
113
+ del predictions
114
+ gc.collect()
115
+ torch.cuda.empty_cache()
116
+
117
+ print(input_image)
118
+ print(input_video)
119
+ end_time = time.time()
120
+ execution_time = end_time - start_time
121
+ print(f"Execution time: {execution_time} seconds")
122
+
123
+ # Return None for the 3D model (since we're using viser) and the viser URL
124
+ # viser_url = f"Viser visualization is ready at: http://localhost:{viser_port}"
125
+ # print(viser_url) # Debug print
126
+ return None, viser_port
127
+
128
+
129
+
130
+
131
+ statue_video = "examples/videos/statue_video.mp4"
132
+
133
+ apple_video = "examples/videos/apple_video.mp4"
134
+ british_museum_video = "examples/videos/british_museum_video.mp4"
135
+ cake_video = "examples/videos/cake_video.mp4"
136
+ bonsai_video = "examples/videos/bonsai_video.mp4"
137
+ face_video = "examples/videos/in2n_face_video.mp4"
138
+ counter_video = "examples/videos/in2n_counter_video.mp4"
139
+
140
+ horns_video = "examples/videos/llff_horns_video.mp4"
141
+ person_video = "examples/videos/in2n_person_video.mp4"
142
+
143
+ flower_video = "examples/videos/llff_flower_video.mp4"
144
+
145
+ fern_video = "examples/videos/llff_fern_video.mp4"
146
+
147
+ drums_video = "examples/videos/drums_video.mp4"
148
+
149
+ kitchen_video = "examples/videos/kitchen_video.mp4"
150
+
151
+ ###########################################################################################
152
+ apple_images = glob.glob(f'examples/apple/images/*')
153
+ bonsai_images = glob.glob(f'examples/bonsai/images/*')
154
+ cake_images = glob.glob(f'examples/cake/images/*')
155
+ british_museum_images = glob.glob(f'examples/british_museum/images/*')
156
+ face_images = glob.glob(f'examples/in2n_face/images/*')
157
+ counter_images = glob.glob(f'examples/in2n_counter/images/*')
158
+
159
+ horns_images = glob.glob(f'examples/llff_horns/images/*')
160
+
161
+ person_images = glob.glob(f'examples/in2n_person/images/*')
162
+ flower_images = glob.glob(f'examples/llff_flower/images/*')
163
+
164
+ fern_images = glob.glob(f'examples/llff_fern/images/*')
165
+ statue_images = glob.glob(f'examples/statue/images/*')
166
+
167
+ drums_images = glob.glob(f'examples/drums/images/*')
168
+ kitchen_images = glob.glob(f'examples/kitchen/images/*')
169
+
170
+
171
+
172
+ ###########################################################################################
173
+
174
+
175
+ with gr.Blocks() as demo:
176
+
177
+ gr.Markdown("""
178
+ # 🏛️ VGGT: Visual Geometry Grounded Transformer
179
+
180
+ <div style="font-size: 16px; line-height: 1.2;">
181
+ Alpha version (testing).
182
+ </div>
183
+ """)
184
+
185
+ with gr.Row():
186
+ with gr.Column(scale=1):
187
+ input_video = gr.Video(label="Upload Video", interactive=True)
188
+ input_images = gr.File(file_count="multiple", label="Upload Images", interactive=True)
189
+
190
+
191
+ with gr.Column(scale=3):
192
+ viser_output = gr.HTML(
193
+ label="Viser Visualization",
194
+ value='''<div style="height: 520px; border: 1px solid #e0e0e0;
195
+ border-radius: 4px; padding: 16px;
196
+ display: flex; align-items: center;
197
+ justify-content: center">
198
+ 3D Reconstruction (Point Cloud and Camera Poses; Zoom in to see details)
199
+ </div>'''
200
+ )
201
+
202
+ log_output = gr.Textbox(label="Log")
203
+
204
+ with gr.Row():
205
+ submit_btn = gr.Button("Reconstruct", scale=1)
206
+ clear_btn = gr.ClearButton([input_video, input_images, viser_output, log_output], scale=1) #Modified viser_output
207
+
208
+
209
+
210
+
211
+ examples = [
212
+ [flower_video, flower_images],
213
+ [kitchen_video, kitchen_images],
214
+ # [person_video, person_images],
215
+ # [statue_video, statue_images],
216
+ # [drums_video, drums_images],
217
+ [counter_video, counter_images],
218
+ [fern_video, fern_images],
219
+ [horns_video, horns_images],
220
+ # [apple_video, apple_images],
221
+ # [bonsai_video, bonsai_images],
222
+ ]
223
+
224
+ def process_example(video, images):
225
+ """Wrapper function to ensure outputs are properly captured"""
226
+ model_output, log = vggt_demo(video, images)
227
+
228
+ # viser_wrapper(predictions, port=log)
229
+ # Get the hostname - use the actual hostname or IP where the server is running
230
+ # hostname = socket.gethostname()
231
+
232
+ # Extract port from log
233
+ port = log
234
+
235
+ # Create the viser URL using the hostname
236
+ # viser_url = f"http://{hostname}:{port}"
237
+
238
+ viser_url = f"http://localhost:{log}"
239
+ print(f"Viser URL: {viser_url}")
240
+
241
+ # Create the iframe HTML code. Set width and height appropriately.
242
+ iframe_code = f'<iframe src="{viser_url}" width="100%" height="520px"></iframe>'
243
+
244
+
245
+ # Return the iframe code to update the gr.HTML component
246
+ return iframe_code, f"Visualization running at {viser_url}"
247
+
248
+
249
+ # TODO: move the selection of port outside of the demo function
250
+ # so that we can cache examples
251
+
252
+ gr.Examples(examples=examples,
253
+ inputs=[input_video, input_images],
254
+ outputs=[viser_output, log_output], # Output to viser_output
255
+ fn=process_example, # Use our wrapper function
256
+ cache_examples=False,
257
+ examples_per_page=50,
258
+ )
259
+
260
+ submit_btn.click(
261
+ process_example, # Use the same wrapper function
262
+ [input_video, input_images],
263
+ [viser_output, log_output], # Output to viser_output
264
+ # concurrency_limit=1
265
+ )
266
+
267
+ # demo.launch(debug=True, share=True)
268
+ # demo.launch(server_name="0.0.0.0", server_port=8082, debug=True, share=False)
269
+ # demo.queue(max_size=20).launch(show_error=True, share=True)
270
+ demo.queue(max_size=20).launch(show_error=True) #, share=True, server_port=7888, server_name="0.0.0.0")
271
+ # demo.queue(max_size=20, concurrency_count=1).launch(debug=True, share=True)
272
+ ########################################################################################################################
clean_app.py ADDED
@@ -0,0 +1,229 @@
1
+ import os
2
+ import cv2
3
+ import torch
4
+ import numpy as np
5
+ import gradio as gr
6
+ import sys
7
+ import os
8
+ import socket
9
+ import webbrowser
10
+ sys.path.append('vggt/')
11
+ import shutil
12
+ from datetime import datetime
13
+ from demo_hf import demo_fn
14
+ from omegaconf import DictConfig, OmegaConf
15
+ import glob
16
+ import gc
17
+ import time
18
+ from viser_fn import viser_wrapper
19
+
20
+
21
+ def get_free_port():
22
+ """Get a free port using socket."""
23
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
24
+ s.bind(('', 0))
25
+ port = s.getsockname()[1]
26
+ return port
27
+
28
+ def vggt_demo(
29
+ input_video,
30
+ input_image,
31
+ ):
32
+ start_time = time.time()
33
+ gc.collect()
34
+ torch.cuda.empty_cache()
35
+
36
+
37
+ debug = False
38
+
39
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
40
+ target_dir = f"input_images_{timestamp}"
41
+ if os.path.exists(target_dir):
42
+ shutil.rmtree(target_dir)
43
+
44
+ os.makedirs(target_dir)
45
+ target_dir_images = target_dir + "/images"
46
+ os.makedirs(target_dir_images)
47
+
48
+
49
+ if input_video is not None:
50
+ if not isinstance(input_video, str):
51
+ input_video = input_video["video"]["path"]
52
+
53
+ cfg_file = "config/base.yaml"
54
+ cfg = OmegaConf.load(cfg_file)
55
+
56
+ if input_image is not None:
57
+ input_image = sorted(input_image)
58
+ # recon_num = len(input_image)
59
+
60
+ # Copy files to the new directory
61
+ for file_name in input_image:
62
+ shutil.copy(file_name, target_dir_images)
63
+ elif input_video is not None:
64
+ vs = cv2.VideoCapture(input_video)
65
+
66
+ fps = vs.get(cv2.CAP_PROP_FPS)
67
+
68
+ frame_rate = 1
69
+ frame_interval = int(fps * frame_rate)
70
+
71
+ video_frame_num = 0
72
+ count = 0
73
+
74
+ while True:
75
+ (gotit, frame) = vs.read()
76
+ count +=1
77
+
78
+ if not gotit:
79
+ break
80
+
81
+ if count % frame_interval == 0:
82
+ cv2.imwrite(target_dir_images+"/"+f"{video_frame_num:06}.png", frame)
83
+ video_frame_num+=1
84
+ else:
85
+ return None, "Uploading not finished or Incorrect input format"
86
+
87
+
88
+ print(f"Files have been copied to {target_dir_images}")
89
+ cfg.SCENE_DIR = target_dir
90
+
91
+ predictions = demo_fn(cfg)
92
+
93
+ # Get a free port for viser
94
+ viser_port = get_free_port()
95
+
96
+ # Start viser visualization in a separate thread/process
97
+ viser_wrapper(predictions, port=viser_port)
98
+
99
+ del predictions
100
+ gc.collect()
101
+ torch.cuda.empty_cache()
102
+
103
+ print(input_image)
104
+ print(input_video)
105
+ end_time = time.time()
106
+ execution_time = end_time - start_time
107
+ print(f"Execution time: {execution_time} seconds")
108
+ return None, viser_port
109
+
110
+
111
+
112
+
113
+ statue_video = "examples/videos/statue_video.mp4"
114
+
115
+ apple_video = "examples/videos/apple_video.mp4"
116
+ british_museum_video = "examples/videos/british_museum_video.mp4"
117
+ cake_video = "examples/videos/cake_video.mp4"
118
+ bonsai_video = "examples/videos/bonsai_video.mp4"
119
+ face_video = "examples/videos/in2n_face_video.mp4"
120
+ counter_video = "examples/videos/in2n_counter_video.mp4"
121
+
122
+ horns_video = "examples/videos/llff_horns_video.mp4"
123
+ person_video = "examples/videos/in2n_person_video.mp4"
124
+
125
+ flower_video = "examples/videos/llff_flower_video.mp4"
126
+
127
+ fern_video = "examples/videos/llff_fern_video.mp4"
128
+
129
+ drums_video = "examples/videos/drums_video.mp4"
130
+
131
+ kitchen_video = "examples/videos/kitchen_video.mp4"
132
+
133
+ ###########################################################################################
134
+ apple_images = glob.glob(f'examples/apple/images/*')
135
+ bonsai_images = glob.glob(f'examples/bonsai/images/*')
136
+ cake_images = glob.glob(f'examples/cake/images/*')
137
+ british_museum_images = glob.glob(f'examples/british_museum/images/*')
138
+ face_images = glob.glob(f'examples/in2n_face/images/*')
139
+ counter_images = glob.glob(f'examples/in2n_counter/images/*')
140
+
141
+ horns_images = glob.glob(f'examples/llff_horns/images/*')
142
+
143
+ person_images = glob.glob(f'examples/in2n_person/images/*')
144
+ flower_images = glob.glob(f'examples/llff_flower/images/*')
145
+
146
+ fern_images = glob.glob(f'examples/llff_fern/images/*')
147
+ statue_images = glob.glob(f'examples/statue/images/*')
148
+
149
+ drums_images = glob.glob(f'examples/drums/images/*')
150
+ kitchen_images = glob.glob(f'examples/kitchen/images/*')
151
+
152
+
153
+
154
+ ###########################################################################################
155
+
156
+
157
+ with gr.Blocks() as demo:
158
+
159
+ gr.Markdown("""
160
+ # 🏛️ VGGT: Visual Geometry Grounded Transformer
161
+
162
+ <div style="font-size: 16px; line-height: 1.2;">
163
+ Alpha version (testing).
164
+ </div>
165
+ """)
166
+
167
+ with gr.Row():
168
+ with gr.Column(scale=1):
169
+ input_video = gr.Video(label="Upload Video", interactive=True)
170
+ input_images = gr.File(file_count="multiple", label="Upload Images", interactive=True)
171
+
172
+
173
+ with gr.Column(scale=3):
174
+ viser_output = gr.HTML(
175
+ label="Viser Visualization",
176
+ value='''<div style="height: 520px; border: 1px solid #e0e0e0;
177
+ border-radius: 4px; padding: 16px;
178
+ display: flex; align-items: center;
179
+ justify-content: center">
180
+ 3D Reconstruction (Point Cloud and Camera Poses; Zoom in to see details)
181
+ </div>'''
182
+ )
183
+
184
+ log_output = gr.Textbox(label="Log")
185
+
186
+ with gr.Row():
187
+ submit_btn = gr.Button("Reconstruct", scale=1)
188
+ clear_btn = gr.ClearButton([input_video, input_images, viser_output, log_output], scale=1) #Modified viser_output
189
+
190
+
191
+
192
+
193
+ examples = [
194
+ [flower_video, flower_images],
195
+ [kitchen_video, kitchen_images],
196
+ [counter_video, counter_images],
197
+ [fern_video, fern_images],
198
+ [horns_video, horns_images],
199
+ ]
200
+
201
+ def process_example(video, images):
202
+ """Wrapper function to ensure outputs are properly captured"""
203
+ model_output, log = vggt_demo(video, images)
204
+
205
+ viser_url = f"http://localhost:{log}"
206
+ print(f"Viser URL: {viser_url}")
207
+
208
+ # Create the iframe HTML code. Set width and height appropriately.
209
+ iframe_code = f'<iframe src="{viser_url}" width="100%" height="520px"></iframe>'
210
+
211
+ return iframe_code, f"Visualization running at {viser_url}"
212
+
213
+ gr.Examples(examples=examples,
214
+ inputs=[input_video, input_images],
215
+ outputs=[viser_output, log_output], # Output to viser_output
216
+ fn=process_example, # Use our wrapper function
217
+ cache_examples=False,
218
+ examples_per_page=50,
219
+ )
220
+
221
+
222
+
223
+ submit_btn.click(
224
+ process_example, # Use the same wrapper function
225
+ [input_video, input_images],
226
+ [viser_output, log_output], # Output to viser_output
227
+ concurrency_limit=1
228
+ )
229
+ demo.queue(max_size=20).launch(show_error=True, share=True, server_port=7888, server_name="0.0.0.0")
config/base.yaml ADDED
@@ -0,0 +1,96 @@
1
+ SCENE_DIR: examples/apple/
2
+ # examples/llff_horns_single/
3
+ # apple
4
+ # cake
5
+
6
+ _target_: vggt.models.vggt.VGGT #off3d.models.vggt.vggt.VGGT
7
+
8
+ num_register_tokens: 4 # 0 for no register tokens
9
+ ffn_layer: "mlp"
10
+ qk_norm: False # NOTE: is this correct?
11
+ patch_size: 14
12
+ init_values: 0.01
13
+
14
+ AGGREGATOR:
15
+ _target_: vggt.models.aggregator.Aggregator
16
+ patch_embed_by_conv: False
17
+ image_size: 518
18
+ use_checkpoint: True
19
+ use_reentrant: False
20
+ decoder_load_dino: False
21
+ backbone_qk_norm: False
22
+ aa_block_kwargs:
23
+ dim: 1024
24
+ num_heads: 16
25
+ mlp_ratio: 4
26
+ qkv_bias: True
27
+ proj_bias: True
28
+ ffn_bias: True
29
+ drop: 0.0
30
+ attn_drop: 0.0
31
+ init_values: 0.01
32
+ drop_path: 0.0
33
+ fused_attn: True
34
+ qk_norm: True
35
+ rope_freq: 100
36
+
37
+
38
+ CameraHead:
39
+ _target_: vggt.heads.camera_head.CameraHead #off3d.models.vggt.camera_head.CameraHead
40
+ pose_encoding_type: "absT_quaR_FoV"
41
+ new_trunk: True
42
+ trunk_depth: 4
43
+ # proj_dim: 768
44
+ qk_norm: True
45
+ init_values: 0.01
46
+ act_dict:
47
+ trans_act: "linear"
48
+ quat_act: "linear"
49
+ fl_act: "linear"
50
+ loss_kwargs:
51
+ loss_type: "l1"
52
+ gamma: 0.6
53
+
54
+
55
+ PointHead:
56
+ _target_: vggt.heads.dpt_head.DPTHead #off3d.models.vggt.dpt_head.DPTHead
57
+ # _target_: off3d.models.vggt.linear_head.LinearHead
58
+ dim_in: 2048
59
+ shallow_conv: False
60
+ normalize_act: "inv_log"
61
+ pos_embed: True
62
+ loss_kwargs:
63
+ gradient_loss: "normal"
64
+ # gradient_loss: "grad"
65
+ normalize_pred: False
66
+ valid_range: 0.98
67
+ gamma: 1.0
68
+ camera_centric_reg: -1.0
69
+ all_mean: True
70
+
71
+ DepthHead: null
72
+ # _target_: vggt.heads.dpt_head.DPTHead #off3d.models.vggt.dpt_head.DPTHead
73
+ # # _target_: off3d.models.vggt.linear_head.LinearHead
74
+ # dim_in: 2048
75
+ # patch_size: ${patch_size}
76
+ # output_dim: 2
77
+ # normalize_act: "exp" # or just relu?
78
+ # normalize_act_conf: "expp1"
79
+ # pos_embed: True
80
+ # loss_kwargs:
81
+ # loss_type: "conf"
82
+ # predict_disparity: False # or True
83
+ # gradient_loss: "grad"
84
+ # valid_range: 0.98
85
+ # gamma: 1.0
86
+ # all_mean: True
87
+
88
+ MatchHead: null
89
+ TrackHead: null
90
+
91
+
92
+
93
+ hydra:
94
+ output_subdir: NULL
95
+ run:
96
+ dir: .
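The `_target_` entries above make this a Hydra-instantiable description of the VGGT model. A minimal sketch of how such a config is typically consumed, mirroring `demo_fn` in `demo_hf.py` below (the path and SCENE_DIR override are illustrative):

```python
# Minimal usage sketch; assumes the vggt package is importable and this file
# lives at config/base.yaml. Mirrors demo_fn in demo_hf.py below.
from omegaconf import OmegaConf
from hydra.utils import instantiate

cfg = OmegaConf.load("config/base.yaml")
cfg.SCENE_DIR = "examples/apple/"            # illustrative override of the scene directory
model = instantiate(cfg, _recursive_=False)  # builds vggt.models.vggt.VGGT from _target_
```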
demo_hf.py ADDED
@@ -0,0 +1,149 @@
1
+ import hydra
2
+ import torch
3
+ import os
4
+ from hydra.utils import instantiate
5
+ from omegaconf import DictConfig
6
+ from PIL import Image
7
+ from torchvision import transforms as TF
8
+ import glob
9
+ from vggt.utils.pose_enc import pose_encoding_to_extri_intri
10
+ from viser_fn import viser_wrapper
11
+
12
+
13
+ # @hydra.main(config_path="config", config_name="base")
14
+ def demo_fn(cfg: DictConfig) -> None:
15
+ print(cfg)
16
+ model = instantiate(cfg, _recursive_=False)
17
+
18
+ if not torch.cuda.is_available():
19
+ raise ValueError("CUDA is not available. Check your environment.")
20
+
21
+ device = "cuda"
22
+ model = model.to(device)
23
+
24
+ _VGGT_URL = "https://huggingface.co/facebook/vggt_alpha/resolve/main/vggt_alpha_v0.pt"
25
+
26
+ # Reload model
27
+ pretrain_model = torch.hub.load_state_dict_from_url(_VGGT_URL)
28
+
29
+ if "model" in pretrain_model:
30
+ model_dict = pretrain_model["model"]
31
+ model.load_state_dict(model_dict, strict=False)
32
+ else:
33
+ model.load_state_dict(pretrain_model, strict=True)
34
+
35
+
36
+ # batch = torch.load("/fsx-repligen/jianyuan/cvpr2025_ckpts/batch.pth")
37
+ # y_hat_raw = torch.load("/fsx-repligen/jianyuan/cvpr2025_ckpts/y_hat.pth")
38
+
39
+
40
+ image_list = glob.glob(os.path.join(cfg.SCENE_DIR, "images", "*"))
41
+ image_list = sorted(image_list)
42
+ images = load_and_preprocess_images(image_list)
43
+ images = images[None].to(device)
44
+
45
+
46
+ batch = {"images": images}
47
+
48
+ with torch.no_grad():
49
+ with torch.cuda.amp.autocast(dtype=torch.float16):
50
+ y_hat = model(batch)
51
+
52
+
53
+ last_pred_pose_enc = y_hat["pred_extrinsic_list"][-1]
54
+ pose_encoding_type = cfg.CameraHead.pose_encoding_type
55
+
56
+ last_pred_extrinsic, _ = pose_encoding_to_extri_intri(last_pred_pose_enc.detach(), None, pose_encoding_type=pose_encoding_type, build_intrinsics=False)
57
+
58
+ y_hat["last_pred_extrinsic"] = last_pred_extrinsic
59
+
60
+
61
+ for key in y_hat.keys():
62
+ if isinstance(y_hat[key], torch.Tensor):
63
+ y_hat[key] = y_hat[key].cpu().numpy()
64
+
65
+ return y_hat
66
+
67
+
68
+
69
+ def load_and_preprocess_images(image_path_list):
70
+ # Check for empty list
71
+ if len(image_path_list) == 0:
72
+ raise ValueError("At least 1 image is required")
73
+
74
+ # 1. load images as RGB
75
+ # 2. resize images to (518, X, 3), where X is the resized width and X should be divisible by 14
76
+ # 3. normalize images to (0, 1)
77
+ # 4. concatenate images to (N, 3, 518, X), where N is the number of images
78
+ images = []
79
+ shapes = set()
80
+ to_tensor = TF.ToTensor()
81
+
82
+ # First process all images and collect their shapes
83
+ for image_path in image_path_list:
84
+ img = Image.open(image_path).convert("RGB")
85
+ width, height = img.size
86
+ new_width = 518
87
+
88
+ # Calculate height maintaining aspect ratio, divisible by 14
89
+ new_height = round(height * (new_width / width) / 14) * 14
90
+
91
+ # Resize with new dimensions (width, height)
92
+
93
+ img = img.resize((new_width, new_height), Image.Resampling.BICUBIC)
94
+ img = to_tensor(img) # Convert to tensor (0, 1)
95
+
96
+ # Center crop height if it's larger than 518
97
+
98
+ if new_height > 518:
99
+ start_y = (new_height - 518) // 2
100
+ img = img[:, start_y:start_y + 518, :]
101
+
102
+ shapes.add((img.shape[1], img.shape[2]))
103
+ images.append(img)
104
+
105
+ # Check if we have different shapes
106
+ if len(shapes) > 1:
107
+ print(f"Warning: Found images with different shapes: {shapes}")
108
+ # Find maximum dimensions
109
+ max_height = max(shape[0] for shape in shapes)
110
+ max_width = max(shape[1] for shape in shapes)
111
+
112
+ # Pad images if necessary
113
+ padded_images = []
114
+ for img in images:
115
+ h_padding = max_height - img.shape[1]
116
+ w_padding = max_width - img.shape[2]
117
+
118
+ if h_padding > 0 or w_padding > 0:
119
+ pad_top = h_padding // 2
120
+ pad_bottom = h_padding - pad_top
121
+ pad_left = w_padding // 2
122
+ pad_right = w_padding - pad_left
123
+
124
+ img = torch.nn.functional.pad(
125
+ img,
126
+ (pad_left, pad_right, pad_top, pad_bottom),
127
+ mode='constant',
128
+ value=1.0
129
+ )
130
+ padded_images.append(img)
131
+ images = padded_images
132
+
133
+
134
+ images = torch.stack(images) # concatenate images
135
+
136
+ # Ensure correct shape when single image
137
+ if len(image_path_list) == 1:
138
+ # Verify shape is (1, C, H, W)
139
+ if images.dim() == 3:
140
+ images = images.unsqueeze(0)
141
+
142
+ return images
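A quick worked example of the resize rule above (the input size is illustrative): a 640x480 image is scaled to width 518, giving new_height = round(480 * (518/640) / 14) * 14 = round(27.75) * 14 = 28 * 14 = 392, so the resulting tensor is (3, 392, 518) and no center crop is needed since 392 <= 518.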
143
+
144
+
145
+ # if __name__ == "__main__":
146
+ # y_hat = demo_fn()
147
+ # # viser_wrapper(y_hat, port=8080)
148
+
149
+
gradio_util.py ADDED
@@ -0,0 +1,297 @@
1
+ try:
2
+ import os
3
+
4
+ import trimesh
5
+ import open3d as o3d
6
+
7
+ import gradio as gr
8
+ import numpy as np
9
+ import matplotlib
10
+ from scipy.spatial.transform import Rotation
11
+
12
+ print("Successfully imported the packages for Gradio visualization")
13
+ except ImportError:
14
+ print(
15
+ "Failed to import packages for Gradio visualization. Please disable Gradio visualization."
16
+ )
17
+
18
+
19
+ def visualize_by_gradio(glbfile):
20
+ """
21
+ Set up and launch a Gradio interface to visualize a GLB file.
22
+
23
+ Args:
24
+ glbfile (str): Path to the GLB file to be visualized.
25
+ """
26
+
27
+ def load_glb_file(glb_path):
28
+ # Check if the file exists and return the path or error message
29
+ if os.path.exists(glb_path):
30
+ return glb_path, "3D Model Loaded Successfully"
31
+ else:
32
+ return None, "File not found"
33
+
34
+ # Load the GLB file initially to check if it's valid
35
+ initial_model, log_message = load_glb_file(glbfile)
36
+
37
+ # Create the Gradio interface
38
+ with gr.Blocks() as demo:
39
+ gr.Markdown("# GLB File Viewer")
40
+
41
+ # 3D Model viewer component
42
+ model_viewer = gr.Model3D(
43
+ label="3D Model Viewer", height=600, value=initial_model
44
+ )
45
+
46
+ # Textbox for log output
47
+ log_output = gr.Textbox(label="Log", lines=2, value=log_message)
48
+
49
+ # Launch the Gradio interface
50
+ demo.launch(share=True)
51
+
52
+
53
+ def vggsfm_predictions_to_glb(predictions) -> trimesh.Scene:
54
+ """
55
+ Converts VGG SFM predictions to a 3D scene represented as a GLB.
56
+
57
+ Args:
58
+ predictions (dict): A dictionary containing model predictions.
59
+
60
+ Returns:
61
+ trimesh.Scene: A 3D scene object.
62
+ """
63
+ # Convert predictions to numpy arrays
64
+ vertices_3d = predictions["points3D"].cpu().numpy()
65
+ colors_rgb = (predictions["points3D_rgb"].cpu().numpy() * 255).astype(
66
+ np.uint8
67
+ )
68
+
69
+
70
+ if True:
71
+ pcd = o3d.geometry.PointCloud()
72
+ pcd.points = o3d.utility.Vector3dVector(vertices_3d)
73
+ pcd.colors = o3d.utility.Vector3dVector(colors_rgb)
74
+
75
+ cl, ind = pcd.remove_statistical_outlier(nb_neighbors=20, std_ratio=1.0)
76
+ filtered_pcd = pcd.select_by_index(ind)
77
+
78
+ print(f"Filter out {len(vertices_3d) - len(filtered_pcd.points)} 3D points")
79
+ vertices_3d = np.asarray(filtered_pcd.points)
80
+ colors_rgb = np.asarray(filtered_pcd.colors).astype(np.uint8)
81
+
82
+
83
+
84
+ camera_matrices = predictions["extrinsics_opencv"].cpu().numpy()
85
+
86
+ # Calculate the 5th and 95th percentiles along each axis
87
+ lower_percentile = np.percentile(vertices_3d, 5, axis=0)
88
+ upper_percentile = np.percentile(vertices_3d, 95, axis=0)
89
+
90
+ # Calculate the diagonal length of the percentile bounding box
91
+ scene_scale = np.linalg.norm(upper_percentile - lower_percentile)
92
+
93
+ colormap = matplotlib.colormaps.get_cmap("gist_rainbow")
94
+
95
+ # Initialize a 3D scene
96
+ scene_3d = trimesh.Scene()
97
+
98
+ # Add point cloud data to the scene
99
+ point_cloud_data = trimesh.PointCloud(
100
+ vertices=vertices_3d, colors=colors_rgb
101
+ )
102
+
103
+ scene_3d.add_geometry(point_cloud_data)
104
+
105
+ # Prepare 4x4 matrices for camera extrinsics
106
+ num_cameras = len(camera_matrices)
107
+ extrinsics_matrices = np.zeros((num_cameras, 4, 4))
108
+ extrinsics_matrices[:, :3, :4] = camera_matrices
109
+ extrinsics_matrices[:, 3, 3] = 1
110
+
111
+ # Add camera models to the scene
112
+ for i in range(num_cameras):
113
+ world_to_camera = extrinsics_matrices[i]
114
+ camera_to_world = np.linalg.inv(world_to_camera)
115
+ rgba_color = colormap(i / num_cameras)
116
+ current_color = tuple(int(255 * x) for x in rgba_color[:3])
117
+
118
+ integrate_camera_into_scene(
119
+ scene_3d, camera_to_world, current_color, scene_scale
120
+ )
121
+
122
+ # Align scene to the observation of the first camera
123
+ scene_3d = apply_scene_alignment(scene_3d, extrinsics_matrices)
124
+
125
+ return scene_3d
126
+
127
+
128
+ def apply_scene_alignment(
129
+ scene_3d: trimesh.Scene, extrinsics_matrices: np.ndarray
130
+ ) -> trimesh.Scene:
131
+ """
132
+ Aligns the 3D scene based on the extrinsics of the first camera.
133
+
134
+ Args:
135
+ scene_3d (trimesh.Scene): The 3D scene to be aligned.
136
+ extrinsics_matrices (np.ndarray): Camera extrinsic matrices.
137
+
138
+ Returns:
139
+ trimesh.Scene: Aligned 3D scene.
140
+ """
141
+ # Set transformations for scene alignment
142
+ opengl_conversion_matrix = get_opengl_conversion_matrix()
143
+
144
+ # Rotation matrix for alignment (180 degrees around the y-axis)
145
+ align_rotation = np.eye(4)
146
+ align_rotation[:3, :3] = Rotation.from_euler(
147
+ "y", 180, degrees=True
148
+ ).as_matrix()
149
+
150
+ # Apply transformation
151
+ initial_transformation = (
152
+ np.linalg.inv(extrinsics_matrices[0])
153
+ @ opengl_conversion_matrix
154
+ @ align_rotation
155
+ )
156
+ scene_3d.apply_transform(initial_transformation)
157
+ return scene_3d
158
+
159
+
160
+ def integrate_camera_into_scene(
161
+ scene: trimesh.Scene,
162
+ transform: np.ndarray,
163
+ face_colors: tuple,
164
+ scene_scale: float,
165
+ ):
166
+ """
167
+ Integrates a fake camera mesh into the 3D scene.
168
+
169
+ Args:
170
+ scene (trimesh.Scene): The 3D scene to add the camera model.
171
+ transform (np.ndarray): Transformation matrix for camera positioning.
172
+ face_colors (tuple): Color of the camera face.
173
+ scene_scale (float): Scale of the scene.
174
+ """
175
+
176
+ cam_width = scene_scale * 0.05
177
+ cam_height = scene_scale * 0.1
178
+
179
+ # Create cone shape for camera
180
+ rot_45_degree = np.eye(4)
181
+ rot_45_degree[:3, :3] = Rotation.from_euler(
182
+ "z", 45, degrees=True
183
+ ).as_matrix()
184
+ rot_45_degree[2, 3] = -cam_height
185
+
186
+ opengl_transform = get_opengl_conversion_matrix()
187
+ # Combine transformations
188
+ complete_transform = transform @ opengl_transform @ rot_45_degree
189
+ camera_cone_shape = trimesh.creation.cone(cam_width, cam_height, sections=4)
190
+
191
+ # Generate mesh for the camera
192
+ slight_rotation = np.eye(4)
193
+ slight_rotation[:3, :3] = Rotation.from_euler(
194
+ "z", 2, degrees=True
195
+ ).as_matrix()
196
+
197
+ vertices_combined = np.concatenate(
198
+ [
199
+ camera_cone_shape.vertices,
200
+ 0.95 * camera_cone_shape.vertices,
201
+ transform_points(slight_rotation, camera_cone_shape.vertices),
202
+ ]
203
+ )
204
+ vertices_transformed = transform_points(
205
+ complete_transform, vertices_combined
206
+ )
207
+
208
+ mesh_faces = compute_camera_faces(camera_cone_shape)
209
+
210
+ # Add the camera mesh to the scene
211
+ camera_mesh = trimesh.Trimesh(
212
+ vertices=vertices_transformed, faces=mesh_faces
213
+ )
214
+ camera_mesh.visual.face_colors[:, :3] = face_colors
215
+ scene.add_geometry(camera_mesh)
216
+
217
+
218
+ def compute_camera_faces(cone_shape: trimesh.Trimesh) -> np.ndarray:
219
+ """
220
+ Computes the faces for the camera mesh.
221
+
222
+ Args:
223
+ cone_shape (trimesh.Trimesh): The shape of the camera cone.
224
+
225
+ Returns:
226
+ np.ndarray: Array of faces for the camera mesh.
227
+ """
228
+ # Create pseudo cameras
229
+ faces_list = []
230
+ num_vertices_cone = len(cone_shape.vertices)
231
+
232
+ for face in cone_shape.faces:
233
+ if 0 in face:
234
+ continue
235
+ v1, v2, v3 = face
236
+ v1_offset, v2_offset, v3_offset = face + num_vertices_cone
237
+ v1_offset_2, v2_offset_2, v3_offset_2 = face + 2 * num_vertices_cone
238
+
239
+ faces_list.extend(
240
+ [
241
+ (v1, v2, v2_offset),
242
+ (v1, v1_offset, v3),
243
+ (v3_offset, v2, v3),
244
+ (v1, v2, v2_offset_2),
245
+ (v1, v1_offset_2, v3),
246
+ (v3_offset_2, v2, v3),
247
+ ]
248
+ )
249
+
250
+ faces_list += [(v3, v2, v1) for v1, v2, v3 in faces_list]
251
+ return np.array(faces_list)
252
+
253
+
254
+ def transform_points(
255
+ transformation: np.ndarray, points: np.ndarray, dim: int = None
256
+ ) -> np.ndarray:
257
+ """
258
+ Applies a 4x4 transformation to a set of points.
259
+
260
+ Args:
261
+ transformation (np.ndarray): Transformation matrix.
262
+ points (np.ndarray): Points to be transformed.
263
+ dim (int, optional): Dimension for reshaping the result.
264
+
265
+ Returns:
266
+ np.ndarray: Transformed points.
267
+ """
268
+ points = np.asarray(points)
269
+ initial_shape = points.shape[:-1]
270
+ dim = dim or points.shape[-1]
271
+
272
+ # Apply transformation
273
+ transformation = transformation.swapaxes(
274
+ -1, -2
275
+ ) # Transpose the transformation matrix
276
+ points = points @ transformation[..., :-1, :] + transformation[..., -1:, :]
277
+
278
+ # Reshape the result
279
+ result = points[..., :dim].reshape(*initial_shape, dim)
280
+ return result
281
+
282
+
283
+ def get_opengl_conversion_matrix() -> np.ndarray:
284
+ """
285
+ Constructs and returns the OpenGL conversion matrix.
286
+
287
+ Returns:
288
+ numpy.ndarray: A 4x4 OpenGL conversion matrix.
289
+ """
290
+ # Create an identity matrix
291
+ matrix = np.identity(4)
292
+
293
+ # Flip the y and z axes
294
+ matrix[1, 1] = -1
295
+ matrix[2, 2] = -1
296
+
297
+ return matrix
requirements.txt ADDED
@@ -0,0 +1,28 @@
1
+ torch==2.4.0
2
+ torchvision==0.19.0
3
+ hydra-core==1.3.2
4
+ scipy
5
+ omegaconf
6
+ opencv-python
7
+ einops
8
+ numpy==1.26.3
9
+ viser
10
+
11
+
12
+
13
+
14
+ # accelerate==0.24.0
15
+ # git+https://github.com/cvg/LightGlue.git#egg=LightGlue
16
+ # pycolmap==0.6.1
17
+ # https://huggingface.co/facebook/VGGSfM/resolve/main/poselib-2.0.2-cp310-cp310-linux_x86_64.whl
18
+ # trimesh
19
+ # open3d
20
+
21
+ # hydra-core==1.3.2
22
+ # scipy
23
+ # omegaconf
24
+ # opencv-python
25
+ # einops
26
+ # numpy==1.26.3
27
+ # trimesh
28
+ # open3d
vggt/heads/camera_head.py ADDED
@@ -0,0 +1,220 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ import numpy as np
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from hydra.utils import instantiate
14
+
15
+ from vggt.layers.block import Block
16
+
17
+ from vggt.layers import Mlp
18
+ from vggt.heads.utils import PoseEmbedding
19
+ from vggt.heads.head_act import activate_pose
20
+
21
+ def modulate(x, shift, scale):
22
+ # modified from https://github.com/facebookresearch/DiT/blob/796c29e532f47bba17c5b9c5eb39b9354b8b7c64/models.py#L19
23
+ return x * (1 + scale) + shift
24
+
25
+
26
+
27
+ class CameraHead(nn.Module):
28
+ def __init__(
29
+ self,
30
+ dim_in=2048,
31
+ patch_size=14,
32
+ qk_norm=False,
33
+ trunk_depth=4,
34
+ new_trunk=True,
35
+ update_new_trunk_tokens=False,
36
+ pose_encoding_type="absT_quaR_FoV",
37
+ proj_dim=-1,
38
+ num_heads=16,
39
+ mlp_ratio=4,
40
+ init_values=None,
41
+ act_dict=None,
42
+ **kwargs,
43
+ ):
44
+ super().__init__()
45
+
46
+ # Three types:
47
+ # 1. Linear projection
48
+ # 2. New trunk
49
+ # 3. Old trunk
50
+
51
+ self.new_trunk = new_trunk
52
+ if pose_encoding_type=="absT_quaR_FoV":
53
+ self.target_dim = 9
54
+ elif pose_encoding_type=="absT_quaR_OneFLM1":
55
+ self.target_dim = 8
56
+ else:
57
+ raise ValueError(f"Unsupported pose encoding type: {pose_encoding_type}")
58
+
59
+ self.update_new_trunk_tokens = update_new_trunk_tokens
60
+ self.act_dict = act_dict
61
+ self.trunk_depth = trunk_depth
62
+
63
+ self.token_norm = nn.LayerNorm(dim_in)
64
+
65
+ if proj_dim > 0:
66
+ self.proj = nn.Linear(dim_in, proj_dim)
67
+ dim_in = proj_dim
68
+ else:
69
+ self.proj = nn.Identity()
70
+
71
+ if self.trunk_depth <0:
72
+ self.pose_branch = nn.Linear(dim_in, self.target_dim)
73
+ else:
74
+ self.trunk = nn.Sequential(
75
+ *[
76
+ Block(
77
+ dim=dim_in,
78
+ num_heads=num_heads,
79
+ mlp_ratio=mlp_ratio,
80
+ qk_norm=qk_norm,
81
+ init_values=init_values,
82
+ )
83
+ for _ in range(trunk_depth)
84
+ ]
85
+ )
86
+ self.trunk_norm = nn.LayerNorm(dim_in)
87
+
88
+ if self.new_trunk:
89
+ # TODO: self.empty_pose_tokens -> BxSxC
90
+ self.empty_pose_tokens = nn.Parameter(torch.zeros(1, 1, self.target_dim))
91
+ self.embed_pose = nn.Linear(self.target_dim, dim_in)
92
+
93
+ self.poseLN_modulation = nn.Sequential(
94
+ nn.SiLU(),
95
+ nn.Linear(dim_in, 3 * dim_in, bias=True)
96
+ )
97
+
98
+ self.adaln_norm = nn.LayerNorm(dim_in, elementwise_affine=False, eps=1e-6)
99
+ self.pose_branch = Mlp(
100
+ in_features=dim_in,
101
+ hidden_features=dim_in // 2,
102
+ out_features=self.target_dim,
103
+ drop=0,
104
+ )
105
+ else:
106
+ self.ffeat_norm = nn.LayerNorm(dim_in)
107
+ self.pose_branch = Mlp(
108
+ in_features=dim_in,
109
+ hidden_features=dim_in * 2,
110
+ out_features=dim_in + self.target_dim,
111
+ drop=0,
112
+ )
113
+
114
+ self.ffeat_updater = nn.Sequential(
115
+ nn.Linear(dim_in, dim_in), nn.GELU()
116
+ )
117
+
118
+ # sine and cosine embed for camera parameters
119
+ self.embed_pose = PoseEmbedding(
120
+ target_dim=self.target_dim,
121
+ n_harmonic_functions=(dim_in // self.target_dim) // 2,
122
+ append_input=False,
123
+ )
124
+ self.embed_pose_proj = nn.Linear(self.embed_pose.out_dim, dim_in)
125
+
126
+
127
+ def forward(self, aggregated_tokens_list, batch, patch_start_idx, iters=4,):
128
+ """
129
+ """
130
+ tokens = aggregated_tokens_list[-1]
131
+ # only use the Pose token for camera prediction
132
+ pose_tokens = tokens[:, :, 0]
133
+ pose_tokens = self.token_norm(pose_tokens)
134
+ pose_tokens = self.proj(pose_tokens)
135
+
136
+ B, S, C = pose_tokens.shape
137
+
138
+ if self.trunk_depth < 0:
139
+ pred_pose_enc = self.pose_branch(pose_tokens)
140
+ pred_pose_enc_list = [activate_pose(pred_pose_enc, **self.act_dict)]
141
+ elif self.new_trunk:
142
+ pred_pose_enc_list = self.new_trunk_fn(pose_tokens, iters)
143
+ else:
144
+ pred_pose_enc_list = self.old_trunk_fn(pose_tokens, iters)
145
+
146
+
147
+ # TODO add act here
148
+ return pred_pose_enc_list
149
+
150
+
151
+ def new_trunk_fn(self, pose_tokens, iters):
152
+ B, S, C = pose_tokens.shape
153
+
154
+ pred_pose_enc = None
155
+ pose_tokens_init = pose_tokens.clone()
156
+
157
+ pred_pose_enc_list = []
158
+
159
+ for iter_num in range(iters):
160
+ if pred_pose_enc is None:
161
+ # model_input = self.empty_representation BxSxC
162
+ module_input = self.embed_pose(self.empty_pose_tokens.expand(B, S, -1))
163
+ else:
164
+ pred_pose_enc = pred_pose_enc.detach()
165
+ module_input = self.embed_pose(pred_pose_enc)
166
+
167
+ shift_msa, scale_msa, gate_msa = self.poseLN_modulation(module_input).chunk(3, dim=-1)
168
+ pose_tokens_modulated = gate_msa * modulate(self.adaln_norm(pose_tokens), shift_msa, scale_msa)
169
+ pose_tokens_modulated = pose_tokens_modulated + pose_tokens
170
+
171
+ pose_tokens_modulated = self.trunk(pose_tokens_modulated)
172
+ pred_pose_enc_delta = self.pose_branch(self.trunk_norm(pose_tokens_modulated))
173
+
174
+ if pred_pose_enc is None:
175
+ pred_pose_enc = pred_pose_enc_delta
176
+ else:
177
+ pred_pose_enc = pred_pose_enc + pred_pose_enc_delta
178
+
179
+ if self.update_new_trunk_tokens:
180
+ pose_tokens = pose_tokens_modulated + pose_tokens_init
181
+
182
+ pred_pose_enc_list.append(activate_pose(pred_pose_enc, **self.act_dict))
183
+
184
+ return pred_pose_enc_list
185
+
186
+
187
+ def old_trunk_fn(self, pose_tokens, iters):
188
+ B, S, C = pose_tokens.shape
189
+
190
+ pred_pose_enc = torch.zeros(B, S, self.target_dim).to(
191
+ pose_tokens.device
192
+ )
193
+
194
+ pose_tokens_init = pose_tokens.clone()
195
+
196
+ pred_pose_enc_list = []
197
+
198
+ for iter_num in range(iters):
199
+ pred_pose_enc = pred_pose_enc.detach()
200
+
201
+ # Embed the camera parameters and add to pose_tokens
202
+ pose_embed = self.embed_pose_proj(self.embed_pose(pred_pose_enc))
203
+ pose_tokens = pose_tokens + pose_embed
204
+
205
+ # Run trunk transformers on pose_tokens
206
+ pose_tokens = self.trunk(pose_tokens)
207
+
208
+ # Predict the delta feat and pose encoding at each iteration
209
+ delta = self.pose_branch(self.trunk_norm(pose_tokens))
210
+ delta_pred_pose_enc = delta[..., : self.target_dim]
211
+ delta_feat = delta[..., self.target_dim :]
212
+
213
+ pose_tokens = self.ffeat_updater(self.ffeat_norm(delta_feat)) + pose_tokens
214
+
215
+ pred_pose_enc = pred_pose_enc + delta_pred_pose_enc
216
+ pose_tokens = (pose_tokens + pose_tokens_init) / 2
217
+ pred_pose_enc_list.append(activate_pose(pred_pose_enc, **self.act_dict))
218
+
219
+ return pred_pose_enc_list
220
+
vggt/heads/dpt_head.py ADDED
@@ -0,0 +1,521 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # linear head implementation for DUST3R
6
+ # --------------------------------------------------------
7
+
8
+ import os
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from .head_act import activate_head
13
+ from .utils import normalized_view_plane_uv, HarmonicEmbedding, position_grid_to_embed
14
+
15
+ class DPTHead(nn.Module):
16
+ """
17
+ """
18
+ def __init__(self,
19
+ dim_in,
20
+ patch_size = 14,
21
+ output_dim = 4,
22
+ normalize_act="inv_log",
23
+ normalize_act_conf = "expp1",
24
+ features=256,
25
+ use_bn=False,
26
+ use_clstoken=False,
27
+ out_channels=[256, 512, 1024, 1024],
28
+ intermediate_layer_idx=[4, 11, 17, 23],
29
+ shared_norm = True,
30
+ add_rgb = False,
31
+ head_use_checkpoint=False,
32
+ groups=1,
33
+ shallow_conv=False,
34
+ load_da_str=None,
35
+ dpt_layer_norm=False,
36
+ pos_embed = False,
37
+ feature_only = False,
38
+ down_ratio = 1,
39
+ **kwargs,
40
+ ):
41
+ super(DPTHead, self).__init__()
42
+
43
+ in_channels = dim_in
44
+ self.add_rgb = add_rgb
45
+ self.patch_size = patch_size
46
+ self.intermediate_layer_idx = intermediate_layer_idx
47
+ self.shared_norm = shared_norm
48
+ self.normalize_act = normalize_act
49
+ self.normalize_act_conf = normalize_act_conf
50
+ self.head_use_checkpoint = head_use_checkpoint
51
+ self.pos_embed = pos_embed
52
+ self.feature_only = feature_only
53
+ self.down_ratio = down_ratio
54
+
55
+ # if self.pos_embed:
56
+ # self.pose_embed_fn_64 = HarmonicEmbedding(n_harmonic_functions=64, omega_0=1.0, logspace=True, append_input=False)
57
+ # self.pose_embed_fn_128 = HarmonicEmbedding(n_harmonic_functions=128, omega_0=1.0, logspace=True, append_input=False)
58
+ # self.pose_embed_fn_256 = HarmonicEmbedding(n_harmonic_functions=256, omega_0=1.0, logspace=True, append_input=False)
59
+ # self.pose_embed_fn_512 = HarmonicEmbedding(n_harmonic_functions=512, omega_0=1.0, logspace=True, append_input=False)
60
+ # self.pose_embed_fn_1024 = HarmonicEmbedding(n_harmonic_functions=1024, omega_0=1.0, logspace=True, append_input=False)
61
+
62
+ if self.shared_norm:
63
+ self.norm = nn.LayerNorm(in_channels)
64
+ else:
65
+ self.norm = nn.ModuleList([nn.LayerNorm(in_channels) for _ in range(len(self.intermediate_layer_idx))])
66
+
67
+ self.use_clstoken = use_clstoken
68
+
69
+ self.projects = nn.ModuleList([
70
+ nn.Conv2d(
71
+ in_channels=in_channels,
72
+ out_channels=out_channel,
73
+ kernel_size=1,
74
+ stride=1,
75
+ padding=0,
76
+ ) for out_channel in out_channels
77
+ ])
78
+
79
+ self.resize_layers = nn.ModuleList([
80
+ nn.ConvTranspose2d(
81
+ in_channels=out_channels[0],
82
+ out_channels=out_channels[0],
83
+ kernel_size=4,
84
+ stride=4,
85
+ padding=0),
86
+ nn.ConvTranspose2d(
87
+ in_channels=out_channels[1],
88
+ out_channels=out_channels[1],
89
+ kernel_size=2,
90
+ stride=2,
91
+ padding=0),
92
+ nn.Identity(),
93
+ nn.Conv2d(
94
+ in_channels=out_channels[3],
95
+ out_channels=out_channels[3],
96
+ kernel_size=3,
97
+ stride=2,
98
+ padding=1)
99
+ ])
100
+
101
+ if use_clstoken:
102
+ raise ValueError("CLS token is not supported for DPT head Now")
103
+ self.readout_projects = nn.ModuleList()
104
+ for _ in range(len(self.projects)):
105
+ self.readout_projects.append(
106
+ nn.Sequential(
107
+ nn.Linear(2 * in_channels, in_channels),
108
+ nn.GELU()))
109
+
110
+ self.scratch = _make_scratch(
111
+ out_channels,
112
+ features,
113
+ groups=1,
114
+ expand=False,
115
+ )
116
+
117
+ self.scratch.stem_transpose = None
118
+
119
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn, groups=groups, shallow_conv=shallow_conv, dpt_layer_norm=dpt_layer_norm)
120
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn, groups=groups, shallow_conv=shallow_conv, dpt_layer_norm=dpt_layer_norm)
121
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn, groups=groups, shallow_conv=shallow_conv, dpt_layer_norm=dpt_layer_norm)
122
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn, has_residual=False, groups=groups, shallow_conv=shallow_conv, dpt_layer_norm=dpt_layer_norm)
123
+
124
+ head_features_1 = features
125
+ head_features_2 = 32
126
+
127
+
128
+
129
+
130
+ if not self.feature_only:
131
+ self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1)
132
+ conv2_in_channels = head_features_1 // 2 + 3 * int(self.add_rgb)
133
+
134
+ if dpt_layer_norm:
135
+ self.scratch.output_conv2 = nn.Sequential(
136
+ ChannelLayerNorm(conv2_in_channels),
137
+ nn.Conv2d(conv2_in_channels, head_features_2, kernel_size=3, stride=1, padding=1),
138
+ nn.ReLU(True),
139
+ ChannelLayerNorm(head_features_2),
140
+ nn.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0),
141
+ # nn.ReLU(True),
142
+ # nn.Identity(),
143
+ )
144
+ else:
145
+ self.scratch.output_conv2 = nn.Sequential(
146
+ nn.Conv2d(conv2_in_channels, head_features_2, kernel_size=3, stride=1, padding=1),
147
+ nn.ReLU(True),
148
+ nn.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0),
149
+ # nn.ReLU(True),
150
+ # nn.Identity(),
151
+ )
152
+ else:
153
+ self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1, kernel_size=3, stride=1, padding=1)
154
+
155
+
156
+
157
+ if load_da_str is not None:
158
+ from off3d.utils.train_utils import remove_if_not_match
159
+
160
+ da_path = os.path.join(torch.hub.get_dir(), load_da_str)
161
+ da_model = torch.load(da_path)
162
+ to_load_dict = {}
163
+ for k in da_model.keys():
164
+ if "depth_head" in k:
165
+ to_load_dict[k.replace("depth_head.", "")] = da_model[k]
166
+ all_keys = list(to_load_dict.keys())
167
+ model_state_dict = self.state_dict()
168
+ for cur_key in all_keys:
169
+ to_load_dict = remove_if_not_match(model_state_dict, to_load_dict, cur_key)
170
+
171
+ missing, unexpected = self.load_state_dict(to_load_dict, strict=False)
172
+
173
+ print("Missing keys in DPT head: ", missing)
174
+ print("Unexpected keys in DPT head: ", unexpected)
175
+ for layer in self.scratch.output_conv2:
176
+ if isinstance(layer, (nn.Conv2d, nn.Linear)):
177
+ layer.weight.data *= 0.1
178
+ layer.bias.data *= 0.1
179
+
180
+
181
+
182
+
183
+
184
+ def forward(self, aggregated_tokens_list, batch, patch_start_idx):
185
+
186
+ B, _, _, H, W = batch["images"].shape
187
+ S = aggregated_tokens_list[0].shape[1]
188
+
189
+ patch_h, patch_w = H // self.patch_size, W // self.patch_size
190
+
191
+ # TODO use rgb as input for the DPT head
192
+
193
+ out = []
194
+
195
+ dpt_idx = 0
196
+
197
+ for layer_idx in self.intermediate_layer_idx:
198
+ if self.use_clstoken:
199
+ raise NotImplementedError("CLS token is not supported for DPT head Now")
200
+ x = aggregated_tokens_list[layer_idx][:, :, patch_start_idx:]
201
+ x = x.view(B*S, -1, x.shape[-1])
202
+
203
+ if self.shared_norm:
204
+ x = self.norm(x)
205
+ else:
206
+ x = self.norm[dpt_idx](x)
207
+
208
+ x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))
209
+
210
+ if self.head_use_checkpoint:
211
+ # e.g., from Bx2048xpatch_h*patch_w to Bx256xpatch_h*patch_w
212
+ x = torch.utils.checkpoint.checkpoint(self.projects[dpt_idx], x, use_reentrant=False)
213
+ if self.pos_embed:
214
+ x = self._apply_pos_embed(x, W, H)
215
+ x = torch.utils.checkpoint.checkpoint(self.resize_layers[dpt_idx], x, use_reentrant=False)
216
+ else:
217
+ x = self.projects[dpt_idx](x)
218
+ if self.pos_embed:
219
+ x = self._apply_pos_embed(x, W, H)
220
+ x = self.resize_layers[dpt_idx](x)
221
+
222
+ out.append(x)
223
+ dpt_idx += 1
224
+
225
+ if self.head_use_checkpoint:
226
+ out = torch.utils.checkpoint.checkpoint(self.scratch_forward, out, use_reentrant=False)
227
+ else:
228
+ out = self.scratch_forward(out)
229
+
230
+ # out = F.interpolate(out, (int(patch_h * self.patch_size), int(patch_w * self.patch_size)), mode="bilinear", align_corners=True)
231
+ out = custom_interpolate(out, (int(patch_h * self.patch_size / self.down_ratio), int(patch_w * self.patch_size / self.down_ratio)), mode="bilinear", align_corners=True)
232
+
233
+ if self.pos_embed:
234
+ out = self._apply_pos_embed(out, W, H)
235
+
236
+ if self.feature_only:
237
+ return out
238
+
239
+
240
+ if self.add_rgb:
241
+ # NOTE batch["images"] is in the range of [0, 1]
242
+ out = torch.cat([out, batch["images"].view(B*S, 3, H, W).clip(0, 1)], dim=1)
243
+
244
+
245
+ if self.head_use_checkpoint:
246
+ out = torch.utils.checkpoint.checkpoint(self.scratch.output_conv2, out, use_reentrant=False)
247
+ else:
248
+ out = self.scratch.output_conv2(out)
249
+
250
+ preds, conf = activate_head(out, normalize_act=self.normalize_act, normalize_act_conf=self.normalize_act_conf)
251
+
252
+ # back to B, S
253
+ # B, S, H, W, 3
254
+ preds = preds.view(B, S, *preds.shape[1:])
255
+ # B, S, H, W
256
+ conf = conf.view(B, S, *conf.shape[1:])
257
+
258
+ return preds, conf
259
+
260
+
261
+ def _apply_pos_embed(self, x, W, H, ratio=0.1):
262
+ """Apply positional embedding to the input tensor."""
263
+ patch_w = x.shape[-1]
264
+ patch_h = x.shape[-2]
265
+
266
+ pos_embed = normalized_view_plane_uv(patch_w, patch_h, aspect_ratio=W/H, dtype=x.dtype, device=x.device)
267
+
268
+ pos_embed = position_grid_to_embed(pos_embed, x.shape[1])
269
+ pos_embed = pos_embed * ratio
270
+ pos_embed = pos_embed.permute(2, 0, 1)[None].expand(x.shape[0], -1, -1, -1)
271
+ return x + pos_embed
272
+
273
+
274
+ def scratch_forward(self, out):
275
+ layer_1, layer_2, layer_3, layer_4 = out
276
+
277
+
278
+ layer_1_rn = self.scratch.layer1_rn(layer_1) # layer_1:[32, 256, 148, 148]
279
+ layer_2_rn = self.scratch.layer2_rn(layer_2) # layer_2:[32, 512, 74, 74]
280
+ layer_3_rn = self.scratch.layer3_rn(layer_3) # layer_3:[32, 1024, 37, 37]
281
+ layer_4_rn = self.scratch.layer4_rn(layer_4) # layer_4:[32, 1024, 19, 19]
282
+
283
+ out = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
284
+ del layer_4_rn, layer_4
285
+
286
+ out = self.scratch.refinenet3(out, layer_3_rn, size=layer_2_rn.shape[2:])
287
+ del layer_3_rn, layer_3
288
+
289
+ out = self.scratch.refinenet2(out, layer_2_rn, size=layer_1_rn.shape[2:])
290
+ del layer_2_rn, layer_2
291
+
292
+ out = self.scratch.refinenet1(out, layer_1_rn)
293
+ del layer_1_rn, layer_1
294
+
295
+ out = self.scratch.output_conv1(out)
296
+ return out
297
+
298
+
299
+
300
+
301
+
302
+ ################################################################################
303
+
304
+ # Modules
305
+
306
+
307
+
308
+ def _make_fusion_block(features, use_bn, size=None, has_residual=True, groups=1, shallow_conv=False, dpt_layer_norm=False):
309
+ return FeatureFusionBlock(
310
+ features,
311
+ nn.ReLU(True),
312
+ deconv=False,
313
+ bn=use_bn,
314
+ expand=False,
315
+ align_corners=True,
316
+ size=size,
317
+ has_residual=has_residual,
318
+ groups=groups,
319
+ shallow_conv=shallow_conv,
320
+ dpt_layer_norm=dpt_layer_norm,
321
+ )
322
+
323
+
324
+
325
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
326
+ scratch = nn.Module()
327
+
328
+ out_shape1 = out_shape
329
+ out_shape2 = out_shape
330
+ out_shape3 = out_shape
331
+ if len(in_shape) >= 4:
332
+ out_shape4 = out_shape
333
+
334
+ if expand:
335
+ out_shape1 = out_shape
336
+ out_shape2 = out_shape * 2
337
+ out_shape3 = out_shape * 4
338
+ if len(in_shape) >= 4:
339
+ out_shape4 = out_shape * 8
340
+
341
+ scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
342
+ scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
343
+ scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
344
+ if len(in_shape) >= 4:
345
+ scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
346
+
347
+ return scratch
348
+
349
+
350
+
351
+
352
+ class ResidualConvUnit(nn.Module):
353
+ """Residual convolution module.
354
+ """
355
+
356
+ def __init__(self, features, activation, bn, groups=1, shallow_conv=False, dpt_layer_norm=False):
357
+ """Init.
358
+
359
+ Args:
360
+ features (int): number of features
361
+ """
362
+ super().__init__()
363
+
364
+ self.bn = bn
365
+
366
+ self.groups=groups
367
+
368
+ self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
369
+
370
+ self.shallow_conv = shallow_conv
371
+ if not self.shallow_conv:
372
+ self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
373
+
374
+ # if self.bn == True:
375
+ # self.bn1 = nn.BatchNorm2d(features)
376
+ # self.bn2 = nn.BatchNorm2d(features)
377
+ # otherwise, use channel-wise LayerNorm when dpt_layer_norm is set:
378
+
379
+ if dpt_layer_norm:
380
+ self.norm1 = ChannelLayerNorm(features)
381
+ self.norm2 = ChannelLayerNorm(features)
382
+ else:
383
+ self.norm1 = None
384
+ self.norm2 = None
385
+
386
+ self.activation = activation
387
+
388
+ self.skip_add = nn.quantized.FloatFunctional()
389
+
390
+ def forward(self, x):
391
+ """Forward pass.
392
+
393
+ Args:
394
+ x (tensor): input
395
+
396
+ Returns:
397
+ tensor: output
398
+ """
399
+
400
+ out = self.activation(x)
401
+ out = self.conv1(out)
402
+ if self.norm1 is not None:
403
+ out = self.norm1(out)
404
+
405
+ if not self.shallow_conv:
406
+ out = self.activation(out)
407
+ out = self.conv2(out)
408
+ if self.norm2 is not None:
409
+ out = self.norm2(out)
410
+
411
+ # if self.groups > 1:
412
+ # out = self.conv_merge(out)
413
+
414
+ return self.skip_add.add(out, x)
415
+
416
+
417
+ class FeatureFusionBlock(nn.Module):
418
+ """Feature fusion block.
419
+ """
420
+
421
+ def __init__(
422
+ self,
423
+ features,
424
+ activation,
425
+ deconv=False,
426
+ bn=False,
427
+ expand=False,
428
+ align_corners=True,
429
+ size=None,
430
+ has_residual=True,
431
+ groups=1,
432
+ shallow_conv=False,
433
+ dpt_layer_norm=False,
434
+ ):
435
+ """Init.
436
+
437
+ Args:
438
+ features (int): number of features
439
+ """
440
+ super(FeatureFusionBlock, self).__init__()
441
+
442
+ self.deconv = deconv
443
+ self.align_corners = align_corners
444
+
445
+ self.groups=groups
446
+
447
+ self.expand = expand
448
+ out_features = features
449
+ if self.expand == True:
450
+ out_features = features // 2
451
+
452
+ self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=self.groups)
453
+
454
+ if has_residual:
455
+ self.resConfUnit1 = ResidualConvUnit(features, activation, bn, groups=self.groups, shallow_conv=shallow_conv, dpt_layer_norm=dpt_layer_norm)
456
+
457
+ self.has_residual = has_residual
458
+ self.resConfUnit2 = ResidualConvUnit(features, activation, bn, groups=self.groups, shallow_conv=shallow_conv, dpt_layer_norm=dpt_layer_norm)
459
+
460
+ self.skip_add = nn.quantized.FloatFunctional()
461
+
462
+ self.size=size
463
+
464
+ def forward(self, *xs, size=None):
465
+ """Forward pass.
466
+
467
+ Returns:
468
+ tensor: output
469
+ """
470
+ output = xs[0]
471
+
472
+ if self.has_residual:
473
+ res = self.resConfUnit1(xs[1])
474
+ output = self.skip_add.add(output, res)
475
+
476
+ output = self.resConfUnit2(output)
477
+
478
+ if (size is None) and (self.size is None):
479
+ modifier = {"scale_factor": 2}
480
+ elif size is None:
481
+ modifier = {"size": self.size}
482
+ else:
483
+ modifier = {"size": size}
484
+
485
+ # output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners)
486
+ output = custom_interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners)
487
+
488
+ output = self.out_conv(output)
489
+
490
+ return output
491
+
492
+
493
+
494
+ def custom_interpolate(x, size=None, scale_factor=None, mode="bilinear", align_corners=True):
495
+ if size is None:
496
+ size = (int(x.shape[-2] * scale_factor), int(x.shape[-1] * scale_factor))
497
+ INT_MAX = 1610612736
498
+
499
+ input_elements = size[0] * size[1] * x.shape[0] * x.shape[1]
500
+
501
+ if input_elements > INT_MAX:
502
+ # Split x into chunks along the batch dimension
503
+ chunks = torch.chunk(x, chunks=(input_elements // INT_MAX) + 1, dim=0)
504
+ interpolated_chunks = [nn.functional.interpolate(chunk, size=size, mode=mode, align_corners=align_corners) for chunk in chunks]
505
+ x = torch.cat(interpolated_chunks, dim=0)
506
+ return x.contiguous()
507
+ else:
508
+ return nn.functional.interpolate(x, size=size, mode=mode, align_corners=align_corners)
509
+
510
+
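A quick usage sketch of the custom_interpolate defined above (shapes and the module path are illustrative assumptions): small tensors go straight through F.interpolate, and only outputs exceeding INT_MAX elements are chunked along the batch dimension and re-concatenated.

import torch
from vggt.heads.dpt_head import custom_interpolate   # assumed module path for this file

x = torch.randn(2, 16, 32, 32)
y = custom_interpolate(x, size=(64, 64), mode="bilinear", align_corners=True)
assert y.shape == (2, 16, 64, 64)   # small output: plain F.interpolate path
# only when B * C * out_H * out_W exceeds INT_MAX is x split into batch chunks and re-concatenated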
511
+ class ChannelLayerNorm(nn.Module):
512
+ def __init__(self, num_channels):
513
+ super().__init__()
514
+ self.ln = nn.LayerNorm(num_channels)
515
+
516
+ def forward(self, x):
517
+ # x: [N, C, H, W]
518
+ x = x.permute(0, 2, 3, 1) # -> [N, H, W, C]
519
+ x = self.ln(x) # now LN sees 'C' as the last dimension
520
+ x = x.permute(0, 3, 1, 2) # -> [N, C, H, W]
521
+ return x
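ChannelLayerNorm above is just nn.LayerNorm applied over the channel axis of an NCHW tensor; a minimal sketch of the equivalence (tensor sizes are illustrative, and ChannelLayerNorm is assumed importable from vggt.heads.dpt_head):

import torch
import torch.nn as nn
from vggt.heads.dpt_head import ChannelLayerNorm   # assumed module path

feat = torch.randn(2, 64, 8, 8)                                    # N, C, H, W
cln = ChannelLayerNorm(64)
ref = nn.LayerNorm(64)(feat.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
assert torch.allclose(cln(feat), ref)                              # normalized per spatial location, over channels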
vggt/heads/head_act.py ADDED
@@ -0,0 +1,97 @@
1
+
2
+
3
+
4
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
5
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
6
+ #
7
+ # --------------------------------------------------------
8
+ # post-processing functions for all heads: extract 3D points / confidence from outputs
9
+ # --------------------------------------------------------
10
+ import torch
11
+ import torch.nn.functional as F
12
+
13
+
14
+
15
+
16
+ def activate_pose(pred_pose_enc, trans_act="linear", quat_act="linear", fl_act="linear"):
17
+ T = pred_pose_enc[..., :3]
18
+ quat = pred_pose_enc[..., 3:7]
19
+ fl = pred_pose_enc[..., 7:] # or fov
20
+
21
+ T = base_pose_act(T, trans_act)
22
+ quat = base_pose_act(quat, quat_act)
23
+ fl = base_pose_act(fl, fl_act) # or fov
24
+
25
+ pred_pose_enc = torch.cat([T, quat, fl], dim=-1)
26
+
27
+ return pred_pose_enc
28
+
29
+
30
+ def base_pose_act(pose_enc, act_type="linear"):
31
+ if act_type == "linear":
32
+ return pose_enc
33
+ elif act_type == "inv_log":
34
+ return inverse_log_transform(pose_enc)
35
+ elif act_type == "exp":
36
+ return torch.exp(pose_enc)
37
+ elif act_type == "relu":
38
+ return F.relu(pose_enc)
39
+ else:
40
+ raise ValueError(f"Unknown act_type: {act_type}")
41
+
42
+
43
+
44
+ def activate_head(out, normalize_act="norm_exp", normalize_act_conf="expp1"):
45
+ """Convert raw head output of shape (B, C, H, W) into 3D point maps (B, H, W, C-1) and a confidence map (B, H, W)."""
47
+ # Move channels from dim 1 to the last dimension => (B, H, W, C)
48
+ fmap = out.permute(0, 2, 3, 1) # B,H,W, C expected
49
+
50
+ # Split into xyz (first C-1 channels) and confidence (last channel)
51
+ xyz = fmap[:, :, :, :-1]
52
+ conf = fmap[:, :, :, -1]
53
+
54
+ if normalize_act == "norm_exp":
55
+ # 1) distance d = ||xyz||
56
+ # 2) normalize xyz => xyz / d
57
+ # 3) multiply by torch.expm1(d)
58
+ d = xyz.norm(dim=-1, keepdim=True).clamp(min=1e-8)
59
+ xyz_normed = xyz / d
60
+ pts3d = xyz_normed * torch.expm1(d)
61
+ elif normalize_act == "norm":
62
+ pts3d = xyz / xyz.norm(dim=-1, keepdim=True)
63
+ elif normalize_act == "exp":
64
+ pts3d = torch.exp(xyz)
65
+ elif normalize_act == "relu":
66
+ pts3d = F.relu(xyz)
67
+ elif normalize_act == "inv_log":
68
+ pts3d = inverse_log_transform(xyz)
69
+ elif normalize_act == "xy_inv_log":
70
+ xy, z = xyz.split([2, 1], dim=-1)
71
+ z = inverse_log_transform(z)
72
+ pts3d = torch.cat([xy * z, z], dim=-1)
73
+ elif normalize_act == "sigmoid":
74
+ pts3d = torch.sigmoid(xyz)
75
+ elif normalize_act == "linear":
76
+ pts3d = xyz
77
+ else:
78
+ raise ValueError(f"Unknown normalize_act: {normalize_act}")
79
+
80
+ # reg_dense_conf for mode='exp', with vmin=1, vmax=inf
81
+ # => conf_out = 1 + e^(conf)
82
+ # (since clip(max=vmax - vmin) with vmax=inf basically doesn’t limit anything)
83
+ if normalize_act_conf == "expp1":
84
+ conf_out = 1 + conf.exp()
85
+ elif normalize_act_conf == "expp0":
86
+ conf_out = conf.exp()
87
+ elif normalize_act_conf == "sigmoid":
88
+ conf_out = torch.sigmoid(conf)
89
+ else:
90
+ raise ValueError(f"Unknown normalize_act_conf: {normalize_act_conf}")
91
+
92
+ # Final dictionary
93
+ return pts3d, conf_out
94
+
95
+
96
+ def inverse_log_transform(y):
97
+ return torch.sign(y) * (torch.expm1(torch.abs(y)))
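A minimal sketch of calling activate_head from this file (shapes are illustrative): the last channel of the raw map is treated as confidence and the remaining channels as the 3D point prediction.

import torch
from vggt.heads.head_act import activate_head   # assumed module path for the file above

out = torch.randn(2, 4, 16, 16)                  # B, C = 3 point channels + 1 confidence, H, W
pts3d, conf = activate_head(out, normalize_act="norm_exp", normalize_act_conf="expp1")
print(pts3d.shape)                               # torch.Size([2, 16, 16, 3])
print(conf.shape)                                # torch.Size([2, 16, 16]); values > 1 since conf = 1 + exp(raw)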
vggt/heads/track_head.py ADDED
@@ -0,0 +1,267 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # track head implementation (adapted from DUST3R head code)
6
+ # --------------------------------------------------------
7
+
8
+ import os
9
+ import random
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from .head_act import activate_head
14
+ from .utils import normalized_view_plane_uv, HarmonicEmbedding, position_grid_to_embed
15
+ from .dpt_head import DPTHead
16
+ from .match_head import MatchHead
17
+ from ..track_modules.base_track_predictor import BaseTrackerPredictor
18
+ from ..track_modules.base_track_predictor_v2 import BaseTrackerPredictorV2
19
+
20
+ EPS = 1e-6
21
+
22
+ def reduce_masked_mean(x, mask, dim=None, keepdim=False):
23
+ # x and mask have the same shape (or are broadcastable to it)
24
+ # returns the mean of x over the entries where mask is nonzero
25
+ # dim can be a single axis or a list of axes
26
+ for a, b in zip(x.size(), mask.size()):
27
+ assert a == b # some shape mismatch!
28
+ prod = x * mask
29
+ if dim is None:
30
+ numer = torch.sum(prod)
31
+ denom = EPS + torch.sum(mask)
32
+ else:
33
+ numer = torch.sum(prod, dim=dim, keepdim=keepdim)
34
+ denom = EPS + torch.sum(mask, dim=dim, keepdim=keepdim)
35
+
36
+ mean = numer / denom
37
+ return mean
38
+
39
+ def balanced_ce_loss(pred, gt, valid=None):
40
+ """Balanced cross entropy loss.
41
+ pred: predicted scores
42
+ gt: binary ground truth
43
+ valid: validity mask
44
+ """
45
+ # pred and gt are the same shape
46
+ for a, b in zip(pred.size(), gt.size()):
47
+ assert a == b # some shape mismatch!
48
+ if valid is not None:
49
+ for a, b in zip(pred.size(), valid.size()):
50
+ assert a == b # some shape mismatch!
51
+ else:
52
+ valid = torch.ones_like(gt)
53
+
54
+ pos = (gt > 0.95).float()
55
+ neg = (gt < 0.05).float()
56
+
57
+ label = pos * 2.0 - 1.0
58
+ a = -label * pred
59
+ b = F.relu(a)
60
+ loss = b + torch.log(torch.exp(-b) + torch.exp(a - b))
61
+
62
+ pos_loss = reduce_masked_mean(loss, pos * valid)
63
+ neg_loss = reduce_masked_mean(loss, neg * valid)
64
+
65
+ balanced_loss = pos_loss + neg_loss
66
+
67
+ return balanced_loss, loss
68
+
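The three lines computing `loss` in balanced_ce_loss above are a numerically stable form of softplus(-label * pred), i.e. the standard logistic loss; a quick illustrative check:

import torch
import torch.nn.functional as F

pred = torch.randn(5)
label = torch.tensor([1.0, -1.0, 1.0, -1.0, 1.0])
a = -label * pred
b = F.relu(a)                                             # b = max(a, 0)
stable = b + torch.log(torch.exp(-b) + torch.exp(a - b))
assert torch.allclose(stable, F.softplus(a), atol=1e-6)   # log(1 + exp(a)) without overflow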
69
+ def sequence_loss(flow_preds, flow_gt, vis, valids, gamma=0.8, vis_aware=False, huber=False, delta=10, vis_aware_w=0.1, **kwargs):
70
+ """Loss function defined over sequence of flow predictions"""
71
+ B, S, N, D = flow_gt.shape
72
+ assert D == 2
73
+ B, S1, N = vis.shape
74
+ B, S2, N = valids.shape
75
+ assert S == S1
76
+ assert S == S2
77
+ n_predictions = len(flow_preds)
78
+ flow_loss = 0.0
79
+
80
+ for i in range(n_predictions):
81
+ i_weight = gamma ** (n_predictions - i - 1)
82
+ flow_pred = flow_preds[i]
83
+
84
+ i_loss = (flow_pred - flow_gt).abs() # B, S, N, 2
85
+ i_loss = torch.mean(i_loss, dim=3) # B, S, N
86
+
87
+ # Combine valids and vis for per-frame valid masking.
88
+ combined_mask = torch.logical_and(valids, vis)
89
+
90
+ # valids * vis.float() # B, S, N
91
+
92
+ # vis_aware weighting. Apply BEFORE reduce_masked_mean
93
+
94
+ if vis_aware:
95
+ combined_mask = combined_mask.float() * (1.0 + vis_aware_w) # scale the mask weights by (1 + vis_aware_w) rather than adding to the mask entries
96
+ # combined_mask = torch.clamp(combined_mask, 0.0, 1.0) # No need to clamp.
97
+ # Apply the mask *before* taking the mean.
98
+ # i_loss = i_loss * combined_mask
99
+ # flow_loss += i_weight * i_loss.mean()
100
+ flow_loss += i_weight * reduce_masked_mean(i_loss, combined_mask)
101
+ else:
102
+ if combined_mask.numel() > 10:
103
+ # flow_loss += i_weight * i_loss.mean()
104
+ i_loss = i_loss[combined_mask]
105
+ flow_loss += i_weight * i_loss.mean()
106
+ else:
107
+ flow_loss += 0
108
+
109
+ # # Handle the case where no points are valid.
110
+ # if combined_mask.sum() > 0:
111
+ # flow_loss += i_weight * reduce_masked_mean(i_loss, combined_mask) # Pass combined_mask
112
+ # else: No valid points, so this term contributes 0 to the loss.
113
+ # flow_loss += 0. (This is implicit)
114
+
115
+ # Avoid division by zero if n_predictions is 0 (though it shouldn't be).
116
+ if n_predictions > 0:
117
+ flow_loss = flow_loss / n_predictions
118
+
119
+ return flow_loss
120
+
121
+ class TrackHead(nn.Module):
122
+ """
123
+ Track head that uses DPT/Match head to process tokens and BaseTrackerPredictor for tracking.
124
+ """
125
+ def __init__(self,
126
+ dim_in,
127
+ patch_size=16,
128
+ features=128,
129
+ feature_extractor_type="dpt", # or "match"
130
+ train_query_points=128,
131
+ feature_extractor_kwargs={},
132
+ tracker_kwargs={},
133
+ loss_kwargs={},
134
+ iters=4,
135
+ use_base_tracker_v2=False,
136
+ predict_conf=False,
137
+ random_query_points = None,
138
+ **kwargs):
139
+ super().__init__()
140
+
141
+ self.patch_size = patch_size
142
+ self.feature_extractor_type = feature_extractor_type
143
+ self.train_query_points = train_query_points
144
+ self.random_query_points = random_query_points
145
+
146
+ # Initialize feature extractor (DPT or Match head)
147
+ if feature_extractor_type == "dpt":
148
+ self.feature_extractor = DPTHead(
149
+ dim_in=dim_in,
150
+ patch_size=patch_size,
151
+ features=features,
152
+ feature_only=True, # Only output features, no activation
153
+ **feature_extractor_kwargs
154
+ )
155
+ elif feature_extractor_type == "match":
156
+ raise NotImplementedError("Match head is not implemented for track head")
157
+ self.feature_extractor = MatchHead(
158
+ dim_in=dim_in,
159
+ patch_size=patch_size,
160
+ features=features,
161
+ **feature_extractor_kwargs
162
+ )
163
+ else:
164
+ raise ValueError(f"Unknown feature_extractor_type: {feature_extractor_type}")
165
+
166
+ # Initialize tracker
167
+ if use_base_tracker_v2:
168
+ self.tracker = BaseTrackerPredictorV2(
169
+ latent_dim=features, # Match the output_dim of feature extractor
170
+ predict_conf=predict_conf,
171
+ **tracker_kwargs
172
+ )
173
+ else:
174
+ self.tracker = BaseTrackerPredictor(
175
+ latent_dim=features, # Match the output_dim of feature extractor
176
+ predict_conf=predict_conf,
177
+ **tracker_kwargs
178
+ )
179
+
180
+ self.loss_kwargs = loss_kwargs
181
+ self.iters = iters
182
+
183
+
184
+ def _compute_losses(self, coord_preds, vis_scores, conf_scores, batch):
185
+ """Compute tracking losses using sequence_loss"""
186
+ gt_tracks = batch["tracks"] # B, S, N, 2
187
+ gt_track_vis_mask = batch["track_vis_mask"] # B, S, N
188
+
189
+ # if self.training and hasattr(self, "train_query_points"):
190
+ train_query_points = coord_preds[-1].shape[2]
191
+ gt_tracks = gt_tracks[:, :, :train_query_points]
192
+ gt_track_vis_mask = gt_track_vis_mask[:, :, :train_query_points]
193
+
194
+ # Create validity mask that filters out tracks not visible in first frame
195
+ valids = torch.ones_like(gt_track_vis_mask)
196
+ mask = gt_track_vis_mask[:, 0, :] == True
197
+ valids = valids * mask.unsqueeze(1)
198
+
199
+ # Compute tracking loss using sequence_loss
200
+ track_loss = sequence_loss(
201
+ flow_preds=coord_preds,
202
+ flow_gt=gt_tracks,
203
+ vis=gt_track_vis_mask,
204
+ valids=valids,
205
+ **self.loss_kwargs
206
+ )
207
+
208
+ vis_loss = F.binary_cross_entropy_with_logits(vis_scores[valids], gt_track_vis_mask[valids].float())
209
+ # within 3 pixels
210
+ if conf_scores is not None:
211
+ gt_conf_mask = (gt_tracks - coord_preds[-1]).norm(dim=-1) < 3
212
+ conf_loss = F.binary_cross_entropy_with_logits(conf_scores[valids], gt_conf_mask[valids].float())
213
+ else:
214
+ conf_loss = 0
215
+
216
+ return track_loss, vis_loss, conf_loss
217
+
218
+ def forward(self, aggregated_tokens_list, batch, patch_start_idx):
219
+ B, S, _, H, W = batch["images"].shape
220
+
221
+ gt_tracks = batch["tracks"] # B, S, N, 2
222
+ # gt_track_vis_mask = batch["track_vis_mask"] # B, S, N
223
+
224
+ # Extract features using DPT/Match head
225
+ if self.feature_extractor_type == "dpt":
226
+ feature_maps = self.feature_extractor(aggregated_tokens_list, batch, patch_start_idx)
227
+ else: # match head
228
+ feature_maps = self.feature_extractor(aggregated_tokens_list, batch, patch_start_idx)["descriptor"]
229
+
230
+ feature_maps = feature_maps.view(B, S, *feature_maps.shape[1:]).clone()
231
+ # Get query points from batch
232
+
233
+ query_points = gt_tracks[:, 0] # Use first frame's points as query
234
+
235
+ if self.training:
236
+ if self.random_query_points is not None:
237
+ min_val = self.random_query_points[0]
238
+ max_val = self.random_query_points[1]
239
+ mu = max_val # Mean centered at the upper bound
240
+ sigma = (max_val - min_val) / 2.71 # std. dev. chosen so min_val sits roughly e (~2.71) sigmas below the mean
241
+ train_query_points = int(random.gauss(mu, sigma))
242
+ train_query_points = max(min(train_query_points, max_val), min_val) # Clamp to ensure value is within range
243
+ else:
244
+ train_query_points = self.train_query_points
245
+ query_points = query_points[:, :train_query_points]
246
+
247
+ # Predict tracks using BaseTrackerPredictor
248
+ # coord_preds: a list of B, S, N, 2
249
+ # vis_scores: B, S, N
250
+ coord_preds, vis_scores, conf_scores = self.tracker(
251
+ query_points=query_points,
252
+ fmaps=feature_maps,
253
+ iters=self.iters,
254
+ )
255
+
256
+ # Calculate losses if in training mode
257
+ track_loss, vis_loss, conf_loss = self._compute_losses(coord_preds, vis_scores, conf_scores, batch)
258
+
259
+ loss_dict = {
260
+ "loss_track": track_loss,
261
+ "loss_vis": vis_loss,
262
+ "loss_track_conf": conf_loss,
263
+ "last_track_pred": coord_preds[-1],
264
+ }
265
+ return loss_dict
266
+
267
+
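In sequence_loss above, gamma exponentially discounts earlier iterations so the final refinement dominates; a small sketch of the per-iteration weights with the default gamma=0.8 and four predictions:

gamma, n_predictions = 0.8, 4
weights = [gamma ** (n_predictions - i - 1) for i in range(n_predictions)]
print(weights)   # roughly [0.512, 0.64, 0.8, 1.0]; the last, most refined prediction gets full weight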
vggt/heads/utils.py ADDED
@@ -0,0 +1,309 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from typing import Optional
5
+
6
+
7
+
8
+
9
+ def make_sincos_pos_embed(
10
+ embed_dim: int, pos: torch.Tensor, omega_0: float = 100
11
+ ) -> torch.Tensor:
12
+ """
13
+ This function generates a 1D positional embedding from a given grid using sine and cosine functions.
14
+
15
+ Args:
16
+ - embed_dim: The embedding dimension.
17
+ - pos: The position to generate the embedding from.
18
+
19
+ Returns:
20
+ - emb: The generated 1D positional embedding.
21
+ """
22
+ assert embed_dim % 2 == 0
23
+ omega = torch.arange(embed_dim // 2, dtype=torch.double, device=pos.device)
24
+ omega /= embed_dim / 2.0
25
+ omega = 1.0 / omega_0**omega # (D/2,)
26
+
27
+ pos = pos.reshape(-1) # (M,)
28
+ out = torch.einsum("m,d->md", pos, omega) # (M, D/2), outer product
29
+
30
+ emb_sin = torch.sin(out) # (M, D/2)
31
+ emb_cos = torch.cos(out) # (M, D/2)
32
+
33
+ emb = torch.cat([emb_sin, emb_cos], dim=1) # (M, D)
34
+ return emb.float()
35
+
36
+
37
+
38
+ def position_grid_to_embed(pos_grid: torch.Tensor, embed_dim: int, omega_0: float = 100) -> torch.Tensor:
39
+ """
40
+ Convert 2D position grid (HxWx2) to sinusoidal embeddings (HxWxC)
41
+
42
+ Args:
43
+ pos_grid: Tensor of shape (H, W, 2) containing 2D coordinates
44
+ embed_dim: Output channel dimension for embeddings
45
+
46
+ Returns:
47
+ Tensor of shape (H, W, embed_dim) with positional embeddings
48
+ """
49
+ H, W, grid_dim = pos_grid.shape
50
+ assert grid_dim == 2
51
+ pos_flat = pos_grid.reshape(-1, grid_dim) # Flatten to (H*W, 2)
52
+
53
+ # Process x and y coordinates separately
54
+ emb_x = make_sincos_pos_embed(embed_dim//2, pos_flat[:, 0], omega_0=omega_0) # [1, H*W, D/2]
55
+ emb_y = make_sincos_pos_embed(embed_dim//2, pos_flat[:, 1], omega_0=omega_0) # [1, H*W, D/2]
56
+
57
+ # Combine and reshape
58
+ emb = torch.cat([emb_x, emb_y], dim=-1) # [1, H*W, D]
59
+
60
+ return emb.view(H, W, embed_dim) # [H, W, D]
61
+
62
+
63
+ class HarmonicEmbedding(torch.nn.Module):
64
+ def __init__(
65
+ self,
66
+ n_harmonic_functions: int = 6,
67
+ omega_0: float = 1.0,
68
+ logspace: bool = True,
69
+ append_input: bool = True,
70
+ ) -> None:
71
+ """
72
+ The harmonic embedding layer supports the classical
73
+ Nerf positional encoding described in
74
+ `NeRF <https://arxiv.org/abs/2003.08934>`_
75
+ and the integrated position encoding in
76
+ `MIP-NeRF <https://arxiv.org/abs/2103.13415>`_.
77
+
78
+ During the inference you can provide the extra argument `diag_cov`.
79
+
80
+ If `diag_cov is None`, it converts
81
+ rays parametrized with a `ray_bundle` to 3D points by
82
+ extending each ray according to the corresponding length.
83
+ Then it converts each feature
84
+ (i.e. vector along the last dimension) in `x`
85
+ into a series of harmonic features `embedding`,
86
+ where for each i in range(dim) the following are present
87
+ in embedding[...]::
88
+
89
+ [
90
+ sin(f_1*x[..., i]),
91
+ sin(f_2*x[..., i]),
92
+ ...
93
+ sin(f_N * x[..., i]),
94
+ cos(f_1*x[..., i]),
95
+ cos(f_2*x[..., i]),
96
+ ...
97
+ cos(f_N * x[..., i]),
98
+ x[..., i], # only present if append_input is True.
99
+ ]
100
+
101
+ where N corresponds to `n_harmonic_functions-1`, and f_i is a scalar
102
+ denoting the i-th frequency of the harmonic embedding.
103
+
104
+
105
+ If `diag_cov is not None`, it approximates
106
+ conical frustums following a ray bundle as gaussians,
107
+ defined by x, the means of the gaussians and diag_cov,
108
+ the diagonal covariances.
109
+ Then it converts each gaussian
110
+ into a series of harmonic features `embedding`,
111
+ where for each i in range(dim) the following are present
112
+ in embedding[...]::
113
+
114
+ [
115
+ sin(f_1*x[..., i]) * exp(0.5 * f_1**2 * diag_cov[..., i,]),
116
+ sin(f_2*x[..., i]) * exp(0.5 * f_2**2 * diag_cov[..., i,]),
117
+ ...
118
+ sin(f_N * x[..., i]) * exp(0.5 * f_N**2 * diag_cov[..., i,]),
119
+ cos(f_1*x[..., i]) * exp(0.5 * f_1**2 * diag_cov[..., i,]),
120
+ cos(f_2*x[..., i]) * exp(0.5 * f_2**2 * diag_cov[..., i,]),
121
+ ...
122
+ cos(f_N * x[..., i]) * exp(0.5 * f_N**2 * diag_cov[..., i,]),
123
+ x[..., i], # only present if append_input is True.
124
+ ]
125
+
126
+ where N equals `n_harmonic_functions-1`, and f_i is a scalar
127
+ denoting the i-th frequency of the harmonic embedding.
128
+
129
+ If `logspace==True`, the frequencies `[f_1, ..., f_N]` are
130
+ powers of 2:
131
+ `f_1, ..., f_N = 2**torch.arange(n_harmonic_functions)`
132
+
133
+ If `logspace==False`, frequencies are linearly spaced between
134
+ `1.0` and `2**(n_harmonic_functions-1)`:
135
+ `f_1, ..., f_N = torch.linspace(
136
+ 1.0, 2**(n_harmonic_functions-1), n_harmonic_functions
137
+ )`
138
+
139
+ Note that `x` is also premultiplied by the base frequency `omega_0`
140
+ before evaluating the harmonic functions.
141
+
142
+ Args:
143
+ n_harmonic_functions: int, number of harmonic
144
+ features
145
+ omega_0: float, base frequency
146
+ logspace: bool, Whether to space the frequencies in
147
+ logspace or linear space
148
+ append_input: bool, whether to concat the original
149
+ input to the harmonic embedding. If true the
150
+ output is of the form (embed.sin(), embed.cos(), x)
151
+ """
152
+ super().__init__()
153
+
154
+ if logspace:
155
+ frequencies = 2.0 ** torch.arange(
156
+ n_harmonic_functions, dtype=torch.float32
157
+ )
158
+ else:
159
+ frequencies = torch.linspace(
160
+ 1.0,
161
+ 2.0 ** (n_harmonic_functions - 1),
162
+ n_harmonic_functions,
163
+ dtype=torch.float32,
164
+ )
165
+
166
+ self.register_buffer(
167
+ "_frequencies", frequencies * omega_0, persistent=False
168
+ )
169
+ self.register_buffer(
170
+ "_zero_half_pi",
171
+ torch.tensor([0.0, 0.5 * torch.pi]),
172
+ persistent=False,
173
+ )
174
+ self.append_input = append_input
175
+
176
+ def forward(
177
+ self, x: torch.Tensor, diag_cov: Optional[torch.Tensor] = None, **kwargs
178
+ ) -> torch.Tensor:
179
+ """
180
+ Args:
181
+ x: tensor of shape [..., dim]
182
+ diag_cov: An optional tensor of shape `(..., dim)`
183
+ representing the diagonal covariance matrices of our Gaussians, joined with x
184
+ as means of the Gaussians.
185
+
186
+ Returns:
187
+ embedding: a harmonic embedding of `x` of shape
188
+ [..., (n_harmonic_functions * 2 + int(append_input)) * num_points_per_ray]
189
+ """
190
+ # [..., dim, n_harmonic_functions]
191
+ embed = x[..., None] * self._frequencies
192
+ # [..., 1, dim, n_harmonic_functions] + [2, 1, 1] => [..., 2, dim, n_harmonic_functions]
193
+ embed = embed[..., None, :, :] + self._zero_half_pi[..., None, None]
194
+ # Use the trig identity cos(x) = sin(x + pi/2)
195
+ # and do one vectorized call to sin([x, x+pi/2]) instead of (sin(x), cos(x)).
196
+ embed = embed.sin()
197
+ if diag_cov is not None:
198
+ x_var = diag_cov[..., None] * torch.pow(self._frequencies, 2)
199
+ exp_var = torch.exp(-0.5 * x_var)
200
+ # [..., 2, dim, n_harmonic_functions]
201
+ embed = embed * exp_var[..., None, :, :]
202
+
203
+ embed = embed.reshape(*x.shape[:-1], -1)
204
+
205
+ if self.append_input:
206
+ return torch.cat([embed, x], dim=-1)
207
+ return embed
208
+
209
+ @staticmethod
210
+ def get_output_dim_static(
211
+ input_dims: int, n_harmonic_functions: int, append_input: bool
212
+ ) -> int:
213
+ """
214
+ Utility to help predict the shape of the output of `forward`.
215
+
216
+ Args:
217
+ input_dims: length of the last dimension of the input tensor
218
+ n_harmonic_functions: number of embedding frequencies
219
+ append_input: whether or not to concat the original
220
+ input to the harmonic embedding
221
+ Returns:
222
+ int: the length of the last dimension of the output tensor
223
+ """
224
+ return input_dims * (2 * n_harmonic_functions + int(append_input))
225
+
226
+ def get_output_dim(self, input_dims: int = 3) -> int:
227
+ """
228
+ Same as above. The default for input_dims is 3 for 3D applications
229
+ which use harmonic embedding for positional encoding,
230
+ so the input might be xyz.
231
+ """
232
+ return self.get_output_dim_static(
233
+ input_dims, len(self._frequencies), self.append_input
234
+ )
235
+
236
+
237
+
238
+
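A minimal sketch of the HarmonicEmbedding output size (input sizes are illustrative, and the import path is an assumption): with append_input=True, each of the D input channels yields 2 * n_harmonic_functions sinusoids plus the raw value.

import torch
from vggt.heads.utils import HarmonicEmbedding   # assumed module path for the class above

emb = HarmonicEmbedding(n_harmonic_functions=6, append_input=True)
xyz = torch.randn(2, 100, 3)
out = emb(xyz)
assert out.shape == (2, 100, emb.get_output_dim(3))   # 3 * (2 * 6 + 1) = 39 channels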
239
+ class PoseEmbedding(nn.Module):
240
+ def __init__(self, target_dim, n_harmonic_functions=10, append_input=True):
241
+ super().__init__()
242
+
243
+ self._emb_pose = HarmonicEmbedding(
244
+ n_harmonic_functions=n_harmonic_functions, append_input=append_input
245
+ )
246
+
247
+ self.out_dim = self._emb_pose.get_output_dim(target_dim)
248
+
249
+ def forward(self, pose_encoding):
250
+ e_pose_encoding = self._emb_pose(pose_encoding)
251
+ return e_pose_encoding
252
+
253
+
254
+
255
+
256
+ def random_mask_single_patch_vectorized(images, patch_size=(16, 16)):
257
+ """
258
+ Randomly masks a single patch in a batch of images using fully vectorized operations.
259
+ :param images: Tensor of shape [B, 3, H, W]
260
+ :param patch_size: Tuple (ph, pw), size of the patch to mask
261
+ """
262
+ B, C, H, W = images.shape
263
+ ph, pw = patch_size
264
+
265
+ # Generate random positions for the top-left corner of the patch
266
+ x_positions = torch.randint(0, W - pw, (B, 1, 1))
267
+ y_positions = torch.randint(0, H - ph, (B, 1, 1))
268
+
269
+ # Compute patch grid indices
270
+ patch_x = torch.arange(pw).reshape(1, 1, pw)
271
+ patch_y = torch.arange(ph).reshape(1, ph, 1)
272
+
273
+ # Broadcast patch indices to each position
274
+ x_indices = x_positions + patch_x
275
+ y_indices = y_positions + patch_y
276
+
277
+ # Expand the indices to cover all channels and all images in the batch
278
+ x_indices = x_indices.expand(B, ph, pw)
279
+ y_indices = y_indices.expand(B, ph, pw)
280
+
281
+ # Flatten the indices to apply the mask using advanced indexing
282
+ batch_indices = torch.arange(B).unsqueeze(-1).expand(B, ph * pw)
283
+ x_indices = x_indices.reshape(B, ph * pw)
284
+ y_indices = y_indices.reshape(B, ph * pw)
285
+
286
+ # Create a mask initialized to one and apply zero at the indices
287
+ mask = torch.ones_like(images)
288
+ mask[batch_indices, :, y_indices, x_indices] = 0
289
+
290
+ # Apply mask to images
291
+ return images * mask
292
+
293
+
294
+
295
+
296
+ def normalized_view_plane_uv(width: int, height: int, aspect_ratio: float = None, dtype: torch.dtype = None, device: torch.device = None) -> torch.Tensor:
297
+ # borrowed from https://github.com/microsoft/moge
298
+ "UV with left-top corner as (-width / diagonal, -height / diagonal) and right-bottom corner as (width / diagonal, height / diagonal)"
299
+ if aspect_ratio is None:
300
+ aspect_ratio = width / height
301
+
302
+ span_x = aspect_ratio / (1 + aspect_ratio ** 2) ** 0.5
303
+ span_y = 1 / (1 + aspect_ratio ** 2) ** 0.5
304
+
305
+ u = torch.linspace(-span_x * (width - 1) / width, span_x * (width - 1) / width, width, dtype=dtype, device=device)
306
+ v = torch.linspace(-span_y * (height - 1) / height, span_y * (height - 1) / height, height, dtype=dtype, device=device)
307
+ u, v = torch.meshgrid(u, v, indexing='xy')
308
+ uv = torch.stack([u, v], dim=-1)
309
+ return uv
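A minimal sketch of the positional-embedding path used by the DPT head's _apply_pos_embed (sizes are illustrative): the normalized view-plane UV grid is turned into sinusoidal channel embeddings and added to the feature map with a small ratio.

import torch
from vggt.heads.utils import normalized_view_plane_uv, position_grid_to_embed   # assumed module path

W, H, C = 32, 24, 128
uv = normalized_view_plane_uv(W, H, aspect_ratio=W / H)   # (H, W, 2) view-plane UV grid
pe = position_grid_to_embed(uv, C)                        # (H, W, C) sinusoidal embedding
feat = torch.randn(1, C, H, W)
feat = feat + 0.1 * pe.permute(2, 0, 1)[None]             # ratio=0.1, matching the head's default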
vggt/layers/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from .dino_head import DINOHead
7
+ from .mlp import Mlp
8
+ from .patch_embed import PatchEmbed
9
+ from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
10
+ from .block import NestedTensorBlock
11
+ from .attention import MemEffAttention
vggt/layers/attention.py ADDED
@@ -0,0 +1,116 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ import logging
11
+ import os
12
+ import warnings
13
+
14
+ from torch import Tensor
15
+ from torch import nn
16
+ import torch.nn.functional as F
17
+
18
+
19
+ logger = logging.getLogger("dinov2")
20
+
21
+
22
+ # XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
23
+ # try:
24
+ # if XFORMERS_ENABLED:
25
+ # from xformers.ops import memory_efficient_attention, unbind
26
+
27
+ # XFORMERS_AVAILABLE = True
28
+ # warnings.warn("xFormers is available (Attention)")
29
+ # else:
30
+ # warnings.warn("xFormers is disabled (Attention)")
31
+ # raise ImportError
32
+ # except ImportError:
33
+ # XFORMERS_AVAILABLE = False
34
+ # warnings.warn("xFormers is not available (Attention)")
35
+
36
+ XFORMERS_AVAILABLE = False
37
+
38
+
39
+ class Attention(nn.Module):
40
+ def __init__(
41
+ self,
42
+ dim: int,
43
+ num_heads: int = 8,
44
+ qkv_bias: bool = True,
45
+ qk_norm: bool = False,
46
+ attn_drop: float = 0.,
47
+ proj_drop: float = 0.,
48
+ proj_bias: bool = True,
49
+ norm_layer: nn.Module = nn.LayerNorm,
50
+ fused_attn: bool = True,
51
+ rope = None,
52
+ ) -> None:
53
+ super().__init__()
54
+ assert dim % num_heads == 0, 'dim should be divisible by num_heads'
55
+ self.num_heads = num_heads
56
+ self.head_dim = dim // num_heads
57
+ self.scale = self.head_dim ** -0.5
58
+ self.fused_attn = fused_attn
59
+
60
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
61
+ self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
62
+ self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
63
+ self.attn_drop = nn.Dropout(attn_drop)
64
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
65
+ self.proj_drop = nn.Dropout(proj_drop)
66
+ self.rope = rope
67
+
68
+ def forward(self, x: Tensor, pos=None) -> Tensor:
69
+ B, N, C = x.shape
70
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
71
+ q, k, v = qkv.unbind(0)
72
+ q, k = self.q_norm(q), self.k_norm(k)
73
+
74
+ if self.rope is not None:
75
+ q = self.rope(q, pos)
76
+ k = self.rope(k, pos)
77
+
78
+ if self.fused_attn:
79
+ x = F.scaled_dot_product_attention(
80
+ q, k, v,
81
+ dropout_p=self.attn_drop.p if self.training else 0.,
82
+ )
83
+ else:
84
+ q = q * self.scale
85
+ attn = q @ k.transpose(-2, -1)
86
+ attn = attn.softmax(dim=-1)
87
+ attn = self.attn_drop(attn)
88
+ x = attn @ v
89
+
90
+ x = x.transpose(1, 2).reshape(B, N, C)
91
+ x = self.proj(x)
92
+ x = self.proj_drop(x)
93
+ return x
94
+
95
+
96
+
97
+
98
+ class MemEffAttention(Attention):
99
+ def forward(self, x: Tensor, attn_bias=None, pos=None) -> Tensor:
100
+ assert pos is None
101
+ if not XFORMERS_AVAILABLE:
102
+ if attn_bias is not None:
103
+ raise AssertionError("xFormers is required for using nested tensors")
104
+ return super().forward(x)
105
+
106
+ B, N, C = x.shape
107
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
108
+
109
+ q, k, v = unbind(qkv, 2)
110
+
111
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
112
+ x = x.reshape([B, N, C])
113
+
114
+ x = self.proj(x)
115
+ x = self.proj_drop(x)
116
+ return x
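A minimal sketch of running the Attention module above on a dummy token sequence (dimensions are assumptions); with fused_attn=True the forward pass dispatches to F.scaled_dot_product_attention.

import torch
from vggt.layers.attention import Attention   # assumed package layout

attn = Attention(dim=384, num_heads=6)
tokens = torch.randn(2, 197, 384)              # B, N, C (e.g. 196 patch tokens + 1 cls token)
out = attn(tokens)
assert out.shape == tokens.shape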
vggt/layers/block.py ADDED
@@ -0,0 +1,275 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
9
+
10
+ import logging
11
+ import os
12
+ from typing import Callable, List, Any, Tuple, Dict
13
+ import warnings
14
+
15
+ import torch
16
+ from torch import nn, Tensor
17
+
18
+ from .attention import Attention, MemEffAttention
19
+ from .drop_path import DropPath
20
+ from .layer_scale import LayerScale
21
+ from .mlp import Mlp
22
+
23
+
24
+ logger = logging.getLogger("dinov2")
25
+
26
+
27
+ # XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
28
+ # try:
29
+ # if XFORMERS_ENABLED:
30
+ # from xformers.ops import fmha, scaled_index_add, index_select_cat
31
+
32
+ # XFORMERS_AVAILABLE = True
33
+ # warnings.warn("xFormers is available (Block)")
34
+ # else:
35
+ # warnings.warn("xFormers is disabled (Block)")
36
+ # raise ImportError
37
+ # except ImportError:
38
+ # XFORMERS_AVAILABLE = False
39
+
40
+ # warnings.warn("xFormers is not available (Block)")
41
+
42
+ XFORMERS_AVAILABLE = False
43
+
44
+ class Block(nn.Module):
45
+ def __init__(
46
+ self,
47
+ dim: int,
48
+ num_heads: int,
49
+ mlp_ratio: float = 4.0,
50
+ qkv_bias: bool = True,
51
+ qk_norm: bool = False,
52
+ proj_bias: bool = True,
53
+ ffn_bias: bool = True,
54
+ fused_attn: bool = True,
55
+ drop: float = 0.0,
56
+ attn_drop: float = 0.0,
57
+ init_values=None,
58
+ drop_path: float = 0.0,
59
+ act_layer: Callable[..., nn.Module] = nn.GELU,
60
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
61
+ attn_class: Callable[..., nn.Module] = Attention,
62
+ ffn_layer: Callable[..., nn.Module] = Mlp,
63
+ rope_freq: int = -1,
64
+ rope = None,
65
+ ) -> None:
66
+ super().__init__()
67
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
68
+ self.norm1 = norm_layer(dim)
69
+
70
+ self.attn = attn_class(
71
+ dim,
72
+ num_heads=num_heads,
73
+ qkv_bias=qkv_bias,
74
+ qk_norm=qk_norm,
75
+ proj_bias=proj_bias,
76
+ attn_drop=attn_drop,
77
+ proj_drop=drop,
78
+ fused_attn=fused_attn,
79
+ rope=rope,
80
+ )
81
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
82
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
83
+
84
+ self.norm2 = norm_layer(dim)
85
+ mlp_hidden_dim = int(dim * mlp_ratio)
86
+ self.mlp = ffn_layer(
87
+ in_features=dim,
88
+ hidden_features=mlp_hidden_dim,
89
+ act_layer=act_layer,
90
+ drop=drop,
91
+ bias=ffn_bias,
92
+ )
93
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
94
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
95
+
96
+ self.sample_drop_ratio = drop_path
97
+
98
+ def forward(self, x: Tensor, pos=None) -> Tensor:
99
+ def attn_residual_func(x: Tensor, pos=None) -> Tensor:
100
+ return self.ls1(self.attn(self.norm1(x), pos=pos))
101
+
102
+ def ffn_residual_func(x: Tensor) -> Tensor:
103
+ return self.ls2(self.mlp(self.norm2(x)))
104
+
105
+ if self.training and self.sample_drop_ratio > 0.1:
106
+ # the overhead is compensated only for a drop path rate larger than 0.1
107
+ x = drop_add_residual_stochastic_depth(
108
+ x,
109
+ pos=pos,
110
+ residual_func=attn_residual_func,
111
+ sample_drop_ratio=self.sample_drop_ratio,
112
+ )
113
+ x = drop_add_residual_stochastic_depth(
114
+ x,
115
+ residual_func=ffn_residual_func,
116
+ sample_drop_ratio=self.sample_drop_ratio,
117
+ )
118
+ elif self.training and self.sample_drop_ratio > 0.0:
119
+ x = x + self.drop_path1(attn_residual_func(x, pos=pos))
120
+ x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
121
+ else:
122
+ x = x + attn_residual_func(x, pos=pos)
123
+ x = x + ffn_residual_func(x)
124
+ return x
125
+
126
+
127
+ def drop_add_residual_stochastic_depth(
128
+ x: Tensor,
129
+ residual_func: Callable[[Tensor], Tensor],
130
+ sample_drop_ratio: float = 0.0,
131
+ pos = None,
132
+ ) -> Tensor:
133
+ # 1) extract subset using permutation
134
+ b, n, d = x.shape
135
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
136
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
137
+ x_subset = x[brange]
138
+
139
+ # 2) apply residual_func to get residual
140
+ if pos is not None:
141
+ pos = pos[brange]
142
+ residual = residual_func(x_subset, pos=pos)
143
+ else:
144
+ residual = residual_func(x_subset)
145
+
146
+ x_flat = x.flatten(1)
147
+ residual = residual.flatten(1)
148
+
149
+ residual_scale_factor = b / sample_subset_size
150
+
151
+ # 3) add the residual
152
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
153
+ return x_plus_residual.view_as(x)
154
+
155
+
156
+ def get_branges_scales(x, sample_drop_ratio=0.0):
157
+ b, n, d = x.shape
158
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
159
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
160
+ residual_scale_factor = b / sample_subset_size
161
+ return brange, residual_scale_factor
162
+
163
+
164
+ def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
165
+ if scaling_vector is None:
166
+ x_flat = x.flatten(1)
167
+ residual = residual.flatten(1)
168
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
169
+ else:
170
+ x_plus_residual = scaled_index_add(
171
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
172
+ )
173
+ return x_plus_residual
174
+
175
+
176
+ attn_bias_cache: Dict[Tuple, Any] = {}
177
+
178
+
179
+ def get_attn_bias_and_cat(x_list, branges=None):
180
+ """
181
+ this will perform the index select, cat the tensors, and provide the attn_bias from cache
182
+ """
183
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
184
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
185
+ if all_shapes not in attn_bias_cache.keys():
186
+ seqlens = []
187
+ for b, x in zip(batch_sizes, x_list):
188
+ for _ in range(b):
189
+ seqlens.append(x.shape[1])
190
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
191
+ attn_bias._batch_sizes = batch_sizes
192
+ attn_bias_cache[all_shapes] = attn_bias
193
+
194
+ if branges is not None:
195
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
196
+ else:
197
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
198
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
199
+
200
+ return attn_bias_cache[all_shapes], cat_tensors
201
+
202
+
203
+ def drop_add_residual_stochastic_depth_list(
204
+ x_list: List[Tensor],
205
+ residual_func: Callable[[Tensor, Any], Tensor],
206
+ sample_drop_ratio: float = 0.0,
207
+ scaling_vector=None,
208
+ ) -> Tensor:
209
+ # 1) generate random set of indices for dropping samples in the batch
210
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
211
+ branges = [s[0] for s in branges_scales]
212
+ residual_scale_factors = [s[1] for s in branges_scales]
213
+
214
+ # 2) get attention bias and index+concat the tensors
215
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
216
+
217
+ # 3) apply residual_func to get residual, and split the result
218
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
219
+
220
+ outputs = []
221
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
222
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
223
+ return outputs
224
+
225
+
226
+ class NestedTensorBlock(Block):
227
+ def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
228
+ """
229
+ x_list contains a list of tensors to nest together and run
230
+ """
231
+ assert isinstance(self.attn, MemEffAttention)
232
+
233
+ if self.training and self.sample_drop_ratio > 0.0:
234
+
235
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
236
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
237
+
238
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
239
+ return self.mlp(self.norm2(x))
240
+
241
+ x_list = drop_add_residual_stochastic_depth_list(
242
+ x_list,
243
+ residual_func=attn_residual_func,
244
+ sample_drop_ratio=self.sample_drop_ratio,
245
+ scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
246
+ )
247
+ x_list = drop_add_residual_stochastic_depth_list(
248
+ x_list,
249
+ residual_func=ffn_residual_func,
250
+ sample_drop_ratio=self.sample_drop_ratio,
251
+ scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
252
+ )
253
+ return x_list
254
+ else:
255
+
256
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
257
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
258
+
259
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
260
+ return self.ls2(self.mlp(self.norm2(x)))
261
+
262
+ attn_bias, x = get_attn_bias_and_cat(x_list)
263
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
264
+ x = x + ffn_residual_func(x)
265
+ return attn_bias.split(x)
266
+
267
+ def forward(self, x_or_x_list):
268
+ if isinstance(x_or_x_list, Tensor):
269
+ return super().forward(x_or_x_list)
270
+ elif isinstance(x_or_x_list, list):
271
+ if not XFORMERS_AVAILABLE:
272
+ raise AssertionError("xFormers is required for using nested tensors")
273
+ return self.forward_nested(x_or_x_list)
274
+ else:
275
+ raise AssertionError
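A minimal sketch of the plain (non-nested) Block forward pass (dimensions are assumptions): in eval mode the stochastic-depth branches are skipped and the attention and MLP residuals are added directly.

import torch
from vggt.layers.block import Block   # assumed package layout

block = Block(dim=384, num_heads=6, mlp_ratio=4.0).eval()
x = torch.randn(2, 196, 384)           # B, N, C
with torch.no_grad():
    y = block(x)                       # x + attention residual, then + MLP residual
assert y.shape == x.shape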
vggt/layers/dino_head.py ADDED
@@ -0,0 +1,58 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch.nn.init import trunc_normal_
9
+ from torch.nn.utils import weight_norm
10
+
11
+
12
+ class DINOHead(nn.Module):
13
+ def __init__(
14
+ self,
15
+ in_dim,
16
+ out_dim,
17
+ use_bn=False,
18
+ nlayers=3,
19
+ hidden_dim=2048,
20
+ bottleneck_dim=256,
21
+ mlp_bias=True,
22
+ ):
23
+ super().__init__()
24
+ nlayers = max(nlayers, 1)
25
+ self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias)
26
+ self.apply(self._init_weights)
27
+ self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
28
+ self.last_layer.weight_g.data.fill_(1)
29
+
30
+ def _init_weights(self, m):
31
+ if isinstance(m, nn.Linear):
32
+ trunc_normal_(m.weight, std=0.02)
33
+ if isinstance(m, nn.Linear) and m.bias is not None:
34
+ nn.init.constant_(m.bias, 0)
35
+
36
+ def forward(self, x):
37
+ x = self.mlp(x)
38
+ eps = 1e-6 if x.dtype == torch.float16 else 1e-12
39
+ x = nn.functional.normalize(x, dim=-1, p=2, eps=eps)
40
+ x = self.last_layer(x)
41
+ return x
42
+
43
+
44
+ def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
45
+ if nlayers == 1:
46
+ return nn.Linear(in_dim, bottleneck_dim, bias=bias)
47
+ else:
48
+ layers = [nn.Linear(in_dim, hidden_dim, bias=bias)]
49
+ if use_bn:
50
+ layers.append(nn.BatchNorm1d(hidden_dim))
51
+ layers.append(nn.GELU())
52
+ for _ in range(nlayers - 2):
53
+ layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias))
54
+ if use_bn:
55
+ layers.append(nn.BatchNorm1d(hidden_dim))
56
+ layers.append(nn.GELU())
57
+ layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias))
58
+ return nn.Sequential(*layers)
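A minimal sketch of the DINOHead shapes (sizes are illustrative): the MLP projects to the bottleneck, features are L2-normalized, and the weight-normalized last layer maps to the prototype dimension.

import torch
from vggt.layers.dino_head import DINOHead   # assumed package layout

head = DINOHead(in_dim=384, out_dim=65536)
feats = torch.randn(4, 384)
logits = head(feats)
assert logits.shape == (4, 65536)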
vggt/layers/drop_path.py ADDED
@@ -0,0 +1,34 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
9
+
10
+
11
+ from torch import nn
12
+
13
+
14
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
15
+ if drop_prob == 0.0 or not training:
16
+ return x
17
+ keep_prob = 1 - drop_prob
18
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
19
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
20
+ if keep_prob > 0.0:
21
+ random_tensor.div_(keep_prob)
22
+ output = x * random_tensor
23
+ return output
24
+
25
+
26
+ class DropPath(nn.Module):
27
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
28
+
29
+ def __init__(self, drop_prob=None):
30
+ super(DropPath, self).__init__()
31
+ self.drop_prob = drop_prob
32
+
33
+ def forward(self, x):
34
+ return drop_path(x, self.drop_prob, self.training)
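drop_path above zeroes entire samples in a residual branch and rescales the survivors by 1/keep_prob so the expected value is preserved; a quick illustrative check:

import torch
from vggt.layers.drop_path import drop_path   # assumed package layout

x = torch.ones(10000, 8)
y = drop_path(x, drop_prob=0.2, training=True)
print(y.mean().item())   # close to 1.0: dropped rows are zero, kept rows are scaled by 1 / 0.8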
vggt/layers/layer_scale.py ADDED
@@ -0,0 +1,27 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
7
+
8
+ from typing import Union
9
+
10
+ import torch
11
+ from torch import Tensor
12
+ from torch import nn
13
+
14
+
15
+ class LayerScale(nn.Module):
16
+ def __init__(
17
+ self,
18
+ dim: int,
19
+ init_values: Union[float, Tensor] = 1e-5,
20
+ inplace: bool = False,
21
+ ) -> None:
22
+ super().__init__()
23
+ self.inplace = inplace
24
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
25
+
26
+ def forward(self, x: Tensor) -> Tensor:
27
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
vggt/layers/mlp.py ADDED
@@ -0,0 +1,40 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
9
+
10
+
11
+ from typing import Callable, Optional
12
+
13
+ from torch import Tensor, nn
14
+
15
+
16
+ class Mlp(nn.Module):
17
+ def __init__(
18
+ self,
19
+ in_features: int,
20
+ hidden_features: Optional[int] = None,
21
+ out_features: Optional[int] = None,
22
+ act_layer: Callable[..., nn.Module] = nn.GELU,
23
+ drop: float = 0.0,
24
+ bias: bool = True,
25
+ ) -> None:
26
+ super().__init__()
27
+ out_features = out_features or in_features
28
+ hidden_features = hidden_features or in_features
29
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
30
+ self.act = act_layer()
31
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
32
+ self.drop = nn.Dropout(drop)
33
+
34
+ def forward(self, x: Tensor) -> Tensor:
35
+ x = self.fc1(x)
36
+ x = self.act(x)
37
+ x = self.drop(x)
38
+ x = self.fc2(x)
39
+ x = self.drop(x)
40
+ return x
vggt/layers/patch_embed.py ADDED
@@ -0,0 +1,88 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
9
+
10
+ from typing import Callable, Optional, Tuple, Union
11
+
12
+ from torch import Tensor
13
+ import torch.nn as nn
14
+
15
+
16
+ def make_2tuple(x):
17
+ if isinstance(x, tuple):
18
+ assert len(x) == 2
19
+ return x
20
+
21
+ assert isinstance(x, int)
22
+ return (x, x)
23
+
24
+
25
+ class PatchEmbed(nn.Module):
26
+ """
27
+ 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
28
+
29
+ Args:
30
+ img_size: Image size.
31
+ patch_size: Patch token size.
32
+ in_chans: Number of input image channels.
33
+ embed_dim: Number of linear projection output channels.
34
+ norm_layer: Normalization layer.
35
+ """
36
+
37
+ def __init__(
38
+ self,
39
+ img_size: Union[int, Tuple[int, int]] = 224,
40
+ patch_size: Union[int, Tuple[int, int]] = 16,
41
+ in_chans: int = 3,
42
+ embed_dim: int = 768,
43
+ norm_layer: Optional[Callable] = None,
44
+ flatten_embedding: bool = True,
45
+ ) -> None:
46
+ super().__init__()
47
+
48
+ image_HW = make_2tuple(img_size)
49
+ patch_HW = make_2tuple(patch_size)
50
+ patch_grid_size = (
51
+ image_HW[0] // patch_HW[0],
52
+ image_HW[1] // patch_HW[1],
53
+ )
54
+
55
+ self.img_size = image_HW
56
+ self.patch_size = patch_HW
57
+ self.patches_resolution = patch_grid_size
58
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
59
+
60
+ self.in_chans = in_chans
61
+ self.embed_dim = embed_dim
62
+
63
+ self.flatten_embedding = flatten_embedding
64
+
65
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
66
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
67
+
68
+ def forward(self, x: Tensor) -> Tensor:
69
+ _, _, H, W = x.shape
70
+ patch_H, patch_W = self.patch_size
71
+
72
+ assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
73
+ assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width {patch_W}"
74
+
75
+ x = self.proj(x) # B C H W
76
+ H, W = x.size(2), x.size(3)
77
+ x = x.flatten(2).transpose(1, 2) # B HW C
78
+ x = self.norm(x)
79
+ if not self.flatten_embedding:
80
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
81
+ return x
82
+
83
+ def flops(self) -> float:
84
+ Ho, Wo = self.patches_resolution
85
+ flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
86
+ if self.norm is not None:
87
+ flops += Ho * Wo * self.embed_dim
88
+ return flops
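A short sanity check for PatchEmbed above, matching the (B,C,H,W) -> (B,N,D) contract in its docstring (import path assumed from the file location):

import torch
from vggt.layers.patch_embed import PatchEmbed

embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
x = torch.randn(2, 3, 224, 224)
tokens = embed(x)                      # flatten_embedding=True -> (B, N, D)
assert tokens.shape == (2, 14 * 14, 768)
assert embed.num_patches == 196        # (224 // 16) ** 2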
vggt/layers/rope.py ADDED
@@ -0,0 +1,160 @@
1
+ import numpy as np
2
+ import torch
3
+
4
+
5
+ class PositionGetter(object):
6
+ """ return positions of patches """
7
+
8
+ # NOTE: the cache can grow large when the token grid size (h, w) varies across inputs
9
+
10
+ def __init__(self):
11
+ self.cache_positions = {}
12
+
13
+ def __call__(self, b, h, w, device):
14
+ if not (h,w) in self.cache_positions:
15
+ x = torch.arange(w, device=device)
16
+ y = torch.arange(h, device=device)
17
+ self.cache_positions[h,w] = torch.cartesian_prod(y, x) # (h*w, 2)
18
+ pos = self.cache_positions[h,w].view(1, h*w, 2).expand(b, -1, 2).clone()
19
+ return pos
20
+
21
+
22
+ # --------------------------------------------------------
23
+ # 2D sine-cosine position embedding
24
+ # References:
25
+ # MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
26
+ # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
27
+ # MoCo v3: https://github.com/facebookresearch/moco-v3
28
+ # --------------------------------------------------------
29
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, n_cls_token=0):
30
+ """
31
+ grid_size: tuple (height, width) of the grid
32
+ return:
33
+ pos_embed: [grid_size[0]*grid_size[1], embed_dim] or [n_cls_token+grid_size[0]*grid_size[1], embed_dim] (w/ or w/o cls_token)
34
+ """
35
+ grid_h = np.arange(grid_size[0], dtype=np.float32)
36
+ grid_w = np.arange(grid_size[1], dtype=np.float32)
37
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
38
+ grid = np.stack(grid, axis=0)
39
+
40
+ grid = grid.reshape([2, 1, grid_size[0], grid_size[1]])
41
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
42
+ if n_cls_token>0:
43
+ pos_embed = np.concatenate([np.zeros([n_cls_token, embed_dim]), pos_embed], axis=0)
44
+ return pos_embed
45
+
46
+
47
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
48
+ assert embed_dim % 2 == 0
49
+
50
+ # use half of dimensions to encode grid_h
51
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
52
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
53
+
54
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
55
+ return emb
56
+
57
+
58
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
59
+ """
60
+ embed_dim: output dimension for each position
61
+ pos: a list of positions to be encoded: size (M,)
62
+ out: (M, D)
63
+ """
64
+ assert embed_dim % 2 == 0
65
+ omega = np.arange(embed_dim // 2, dtype=float)
66
+ omega /= embed_dim / 2.
67
+ omega = 1. / 10000**omega # (D/2,)
68
+
69
+ pos = pos.reshape(-1) # (M,)
70
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
71
+
72
+ emb_sin = np.sin(out) # (M, D/2)
73
+ emb_cos = np.cos(out) # (M, D/2)
74
+
75
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
76
+ return emb
77
+
78
+
79
+ # --------------------------------------------------------
80
+ # Interpolate position embeddings for high-resolution
81
+ # References:
82
+ # MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
83
+ # DeiT: https://github.com/facebookresearch/deit
84
+ # --------------------------------------------------------
85
+ def interpolate_pos_embed(model, checkpoint_model):
86
+ keys = ['enc_pos_embed']+(['dec_pos_embed'] if hasattr(model,'dec_blocks') else [])
87
+ img_size = model.patch_embed.img_size
88
+ if isinstance(img_size,int): img_size = (img_size,img_size)
89
+ for k in keys:
90
+ if not k in checkpoint_model: continue
91
+ pos_embed_checkpoint = checkpoint_model[k]
92
+ embedding_size = pos_embed_checkpoint.shape[-1]
93
+ num_extra_tokens = 0 # no cls token
94
+ # height (== width) for the checkpoint position embedding
95
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
96
+ new_size = (img_size[0]//model.patch_embed.patch_size[0],img_size[1]//model.patch_embed.patch_size[1])
97
+ if orig_size != new_size[0] or orig_size != new_size[1]:
98
+ print("Position interpolate %s from %dx%d to %dx%d" % (k, orig_size, orig_size, new_size[0], new_size[1]))
99
+ extra_tokens = pos_embed_checkpoint[:num_extra_tokens,:]
100
+ pos_tokens = pos_embed_checkpoint[num_extra_tokens:,:]
101
+ pos_tokens = pos_tokens.reshape(1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
102
+ pos_tokens = torch.nn.functional.interpolate(pos_tokens, size=(new_size[0], new_size[1]), mode='bicubic', align_corners=False)
103
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2).squeeze(0)
104
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=0)
105
+ checkpoint_model[k] = new_pos_embed.squeeze(0)
106
+
107
+ #----------------------------------------------------------
108
+ # RoPE2D: RoPE implementation in 2D
109
+ #----------------------------------------------------------
110
+
111
+ # borrowed from https://github.com/naver/dust3r
112
+ # todo: replace with our official implementation
113
+
114
+ class RoPE2D(torch.nn.Module):
115
+ def __init__(self, freq=100.0, F0=1.0):
116
+ super().__init__()
117
+ self.base = freq
118
+ self.F0 = F0
119
+ self.cache = {}
120
+
121
+ def get_cos_sin(self, D, seq_len, device, dtype):
122
+ if (D,seq_len,device,dtype) not in self.cache:
123
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
124
+ t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
125
+ freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
126
+ freqs = torch.cat((freqs, freqs), dim=-1)
127
+ cos = freqs.cos() # (Seq, Dim)
128
+ sin = freqs.sin()
129
+ self.cache[D,seq_len,device,dtype] = (cos,sin)
130
+ return self.cache[D,seq_len,device,dtype]
131
+
132
+ @staticmethod
133
+ def rotate_half(x):
134
+ x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
135
+ return torch.cat((-x2, x1), dim=-1)
136
+
137
+ def apply_rope1d(self, tokens, pos1d, cos, sin):
138
+ assert pos1d.ndim==2
139
+ cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
140
+ sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
141
+ return (tokens * cos) + (self.rotate_half(tokens) * sin)
142
+
143
+ def forward(self, tokens, positions):
144
+ """
145
+ input:
146
+ * tokens: batch_size x nheads x ntokens x dim
147
+ * positions: batch_size x ntokens x 2 (y and x position of each token)
148
+ output:
149
+ * tokens after applying RoPE2D (batch_size x nheads x ntokens x dim)
150
+ """
151
+ assert tokens.size(3)%2==0, "number of dimensions should be a multiple of two"
152
+ D = tokens.size(3) // 2
153
+ assert positions.ndim==3 and positions.shape[-1] == 2 # Batch, Seq, 2
154
+ cos, sin = self.get_cos_sin(D, int(positions.max())+1, tokens.device, tokens.dtype)
155
+ # split features into two along the feature dimension, and apply rope1d on each half
156
+ y, x = tokens.chunk(2, dim=-1)
157
+ y = self.apply_rope1d(y, positions[:,:,0], cos, sin)
158
+ x = self.apply_rope1d(x, positions[:,:,1], cos, sin)
159
+ tokens = torch.cat((y, x), dim=-1)
160
+ return tokens
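A small sketch of how RoPE2D and PositionGetter fit together (the same imports the aggregator below uses); the per-head dim is taken as a multiple of 4 so each y/x half can be rotated cleanly:

import torch
from vggt.layers.rope import RoPE2D, PositionGetter

B, heads, h, w, dim = 2, 4, 8, 8, 64
rope = RoPE2D(freq=100.0)
get_pos = PositionGetter()

pos = get_pos(B, h, w, device="cpu")        # (B, h*w, 2) integer (y, x) positions
tokens = torch.randn(B, heads, h * w, dim)  # (batch, nheads, ntokens, dim)
out = rope(tokens, pos)                     # rotary-encoded tokens, same shape
assert out.shape == tokens.shape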
vggt/layers/swiglu_ffn.py ADDED
@@ -0,0 +1,72 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ from typing import Callable, Optional
8
+ import warnings
9
+
10
+ from torch import Tensor, nn
11
+ import torch.nn.functional as F
12
+
13
+
14
+ class SwiGLUFFN(nn.Module):
15
+ def __init__(
16
+ self,
17
+ in_features: int,
18
+ hidden_features: Optional[int] = None,
19
+ out_features: Optional[int] = None,
20
+ act_layer: Callable[..., nn.Module] = None,
21
+ drop: float = 0.0,
22
+ bias: bool = True,
23
+ ) -> None:
24
+ super().__init__()
25
+ out_features = out_features or in_features
26
+ hidden_features = hidden_features or in_features
27
+ self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
28
+ self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
29
+
30
+ def forward(self, x: Tensor) -> Tensor:
31
+ x12 = self.w12(x)
32
+ x1, x2 = x12.chunk(2, dim=-1)
33
+ hidden = F.silu(x1) * x2
34
+ return self.w3(hidden)
35
+
36
+
37
+ XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
38
+ # try:
39
+ # if XFORMERS_ENABLED:
40
+ # from xformers.ops import SwiGLU
41
+
42
+ # XFORMERS_AVAILABLE = True
43
+ # warnings.warn("xFormers is available (SwiGLU)")
44
+ # else:
45
+ # warnings.warn("xFormers is disabled (SwiGLU)")
46
+ # raise ImportError
47
+ # except ImportError:
48
+ SwiGLU = SwiGLUFFN
49
+ XFORMERS_AVAILABLE = False
50
+
51
+ # warnings.warn("xFormers is not available (SwiGLU)")
52
+
53
+
54
+ class SwiGLUFFNFused(SwiGLU):
55
+ def __init__(
56
+ self,
57
+ in_features: int,
58
+ hidden_features: Optional[int] = None,
59
+ out_features: Optional[int] = None,
60
+ act_layer: Callable[..., nn.Module] = None,
61
+ drop: float = 0.0,
62
+ bias: bool = True,
63
+ ) -> None:
64
+ out_features = out_features or in_features
65
+ hidden_features = hidden_features or in_features
66
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
67
+ super().__init__(
68
+ in_features=in_features,
69
+ hidden_features=hidden_features,
70
+ out_features=out_features,
71
+ bias=bias,
72
+ )
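A quick sketch of SwiGLUFFNFused above; since the xformers branch is commented out, this resolves to the plain PyTorch SwiGLUFFN:

import torch
from vggt.layers.swiglu_ffn import SwiGLUFFNFused

ffn = SwiGLUFFNFused(in_features=768, hidden_features=4 * 768)
x = torch.randn(2, 197, 768)
y = ffn(x)
assert y.shape == x.shape
# effective hidden width: int(3072 * 2 / 3) rounded up to a multiple of 8 = 2048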
vggt/layers/vision_transformer.py ADDED
@@ -0,0 +1,408 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ from functools import partial
11
+ import math
12
+ import logging
13
+ from typing import Sequence, Tuple, Union, Callable
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ from torch.utils.checkpoint import checkpoint
18
+ from torch.nn.init import trunc_normal_
19
+ from . import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
20
+
21
+ logger = logging.getLogger("dinov2")
22
+
23
+
24
+ def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
25
+ if not depth_first and include_root:
26
+ fn(module=module, name=name)
27
+ for child_name, child_module in module.named_children():
28
+ child_name = ".".join((name, child_name)) if name else child_name
29
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
30
+ if depth_first and include_root:
31
+ fn(module=module, name=name)
32
+ return module
33
+
34
+
35
+ class BlockChunk(nn.ModuleList):
36
+ def forward(self, x):
37
+ for b in self:
38
+ x = b(x)
39
+ return x
40
+
41
+
42
+ class DinoVisionTransformer(nn.Module):
43
+ def __init__(
44
+ self,
45
+ img_size=224,
46
+ patch_size=16,
47
+ in_chans=3,
48
+ embed_dim=768,
49
+ depth=12,
50
+ num_heads=12,
51
+ mlp_ratio=4.0,
52
+ qkv_bias=True,
53
+ ffn_bias=True,
54
+ proj_bias=True,
55
+ drop_path_rate=0.0,
56
+ drop_path_uniform=False,
57
+ init_values=None, # for layerscale: None or 0 => no layerscale
58
+ embed_layer=PatchEmbed,
59
+ act_layer=nn.GELU,
60
+ block_fn=Block,
61
+ ffn_layer="mlp",
62
+ block_chunks=1,
63
+ num_register_tokens=0,
64
+ interpolate_antialias=False,
65
+ interpolate_offset=0.1,
66
+ qk_norm=False,
67
+ ):
68
+ """
69
+ Args:
70
+ img_size (int, tuple): input image size
71
+ patch_size (int, tuple): patch size
72
+ in_chans (int): number of input channels
73
+ embed_dim (int): embedding dimension
74
+ depth (int): depth of transformer
75
+ num_heads (int): number of attention heads
76
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
77
+ qkv_bias (bool): enable bias for qkv if True
78
+ proj_bias (bool): enable bias for proj in attn if True
79
+ ffn_bias (bool): enable bias for ffn if True
80
+ drop_path_rate (float): stochastic depth rate
81
+ drop_path_uniform (bool): apply uniform drop rate across blocks
82
+ weight_init (str): weight init scheme
83
+ init_values (float): layer-scale init values
84
+ embed_layer (nn.Module): patch embedding layer
85
+ act_layer (nn.Module): MLP activation layer
86
+ block_fn (nn.Module): transformer block class
87
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
88
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
89
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
90
+ interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
91
+ interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
92
+ """
93
+ super().__init__()
94
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
95
+
96
+ # tricky but makes it work
97
+ self.use_checkpoint = False
98
+ #
99
+
100
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
101
+ self.num_tokens = 1
102
+ self.n_blocks = depth
103
+ self.num_heads = num_heads
104
+ self.patch_size = patch_size
105
+ self.num_register_tokens = num_register_tokens
106
+ self.interpolate_antialias = interpolate_antialias
107
+ self.interpolate_offset = interpolate_offset
108
+
109
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
110
+ num_patches = self.patch_embed.num_patches
111
+
112
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
113
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
114
+ assert num_register_tokens >= 0
115
+ self.register_tokens = (
116
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
117
+ )
118
+
119
+ if drop_path_uniform is True:
120
+ dpr = [drop_path_rate] * depth
121
+ else:
122
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
123
+
124
+ if ffn_layer == "mlp":
125
+ logger.info("using MLP layer as FFN")
126
+ ffn_layer = Mlp
127
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
128
+ logger.info("using SwiGLU layer as FFN")
129
+ ffn_layer = SwiGLUFFNFused
130
+ elif ffn_layer == "identity":
131
+ logger.info("using Identity layer as FFN")
132
+
133
+ def f(*args, **kwargs):
134
+ return nn.Identity()
135
+
136
+ ffn_layer = f
137
+ else:
138
+ raise NotImplementedError
139
+
140
+ blocks_list = [
141
+ block_fn(
142
+ dim=embed_dim,
143
+ num_heads=num_heads,
144
+ mlp_ratio=mlp_ratio,
145
+ qkv_bias=qkv_bias,
146
+ proj_bias=proj_bias,
147
+ ffn_bias=ffn_bias,
148
+ drop_path=dpr[i],
149
+ norm_layer=norm_layer,
150
+ act_layer=act_layer,
151
+ ffn_layer=ffn_layer,
152
+ init_values=init_values,
153
+ qk_norm=qk_norm,
154
+ )
155
+ for i in range(depth)
156
+ ]
157
+ if block_chunks > 0:
158
+ self.chunked_blocks = True
159
+ chunked_blocks = []
160
+ chunksize = depth // block_chunks
161
+ for i in range(0, depth, chunksize):
162
+ # this is to keep the block index consistent if we chunk the block list
163
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
164
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
165
+ else:
166
+ self.chunked_blocks = False
167
+ self.blocks = nn.ModuleList(blocks_list)
168
+
169
+ self.norm = norm_layer(embed_dim)
170
+ self.head = nn.Identity()
171
+
172
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
173
+
174
+ self.init_weights()
175
+
176
+ def init_weights(self):
177
+ trunc_normal_(self.pos_embed, std=0.02)
178
+ nn.init.normal_(self.cls_token, std=1e-6)
179
+ if self.register_tokens is not None:
180
+ nn.init.normal_(self.register_tokens, std=1e-6)
181
+ named_apply(init_weights_vit_timm, self)
182
+
183
+ def interpolate_pos_encoding(self, x, w, h):
184
+ previous_dtype = x.dtype
185
+ npatch = x.shape[1] - 1
186
+ N = self.pos_embed.shape[1] - 1
187
+ if npatch == N and w == h:
188
+ return self.pos_embed
189
+ pos_embed = self.pos_embed.float()
190
+ class_pos_embed = pos_embed[:, 0]
191
+ patch_pos_embed = pos_embed[:, 1:]
192
+ dim = x.shape[-1]
193
+ w0 = w // self.patch_size
194
+ h0 = h // self.patch_size
195
+ M = int(math.sqrt(N)) # Recover the number of patches in each dimension
196
+ assert N == M * M
197
+ kwargs = {}
198
+ if self.interpolate_offset:
199
+ # Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
200
+ # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
201
+ sx = float(w0 + self.interpolate_offset) / M
202
+ sy = float(h0 + self.interpolate_offset) / M
203
+ kwargs["scale_factor"] = (sx, sy)
204
+ else:
205
+ # Simply specify an output size instead of a scale factor
206
+ kwargs["size"] = (w0, h0)
207
+ patch_pos_embed = nn.functional.interpolate(
208
+ patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
209
+ mode="bicubic",
210
+ antialias=self.interpolate_antialias,
211
+ **kwargs,
212
+ )
213
+ assert (w0, h0) == patch_pos_embed.shape[-2:]
214
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
215
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
216
+
217
+ def prepare_tokens_with_masks(self, x, masks=None):
218
+ B, nc, w, h = x.shape
219
+ x = self.patch_embed(x)
220
+ if masks is not None:
221
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
222
+
223
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
224
+ x = x + self.interpolate_pos_encoding(x, w, h)
225
+
226
+ if self.register_tokens is not None:
227
+ x = torch.cat(
228
+ (
229
+ x[:, :1],
230
+ self.register_tokens.expand(x.shape[0], -1, -1),
231
+ x[:, 1:],
232
+ ),
233
+ dim=1,
234
+ )
235
+
236
+ return x
237
+
238
+ def forward_features_list(self, x_list, masks_list):
239
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
240
+
241
+
242
+ for blk in self.blocks:
243
+ if self.use_checkpoint:
244
+ x = checkpoint(blk, x, use_reentrant=self.use_reentrant)
245
+ else:
246
+ x = blk(x)
247
+
248
+ all_x = x
249
+ output = []
250
+ for x, masks in zip(all_x, masks_list):
251
+ x_norm = self.norm(x)
252
+ output.append(
253
+ {
254
+ "x_norm_clstoken": x_norm[:, 0],
255
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
256
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
257
+ "x_prenorm": x,
258
+ "masks": masks,
259
+ }
260
+ )
261
+ return output
262
+
263
+ def forward_features(self, x, masks=None):
264
+ if isinstance(x, list):
265
+ return self.forward_features_list(x, masks)
266
+
267
+ x = self.prepare_tokens_with_masks(x, masks)
268
+
269
+ for blk in self.blocks:
270
+ if self.use_checkpoint:
271
+ x = checkpoint(blk, x, use_reentrant=self.use_reentrant)
272
+ else:
273
+ x = blk(x)
274
+
275
+ x_norm = self.norm(x)
276
+ return {
277
+ "x_norm_clstoken": x_norm[:, 0],
278
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
279
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
280
+ "x_prenorm": x,
281
+ "masks": masks,
282
+ }
283
+
284
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
285
+ x = self.prepare_tokens_with_masks(x)
286
+ # If n is an int, take the n last blocks. If it's a list, take them
287
+ output, total_block_len = [], len(self.blocks)
288
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
289
+ for i, blk in enumerate(self.blocks):
290
+ x = blk(x)
291
+ if i in blocks_to_take:
292
+ output.append(x)
293
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
294
+ return output
295
+
296
+ def _get_intermediate_layers_chunked(self, x, n=1):
297
+ x = self.prepare_tokens_with_masks(x)
298
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
299
+ # If n is an int, take the n last blocks. If it's a list, take them
300
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
301
+ for block_chunk in self.blocks:
302
+ for blk in block_chunk[i:]: # Passing the nn.Identity()
303
+ x = blk(x)
304
+ if i in blocks_to_take:
305
+ output.append(x)
306
+ i += 1
307
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
308
+ return output
309
+
310
+ def get_intermediate_layers(
311
+ self,
312
+ x: torch.Tensor,
313
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
314
+ reshape: bool = False,
315
+ return_class_token: bool = False,
316
+ norm=True,
317
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
318
+ if self.chunked_blocks:
319
+ outputs = self._get_intermediate_layers_chunked(x, n)
320
+ else:
321
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
322
+ if norm:
323
+ outputs = [self.norm(out) for out in outputs]
324
+ class_tokens = [out[:, 0] for out in outputs]
325
+ outputs = [out[:, 1 + self.num_register_tokens :] for out in outputs]
326
+ if reshape:
327
+ B, _, w, h = x.shape
328
+ outputs = [
329
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
330
+ for out in outputs
331
+ ]
332
+ if return_class_token:
333
+ return tuple(zip(outputs, class_tokens))
334
+ return tuple(outputs)
335
+
336
+ def forward(self, *args, is_training=True, **kwargs):
337
+ ret = self.forward_features(*args, **kwargs)
338
+ if is_training:
339
+ return ret
340
+ else:
341
+ return self.head(ret["x_norm_clstoken"])
342
+
343
+
344
+ def init_weights_vit_timm(module: nn.Module, name: str = ""):
345
+ """ViT weight initialization, original timm impl (for reproducibility)"""
346
+ if isinstance(module, nn.Linear):
347
+ trunc_normal_(module.weight, std=0.02)
348
+ if module.bias is not None:
349
+ nn.init.zeros_(module.bias)
350
+
351
+
352
+ def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
353
+ model = DinoVisionTransformer(
354
+ patch_size=patch_size,
355
+ embed_dim=384,
356
+ depth=12,
357
+ num_heads=6,
358
+ mlp_ratio=4,
359
+ block_fn=partial(Block, attn_class=MemEffAttention),
360
+ num_register_tokens=num_register_tokens,
361
+ **kwargs,
362
+ )
363
+ return model
364
+
365
+
366
+ def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
367
+ model = DinoVisionTransformer(
368
+ patch_size=patch_size,
369
+ embed_dim=768,
370
+ depth=12,
371
+ num_heads=12,
372
+ mlp_ratio=4,
373
+ block_fn=partial(Block, attn_class=MemEffAttention),
374
+ num_register_tokens=num_register_tokens,
375
+ **kwargs,
376
+ )
377
+ return model
378
+
379
+
380
+ def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
381
+ model = DinoVisionTransformer(
382
+ patch_size=patch_size,
383
+ embed_dim=1024,
384
+ depth=24,
385
+ num_heads=16,
386
+ mlp_ratio=4,
387
+ block_fn=partial(Block, attn_class=MemEffAttention),
388
+ num_register_tokens=num_register_tokens,
389
+ **kwargs,
390
+ )
391
+ return model
392
+
393
+
394
+ def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
395
+ """
396
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
397
+ """
398
+ model = DinoVisionTransformer(
399
+ patch_size=patch_size,
400
+ embed_dim=1536,
401
+ depth=40,
402
+ num_heads=24,
403
+ mlp_ratio=4,
404
+ block_fn=partial(Block, attn_class=MemEffAttention),
405
+ num_register_tokens=num_register_tokens,
406
+ **kwargs,
407
+ )
408
+ return model
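A hedged usage sketch for the ViT factories above. It assumes the attention/block layers they import (MemEffAttention, NestedTensorBlock) are present elsewhere in this commit, and mirrors how the aggregator below configures the backbone (patch_size=16, block_chunks=0, init_values=1.0):

import torch
from vggt.layers.vision_transformer import vit_large

model = vit_large(patch_size=16, img_size=512, num_register_tokens=4, block_chunks=0, init_values=1.0)
x = torch.randn(1, 3, 512, 512)
out = model(x)                                   # is_training defaults to True -> feature dict
patches = out["x_norm_patchtokens"]              # (1, (512 // 16) ** 2, 1024)
assert patches.shape == (1, 32 * 32, 1024)

# Intermediate features from the last 4 blocks, reshaped into (B, C, H/16, W/16) maps:
feats = model.get_intermediate_layers(x, n=4, reshape=True)
assert feats[-1].shape == (1, 1024, 32, 32)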
vggt/models/aggregator.py ADDED
@@ -0,0 +1,473 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+
8
+ import logging
9
+
10
+ import pdb
11
+ import math
12
+ import numpy as np
13
+ import os
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.nn.functional as F
17
+ from hydra.utils import instantiate
18
+ from torch.nn.init import trunc_normal_
19
+
20
+ from torch.utils.checkpoint import checkpoint
21
+ from omegaconf import OmegaConf
22
+ from contextlib import nullcontext
23
+
24
+ from typing import Any, Dict, List, Optional, Tuple, Union
25
+
26
+ # from off3d.utils.train_utils import remove_if_not_match
27
+
28
+ # from off3d.models.modules import AttnBlock, CrossAttnBlock, Mlp, ResidualBlock, RoPEAttnBlock
29
+ # from vggsfm.models.utils import get_2d_sincos_pos_embed, get_1d_sincos_pos_embed_from_grid
30
+ # from off3d.models.dino_layers import SwiGLUFFNFused, PatchEmbed
31
+
32
+ from vggt.layers import SwiGLUFFNFused, PatchEmbed
33
+ from vggt.layers.block import Block
34
+
35
+ # from off3d.models.dino_layers.block import Block
36
+ # from vggt.layers.rope import RoPE2D, PositionGetter
37
+ from vggt.layers.rope import RoPE2D, PositionGetter
38
+
39
+ # from off3d.models.multihead_with_qk_norm import MultiheadAttention_with_qk_norm
40
+ # from off3d.models.rope import RoPEMulitheadAttention
41
+
42
+ from vggt.layers.vision_transformer import vit_small, vit_base, vit_large, vit_giant2
43
+
44
+
45
+ logger = logging.getLogger(__name__)
46
+
47
+
48
+
49
+ _RESNET_MEAN = [0.485, 0.456, 0.406]
50
+ _RESNET_STD = [0.229, 0.224, 0.225]
51
+
52
+
53
+
54
+
55
+ class Aggregator(nn.Module):
56
+ def __init__(
57
+ self,
58
+ image_size = 512,
59
+ patch_size = 16,
60
+ num_register_tokens = 4,
61
+ image_backbone = "dinov2_vitl14_reg",
62
+ aa_block_size = 1,
63
+ aa_layer_size = 24,
64
+ aa_block_kwargs = Dict,
65
+ attn_block = Block,
66
+ aa_order = ["frame", "global"],
67
+ use_checkpoint = False,
68
+ use_reentrant = False,
69
+ use_dino_tokens = False,
70
+ use_patch_tokens_only = False,
71
+ freeze_dino=False,
72
+ freeze_dino_inter=False,
73
+ # pose_embed=False,
74
+ embed_type="no",
75
+ patch_embed_by_conv=False,
76
+ decoder_load_dino=False,
77
+ backbone_qk_norm=False,
78
+ **kwargs,
79
+ ):
80
+ super().__init__()
81
+
82
+ if image_backbone is None:
83
+ self.image_backbone = None
84
+ else:
85
+ self.__build_image_backbone__(image_backbone, image_size,
86
+ patch_size, num_register_tokens, freeze_dino=freeze_dino,
87
+ freeze_dino_inter=freeze_dino_inter, backbone_qk_norm=backbone_qk_norm)
88
+
89
+
90
+ self.freeze_dino = freeze_dino
91
+
92
+ if use_checkpoint and not freeze_dino:
93
+ self.image_backbone.use_checkpoint = True
94
+ else:
95
+ self.image_backbone.use_checkpoint = False
96
+
97
+ self.image_backbone.use_reentrant = use_reentrant
98
+
99
+ if aa_block_kwargs['rope_freq']>0:
100
+ self.rope = RoPE2D(freq=aa_block_kwargs['rope_freq'])
101
+ self.position_getter = PositionGetter()
102
+ else:
103
+ self.rope = None
104
+
105
+ frame_blocks_list = []
106
+ global_blocks_list = []
107
+ for _ in range(aa_layer_size):
108
+ frame_blocks_list.append(attn_block(**aa_block_kwargs, rope=self.rope))
109
+ global_blocks_list.append(attn_block(**aa_block_kwargs, rope=self.rope))
110
+
111
+ self.frame_blocks = nn.ModuleList(frame_blocks_list)
112
+ self.global_blocks = nn.ModuleList(global_blocks_list)
113
+
114
+ if "mlp" in embed_type:
115
+ self.register_mlp = nn.ModuleList([nn.Linear(aa_block_kwargs['dim'], aa_block_kwargs['dim']) for _ in range(aa_layer_size)])
116
+
117
+ self.aa_order = aa_order
118
+ self.aa_block_size = aa_block_size
119
+ self.aa_layer_size = aa_layer_size
120
+
121
+ assert self.aa_layer_size % self.aa_block_size == 0, "aa_layer_size must be divisible by aa_block_size"
122
+ self.aa_block_num = self.aa_layer_size // self.aa_block_size
123
+
124
+ self.patch_size = patch_size
125
+ self.use_checkpoint = use_checkpoint
126
+ self.use_reentrant = use_reentrant
127
+ self.use_dino_tokens = use_dino_tokens
128
+ self.use_patch_tokens_only = use_patch_tokens_only
129
+ # self.pose_embed = pose_embed
130
+ # self.register_embed = register_embed
131
+ self.embed_type = embed_type
132
+
133
+ if self.use_patch_tokens_only:
134
+ self.query_ref_token = nn.Parameter(torch.randn(1, 2, 1, aa_block_kwargs['dim']))
135
+ self.patch_start_idx = 0
136
+ nn.init.normal_(self.query_ref_token, std=1e-6)
137
+ elif self.use_dino_tokens:
138
+ # One for query frame and one for other frames
139
+ self.query_ref_token = nn.Parameter(torch.randn(1, 2, 1, aa_block_kwargs['dim']))
140
+ self.patch_start_idx = 1 + num_register_tokens + 1
141
+ nn.init.normal_(self.query_ref_token, std=1e-6)
142
+ else:
143
+ self.pose_token = nn.Parameter(torch.randn(1, 2, 1, aa_block_kwargs['dim']))
144
+ self.register_token = nn.Parameter(torch.randn(1, 2, num_register_tokens, aa_block_kwargs['dim']))
145
+ self.patch_start_idx = 1 + num_register_tokens
146
+ nn.init.normal_(self.pose_token, std=1e-6)
147
+ nn.init.normal_(self.register_token, std=1e-6)
148
+
149
+
150
+ if decoder_load_dino:
151
+ dinov2_weights = self.image_backbone.state_dict()
152
+ decoder_dinov2_weights = dino_to_aggregator(dinov2_weights)
153
+ missing_keys, unexpected_keys = self.load_state_dict(decoder_dinov2_weights, strict=False)
154
+ print(f"missing_keys for decoder_load_dino: {missing_keys}")
155
+ print(f"unexpected_keys for decoder_load_dino: {unexpected_keys}")
156
+
157
+ if patch_embed_by_conv:
158
+ self.image_backbone = self.image_backbone.patch_embed
159
+
160
+
161
+ for name, value in (
162
+ ("_resnet_mean", _RESNET_MEAN),
163
+ ("_resnet_std", _RESNET_STD),
164
+ ):
165
+ self.register_buffer(
166
+ name,
167
+ torch.FloatTensor(value).view(1, 1, 3, 1, 1),
168
+ persistent=False,
169
+ )
170
+
171
+
172
+ def __build_image_backbone__(self, image_backbone, image_size, patch_size, num_register_tokens,
173
+ interpolate_antialias=True,
174
+ interpolate_offset=0.0,
175
+ block_chunks=0,
176
+ init_values=1.0,
177
+ freeze_dino=False,
178
+ freeze_dino_inter=False,
179
+ backbone_qk_norm=False,
180
+ ):
181
+
182
+ vit_models = { "dinov2_vitl14_reg": vit_large,
183
+ "dinov2_vitb14_reg": vit_base,
184
+ "dinov2_vits14_reg": vit_small,
185
+ "dinov2_vitg2_reg": vit_giant2,
186
+ }
187
+
188
+ if image_backbone not in vit_models:
189
+ raise NotImplementedError
190
+
191
+ self.image_backbone = vit_models[image_backbone](img_size=image_size,
192
+ patch_size=patch_size, num_register_tokens=num_register_tokens,
193
+ interpolate_antialias=interpolate_antialias,
194
+ interpolate_offset=interpolate_offset,
195
+ block_chunks=block_chunks, init_values=init_values, qk_norm=backbone_qk_norm)
196
+
197
+ # pretrained_model = torch.hub.load("facebookresearch/dinov2", image_backbone)
198
+ # pretrained_model_dict = pretrained_model.state_dict()
199
+ # image_backbone_dict = self.image_backbone.state_dict()
200
+
201
+ # all_pretrained_keys = list(pretrained_model_dict.keys())
202
+
203
+ # for cur_key in all_pretrained_keys:
204
+ # pretrained_model_dict = remove_if_not_match(image_backbone_dict, pretrained_model_dict, cur_key)
205
+
206
+ # missing_keys, unexpected_keys = self.image_backbone.load_state_dict(pretrained_model_dict, strict=False)
207
+
208
+ self.image_backbone.mask_token.requires_grad_(False)
209
+ # self.image_backbone.freeze_dino = freeze_dino
210
+
211
+ # if freeze_dino:
212
+ # print("Freezing DINO layers")
213
+ # for name, param in self.image_backbone.named_parameters():
214
+ # param.requires_grad_(False)
215
+
216
+ # if freeze_dino_inter:
217
+ # print("Freezing DINO intermediate layers")
218
+ # for name, param in self.image_backbone.named_parameters():
219
+ # if name not in ['pos_embed', 'patch_embed.proj.weight']:
220
+ # param.requires_grad_(False)
221
+
222
+
223
+ # print("Loading pretrained DINO v2 model: ")
224
+ # print(f"missing_keys: {missing_keys}")
225
+ # print("Loading pretrained DINO v2 model: ")
226
+ # print(f"unexpected_keys: {unexpected_keys}")
227
+
228
+
229
+ def forward(
230
+ self, images,
231
+ masks=None,
232
+ batch=None,
233
+ ):
234
+ """
235
+ TODO List:
236
+
237
+ """
238
+
239
+ # The input images are in the range of [0, 1]
240
+ B, S, C_in, H, W = images.shape
241
+ device = images.device
242
+
243
+
244
+ images = (images - self._resnet_mean) / self._resnet_std
245
+
246
+
247
+ if self.image_backbone is not None:
248
+ images = images.view(B * S, C_in, H, W)
249
+
250
+ with torch.no_grad() if self.freeze_dino else nullcontext():
251
+ backbone_output = self.image_backbone(images)
252
+
253
+ if isinstance(backbone_output, dict):
254
+ patch_tokens = backbone_output["x_norm_patchtokens"]
255
+ else:
256
+ patch_tokens = backbone_output
257
+
258
+ BS, P, C = patch_tokens.shape
259
+
260
+ if self.use_patch_tokens_only:
261
+ indicator_tokens = slice_expand_and_flatten(self.query_ref_token, B, S)
262
+ tokens = patch_tokens + indicator_tokens
263
+ elif self.use_dino_tokens:
264
+ dino_cls_token = backbone_output["x_norm_clstoken"][:, None] # BS, 1, C
265
+ dino_register_tokens = backbone_output["x_norm_regtokens"] # BS, num_register_tokens, C
266
+
267
+ indicator_tokens = slice_expand_and_flatten(self.query_ref_token, B, S)
268
+ tokens = torch.cat([dino_cls_token, dino_register_tokens, indicator_tokens, patch_tokens], dim=1)
269
+ else:
270
+ # B, S, P, C
271
+ pose_token = slice_expand_and_flatten(self.pose_token, B, S)
272
+ register_token = slice_expand_and_flatten(self.register_token, B, S)
273
+
274
+ tokens = torch.cat([pose_token, register_token, patch_tokens], dim=1)
275
+ else:
276
+ # Backbone-free token path is not implemented yet
277
+ raise NotImplementedError
278
+
279
+
280
+ if self.rope is not None:
281
+ pos = self.position_getter(B*S, H//self.patch_size, W//self.patch_size, device=device)
282
+ else:
283
+ pos = None
284
+
285
+
286
+
287
+ if self.patch_start_idx > 0:
288
+ # shift the position by 1 so that the special tokens are at 0
289
+ pos = pos + 1
290
+ pos_special = torch.zeros(B*S, self.patch_start_idx, 2).to(device).to(pos.dtype)
291
+ pos = torch.cat([pos_special, pos], dim=1)
292
+
293
+
294
+ _, P, C = tokens.shape
295
+
296
+
297
+ frame_idx = 0
298
+ global_idx = 0
299
+ output_list = []
300
+
301
+
302
+ for aa_block_idx in range(self.aa_block_num):
303
+ for attn_type in self.aa_order:
304
+ if attn_type == "frame":
305
+ tokens, frame_idx, frame_intermediates = self._process_frame_attention(
306
+ tokens, B, S, P, C, frame_idx, self.aa_block_size, pos=pos
307
+ )
308
+ elif attn_type == "global":
309
+ tokens, global_idx, global_intermediates = self._process_global_attention(
310
+ tokens, B, S, P, C, global_idx, self.aa_block_size, pos=pos
311
+ )
312
+ else:
313
+ raise ValueError(f"Unknown attention type: {attn_type}")
314
+
315
+
316
+ # for frame_inter, global_inter in zip(frame_intermediates, global_intermediates):
317
+ # concat_inter = torch.cat([frame_inter, global_inter], dim=-1) # [B x S x P x 2C]
318
+ # output_list.append(concat_inter)
319
+
320
+ for i in range(len(frame_intermediates)):
321
+ # [B x S x P x 2C]
322
+ concat_inter = torch.cat([frame_intermediates[i], global_intermediates[i]], dim=-1)
323
+ output_list.append(concat_inter)
324
+
325
+
326
+ del concat_inter
327
+ del frame_intermediates
328
+ del global_intermediates
329
+ return output_list, None, self.patch_start_idx
330
+
331
+
332
+
333
+ def _process_frame_attention(self, tokens, B, S, P, C, frame_idx, num_blocks, pos=None):
334
+ """
335
+ Process frame attention blocks.
336
+ """
337
+ if tokens.shape != (B*S, P, C):
338
+ tokens = tokens.view(B, S, P, C)
339
+ tokens = tokens.view(B*S, P, C)
340
+
341
+ if pos is not None and pos.shape != (B*S, P, 2):
342
+ pos = pos.view(B, S, P, 2)
343
+ pos = pos.view(B*S, P, 2)
344
+
345
+ intermediates = []
346
+
347
+ for _ in range(num_blocks):
348
+ if self.use_checkpoint:
349
+ tokens = checkpoint(self.frame_blocks[frame_idx], tokens, pos, use_reentrant=self.use_reentrant)
350
+ else:
351
+ tokens = self.frame_blocks[frame_idx](tokens, pos=pos)
352
+ frame_idx += 1
353
+ intermediates.append(tokens.view(B, S, P, C))
354
+
355
+ return tokens, frame_idx, intermediates
356
+
357
+ def _process_global_attention(self, tokens, B, S, P, C, global_idx, num_blocks, pos=None):
358
+ """
359
+ Process global attention blocks.
360
+ """
361
+ # pose_embed
362
+
363
+ if tokens.shape != (B, S*P, C):
364
+ tokens = tokens.view(B, S, P, C)
365
+
366
+
367
+ ############################################################
368
+ # Frame embedding
369
+ if "register" in self.embed_type:
370
+ embed_tokens = tokens[:, :, 1:2, ...].clone()
371
+ if "gauss" in self.embed_type:
372
+ embed_tokens = torch.randn((B, S, 1, C),device=tokens.device, dtype=tokens.dtype)
373
+
374
+ if self.embed_type != "no":
375
+ embed_tokens = F.normalize(embed_tokens, dim=-1)
376
+
377
+ if "mlp" in self.embed_type:
378
+ embed_tokens = self.register_mlp[global_idx](embed_tokens)
379
+
380
+ if "mlpnorm" in self.embed_type:
381
+ embed_tokens = F.normalize(embed_tokens, dim=-1)
382
+ if "all" in self.embed_type:
383
+ tokens = tokens + embed_tokens
384
+ elif "part" in self.embed_type:
385
+ tokens[:, :, self.patch_start_idx:] = tokens[:, :, self.patch_start_idx:] + embed_tokens
386
+ else:
387
+ assert self.embed_type == "no"
388
+
389
+ if "postnorm" in self.embed_type:
390
+ tokens = F.normalize(tokens, dim=-1)
391
+ # tokens = self.embed_norm(tokens)
392
+ ############################################################
393
+
394
+
395
+
396
+ tokens = tokens.view(B, S*P, C)
397
+
398
+ if pos is not None and pos.shape != (B, S*P, 2):
399
+ pos = pos.view(B, S, P, 2)
400
+ pos = pos.view(B, S*P, 2)
401
+
402
+ intermediates = []
403
+ for _ in range(num_blocks):
404
+ if self.use_checkpoint:
405
+ tokens = checkpoint(self.global_blocks[global_idx], tokens, pos, use_reentrant=self.use_reentrant)
406
+ else:
407
+ tokens = self.global_blocks[global_idx](tokens, pos=pos)
408
+ global_idx += 1
409
+ intermediates.append(tokens.view(B, S, P, C))
410
+
411
+ return tokens, global_idx, intermediates
412
+
413
+
414
+
415
+
416
+ def slice_expand_and_flatten(token_tensor, B, S):
417
+ """
418
+ 1) Takes the first token (index=0) for the query frame and the remaining token(s) for the other S-1 frames.
419
+ 2) Expands them along batch dimension B.
420
+ 3) Concatenates along the time/sequence dimension => (B, S, ...).
421
+ 4) Flattens the first two dims to produce => (B*S, ...).
422
+
423
+ Args:
424
+ token_tensor: a tensor expected to have shape (1, S, ...) or (some_batch, S, ...).
425
+ We'll slice along dim=1.
426
+ B: batch size.
427
+ S: number of frames/time-steps.
428
+
429
+ Returns:
430
+ Flattened token tensor of shape (B*S, ...).
431
+ """
432
+
433
+ # Slice out the "query" tokens => shape (1, 1, ...)
434
+ query = token_tensor[:, 0:1, ...].expand(B, 1, *token_tensor.shape[2:])
435
+ # Slice out the "other" tokens => shape (1, S-1, ...)
436
+ others = token_tensor[:, 1:, ...].expand(B, S - 1, *token_tensor.shape[2:])
437
+ # Concatenate => shape (B, S, ...)
438
+ combined = torch.cat([query, others], dim=1)
439
+
440
+ # Finally flatten => shape (B*S, ...)
441
+ combined = combined.view(B * S, *combined.shape[2:])
442
+ return combined
443
+
444
+
445
+
446
+
447
+ def dino_to_aggregator(dinov2_weights):
448
+ new_dinov2_weights = {}
449
+ for key, value in dinov2_weights.items():
450
+ if "blocks" in key:
451
+ for new_attn_key in ["frame_blocks", "global_blocks"]:
452
+ new_key = key.replace("blocks", new_attn_key)
453
+ # if 'attn' in key:
454
+ # if "qkv.weight" in key:
455
+ # new_key = new_key.replace('qkv.weight', 'in_proj_weight')
456
+ # elif "qkv.bias" in key:
457
+ # new_key = new_key.replace('qkv.bias', 'in_proj_bias')
458
+ # elif 'proj.weight' in key:
459
+ # new_key = new_key.replace('proj.weight', 'out_proj.weight')
460
+ # elif 'proj.bias' in key:
461
+ # new_key = new_key.replace('proj.bias', 'out_proj.bias')
462
+ new_dinov2_weights[new_key] = value.clone()
463
+ return new_dinov2_weights
464
+
465
+
466
+
467
+
468
+ def remove_if_not_match(model_state_dict, state_dict, key):
469
+ if key in state_dict.keys() and key in model_state_dict.keys():
470
+ if state_dict[key].shape != model_state_dict[key].shape:
471
+ print(f"Warning: {key} shape mismatch, removing it")
472
+ del state_dict[key]
473
+ return state_dict
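A short, self-contained check of the slice_expand_and_flatten helper above, which is how the learned pose/register tokens are shared: slot 0 of the (1, 2, ...) parameter goes to the query frame, slot 1 is reused for the remaining S-1 frames, and the result is flattened to B*S for per-frame attention. (Importing this module also requires hydra/omegaconf, per the imports at the top of the file.)

import torch
from vggt.models.aggregator import slice_expand_and_flatten  # module-level helper defined above

B, S, C = 2, 5, 1024
pose_token = torch.randn(1, 2, 1, C)                     # learned (1, 2, 1, C) parameter, as in Aggregator.__init__
flat = slice_expand_and_flatten(pose_token, B, S)
assert flat.shape == (B * S, 1, C)
per_frame = flat.view(B, S, 1, C)
assert torch.equal(per_frame[0, 0], pose_token[0, 0])    # query frame -> slot 0
assert torch.equal(per_frame[0, 1], pose_token[0, 1])    # every other frame -> slot 1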
vggt/models/vggt.py ADDED
@@ -0,0 +1,156 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import os
5
+ from typing import Any, Dict, List, Optional, Tuple, Union
6
+
7
+ # from off3d.models.vggt.utils import random_mask_single_patch_vectorized # Removed unused import
8
+ from hydra.utils import instantiate
9
+ # from .loss import *
10
+
11
+ def configure_dict(module, **attributes):
12
+ if module:
13
+ for attr, value in attributes.items():
14
+ setattr(module, attr, value)
15
+
16
+
17
+ class VGGT(nn.Module):
18
+ def __init__(self,
19
+ AGGREGATOR: Dict,
20
+ CameraHead: Dict,
21
+ PointHead: Dict,
22
+ DepthHead: Dict,
23
+ MatchHead: Dict,
24
+ TrackHead: Dict,
25
+ num_register_tokens,
26
+ init_values,
27
+ qk_norm,
28
+ ffn_layer,
29
+ patch_size,
30
+ enable_head_mp=False,
31
+ **kwargs):
32
+ super().__init__()
33
+
34
+ config_attrs = {
35
+ 'patch_size': patch_size,
36
+ 'init_values': init_values,
37
+ 'qk_norm': qk_norm,
38
+ 'ffn_layer': ffn_layer,
39
+ 'num_register_tokens': num_register_tokens
40
+ }
41
+
42
+
43
+ if AGGREGATOR:
44
+ configure_dict(AGGREGATOR, **config_attrs)
45
+ self.aggregator = instantiate(AGGREGATOR, _recursive_=False)
46
+ else:
47
+ self.aggregator = None
48
+
49
+ if CameraHead:
50
+ configure_dict(CameraHead, **config_attrs)
51
+ CameraHead.loss_kwargs.pose_encoding_type = CameraHead.pose_encoding_type
52
+ self.camera_head_loss_kwargs = CameraHead.loss_kwargs
53
+ self.camera_head = instantiate(CameraHead, _recursive_=False)
54
+ else:
55
+ self.camera_head = None
56
+
57
+ if PointHead:
58
+ configure_dict(PointHead, **config_attrs)
59
+ self.point_head_loss_kwargs = PointHead.loss_kwargs
60
+ self.point_head = instantiate(PointHead, _recursive_=False)
61
+ else:
62
+ self.point_head = None
63
+
64
+ if DepthHead:
65
+ configure_dict(DepthHead, **config_attrs)
66
+ self.depth_head_loss_kwargs = DepthHead.loss_kwargs
67
+ self.depth_head = instantiate(DepthHead, _recursive_=False)
68
+ else:
69
+ self.depth_head = None
70
+
71
+ if MatchHead:
72
+ configure_dict(MatchHead, **config_attrs)
73
+ self.match_head_loss_kwargs = MatchHead.loss_kwargs
74
+ self.match_head = instantiate(MatchHead, _recursive_=False)
75
+ else:
76
+ self.match_head = None
77
+
78
+ if TrackHead:
79
+ configure_dict(TrackHead, **config_attrs)
80
+ self.track_head_loss_kwargs = TrackHead.loss_kwargs
81
+ self.track_head = instantiate(TrackHead, _recursive_=False)
82
+ else:
83
+ self.track_head = None
84
+
85
+ self.enable_head_mp = enable_head_mp
86
+ # self.mask_patch_ratio = mask_patch_ratio
87
+ # self.mask_patch_size = mask_patch_size
88
+
89
+
90
+ def forward(self, batch, device=None):
91
+ images = (batch["images"]) #.to(device) # B x S x 3 x H x W
92
+ # intrinsics = (batch["intrinsics"])#.to(device)
93
+ # extrinsics = (batch["extrinsics"])#.to(device)
94
+ B, S, C, H, W = images.shape
95
+
96
+
97
+ # if self.training and self.mask_patch_ratio > 0: # Commented out masking
98
+ # for _ in range(1000):
99
+ # print("Please do not use mask_patch_ratio for now")
100
+
101
+ # predictions = {} # Removed redundant dict
102
+
103
+ aggregated_tokens_list, _, patch_start_idx = self.aggregator(images, batch=batch)
104
+
105
+
106
+ # Pose branch
107
+ # TODO check pose encoding conversion # Removed TODO
108
+ # loss = 0
109
+
110
+
111
+ predictions = {}
112
+
113
+
114
+
115
+ # well by default we use amp for track head
116
+ if self.track_head is not None:
117
+ track_loss_dict = self.track_head(aggregated_tokens_list, batch=batch, patch_start_idx=patch_start_idx)
118
+ predictions.update(track_loss_dict)
119
+
120
+
121
+ with torch.cuda.amp.autocast(enabled=self.enable_head_mp):
122
+ if self.camera_head is not None:
123
+ pred_pose_enc_list = self.camera_head(aggregated_tokens_list, batch=batch, patch_start_idx=patch_start_idx)
124
+ camera_loss_dict = {}
125
+ camera_loss_dict["pred_extrinsic_list"] = pred_pose_enc_list
126
+ # with torch.cuda.amp.autocast(enabled=False):
127
+ # if not isinstance(pred_pose_enc_list, dict):
128
+ # camera_loss_dict, last_pred_extrinsic = camera_loss(pred_pose_enc_list, batch, **self.camera_head_loss_kwargs)
129
+ # predictions["pred_extrinsic"] = last_pred_extrinsic
130
+ # else:
131
+ # camera_loss_dict = pred_pose_enc_list
132
+ predictions.update(camera_loss_dict)
133
+
134
+ if self.point_head is not None:
135
+ pts3d, pts3d_conf = self.point_head(aggregated_tokens_list, batch=batch, patch_start_idx=patch_start_idx)
136
+ # with torch.cuda.amp.autocast(enabled=False):
137
+ # pts3d_loss_dict = point_loss(pts3d, pts3d_conf, batch, **self.point_head_loss_kwargs)
138
+ # predictions.update(pts3d_loss_dict)
139
+ predictions["pred_world_points"] = pts3d
140
+ predictions["pred_world_points_conf"] = pts3d_conf
141
+
142
+ if self.depth_head is not None:
143
+ depth, depth_conf = self.depth_head(aggregated_tokens_list, batch=batch, patch_start_idx=patch_start_idx)
144
+ # with torch.cuda.amp.autocast(enabled=False):
145
+ # depth_loss_dict = depth_loss(depth, depth_conf, batch, **self.depth_head_loss_kwargs)
146
+ # predictions.update(depth_loss_dict)
147
+ predictions["pred_depth"] = depth
148
+ predictions["pred_depth_conf"] = depth_conf
149
+
150
+ if self.match_head is not None:
151
+ match_loss_dict = self.match_head(aggregated_tokens_list, batch=batch, patch_start_idx=patch_start_idx)
152
+ predictions.update(match_loss_dict)
153
+
154
+ predictions.update(batch)
155
+
156
+ return predictions
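A hedged sketch of the forward contract for VGGT above; the head configs are hydra nodes defined elsewhere in the repo, so construction is only indicated in comments, and the output key names come from the forward pass shown here:

import torch

batch = {"images": torch.rand(1, 4, 3, 512, 512)}   # B x S x 3 x H x W, values in [0, 1]

# model = VGGT(AGGREGATOR=agg_cfg, CameraHead=cam_cfg, PointHead=point_cfg, DepthHead=depth_cfg,
#              MatchHead=None, TrackHead=None, num_register_tokens=4, init_values=1.0,
#              qk_norm=False, ffn_layer="mlp", patch_size=16)   # cfgs are hydra configs, not shown in this diff
# predictions = model(batch)
# Depending on which heads are configured, predictions contains:
#   "pred_extrinsic_list"                            (camera head)
#   "pred_world_points", "pred_world_points_conf"    (point head)
#   "pred_depth", "pred_depth_conf"                  (depth head)
# plus everything already in `batch`.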
vggt/utils/pose_enc.py ADDED
@@ -0,0 +1,126 @@
1
+ import torch
2
+ from .rotation import quat_to_mat, mat_to_quat
3
+ # from off3d.utils.metric import closed_form_inverse_OpenCV
4
+
5
+
6
+ def extri_intri_to_pose_encoding(
7
+ extrinsics,
8
+ intrinsics,
9
+ image_size_hw = None, # e.g., (256, 512)
10
+ pose_encoding_type="absT_quaR_FoV",
11
+ min_focal_length=0.1,
12
+ max_focal_length=10,):
13
+
14
+ # extrinsics: BxSx3x4
15
+ # intrinsics: BxSx3x3
16
+
17
+
18
+ if pose_encoding_type=="absT_quaR_FoV":
19
+ R = extrinsics[:, :, :3, :3] # BxSx3x3
20
+ T = extrinsics[:, :, :3, 3] # BxSx3
21
+
22
+ quat = mat_to_quat(R)
23
+ # R_reverse = quat_to_mat(quat)
24
+ # Note the order of h and w here
25
+ H, W = image_size_hw
26
+ fov_h = 2 * torch.atan((H /2) / intrinsics[..., 1, 1])
27
+ fov_w = 2 * torch.atan((W /2) / intrinsics[..., 0, 0])
28
+ pose_encoding = torch.cat([T, quat, fov_h[..., None], fov_w[..., None]], dim=-1).float()
29
+ elif pose_encoding_type=="absT_quaR_OneFLM1":
30
+ # raise ValueError("Not checked after mitigrating to off3d.")
31
+ focal_length = intrinsics[:, :, [0,1], [0,1]] / max(image_size_hw)
32
+ focal_length = focal_length.mean(dim=-1)
33
+ focal_length = focal_length.clamp(min_focal_length, max_focal_length)
34
+ focal_length = focal_length - 1
35
+ R = extrinsics[:, :, :3, :3]
36
+ T = extrinsics[:, :, :3, 3]
37
+ quat = mat_to_quat(R)
38
+ pose_encoding = torch.cat([T, quat, focal_length[..., None]], dim=-1).float()
39
+ else:
40
+ raise NotImplementedError
41
+
42
+ return pose_encoding
43
+
44
+
45
+
46
+ def pose_encoding_to_extri_intri(
47
+ pose_encoding,
48
+ image_size_hw=None, # e.g., (256, 512)
49
+ min_focal_length=0.1,
50
+ max_focal_length=10,
51
+ pose_encoding_type="absT_quaR_FoV",
52
+ build_intrinsics=True):
53
+
54
+ intrinsics = None
55
+
56
+ if pose_encoding_type == "absT_quaR_FoV":
57
+ T = pose_encoding[..., :3]
58
+ quat = pose_encoding[..., 3:7]
59
+ fov_h = pose_encoding[..., 7]
60
+ fov_w = pose_encoding[..., 8]
61
+
62
+ R = quat_to_mat(quat)
63
+ extrinsics = torch.cat([R, T[..., None]], dim=-1)
64
+
65
+ if build_intrinsics:
66
+ H, W = image_size_hw
67
+ fy = (H / 2.0) / torch.tan(fov_h / 2.0)
68
+ fx = (W / 2.0) / torch.tan(fov_w / 2.0)
69
+ intrinsics = torch.zeros(pose_encoding.shape[:2] + (3, 3), device=pose_encoding.device)
70
+ intrinsics[..., 0, 0] = fx
71
+ intrinsics[..., 1, 1] = fy
72
+ intrinsics[..., 0, 2] = W / 2
73
+ intrinsics[..., 1, 2] = H / 2
74
+ intrinsics[..., 2, 2] = 1.0 # Set the homogeneous coordinate to 1
75
+ elif pose_encoding_type == "absT_quaR_OneFLM1":
76
+ T = pose_encoding[..., :3]
77
+ quat = pose_encoding[..., 3:7]
78
+ focal_length_encoded = pose_encoding[..., 7]
79
+ focal_length = (focal_length_encoded + 1).clamp(min_focal_length, max_focal_length)
80
+ focal_length = focal_length * max(image_size_hw)
81
+ R = quat_to_mat(quat)
82
+ extrinsics = torch.cat([R, T[..., None]], dim=-1)
83
+
84
+ if build_intrinsics:
85
+ intrinsics = torch.zeros(pose_encoding.shape[:2] + (3, 3), device=pose_encoding.device)
86
+ intrinsics[..., 0, 0] = focal_length
87
+ intrinsics[..., 1, 1] = focal_length
88
+ intrinsics[..., 0, 2] = image_size_hw[1] / 2
89
+ intrinsics[..., 1, 2] = image_size_hw[0] / 2
90
+
91
+ # NOTE something is wrong here
92
+ intrinsics[..., 2, 2] = 1.0 # Set the homogeneous coordinate to 1
93
+ # TODO: verify the principal point; confirm whether image_size_hw is ordered (H, W) or (W, H)
94
+ else:
95
+ raise NotImplementedError
96
+
97
+ return extrinsics, intrinsics
98
+
99
+
100
+
101
+
102
+ def test_pose_encoding():
103
+ num_tests = 1000
104
+ batch_size = 4
105
+ num_cameras = 2
106
+ image_size_hw = (256, 512)
107
+ min_focal_length = 0.1
108
+ max_focal_length = 30
109
+ pose_encoding_type = "absT_quaR_OneFLM1"
110
+
111
+ for _ in range(num_tests):
112
+ # Generate random extrinsics and intrinsics
113
+ pose_encoding = torch.randn(batch_size, num_cameras, 8)
114
+
115
+ # converting forward and backward, and verifying the consistency
116
+ extrinsics, intrinsics = pose_encoding_to_extri_intri(pose_encoding, image_size_hw, min_focal_length, max_focal_length, pose_encoding_type)
117
+ pose_encoding_back = extri_intri_to_pose_encoding(extrinsics, intrinsics, image_size_hw, pose_encoding_type, min_focal_length, max_focal_length)
118
+ extrinsics_forward, intrinsics_forward = pose_encoding_to_extri_intri(pose_encoding_back, image_size_hw, min_focal_length, max_focal_length, pose_encoding_type)
119
+ pose_encoding_forward = extri_intri_to_pose_encoding(extrinsics_forward, intrinsics_forward, image_size_hw, pose_encoding_type, min_focal_length, max_focal_length)
120
+ assert torch.allclose(pose_encoding_forward[..., :7], pose_encoding_back[..., :7], atol=1e-5), "Pose encoding does not match!"
121
+ print("All tests passed!")
122
+
123
+ if __name__ == "__main__":
124
+ test_pose_encoding()
125
+
126
+
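A round-trip sanity check for the two conversions above, assuming the default "absT_quaR_FoV" encoding (translation, scalar-last quaternion, vertical/horizontal FoV):

import torch
from vggt.utils.pose_enc import extri_intri_to_pose_encoding, pose_encoding_to_extri_intri

B, S, H, W = 1, 2, 256, 512
extrinsics = torch.eye(3, 4).expand(B, S, 3, 4).clone()          # identity rotation, zero translation
intrinsics = torch.tensor([[500.0, 0.0, W / 2],
                           [0.0, 500.0, H / 2],
                           [0.0, 0.0, 1.0]]).expand(B, S, 3, 3).clone()

enc = extri_intri_to_pose_encoding(extrinsics, intrinsics, image_size_hw=(H, W))   # (B, S, 9)
extr_back, intr_back = pose_encoding_to_extri_intri(enc, image_size_hw=(H, W))
assert torch.allclose(extr_back, extrinsics, atol=1e-5)
assert torch.allclose(intr_back, intrinsics, atol=1e-2)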
vggt/utils/rotation.py ADDED
@@ -0,0 +1,200 @@
1
+ # Modified from PyTorch3D
2
+
3
+ import torch
4
+ import numpy as np
5
+ import torch.nn.functional as F
6
+ from scipy.spatial.transform import Rotation as R
7
+
8
+
9
+
10
+ def quat_to_mat(quaternions: torch.Tensor) -> torch.Tensor:
11
+ """
12
+ Quaternion order: XYZW (i.e., ijkr, scalar-last)
13
+
14
+ Convert rotations given as quaternions to rotation matrices.
15
+ Args:
16
+ quaternions: quaternions with real part last,
17
+ as tensor of shape (..., 4).
18
+
19
+ Returns:
20
+ Rotation matrices as tensor of shape (..., 3, 3).
21
+ """
22
+ i, j, k, r = torch.unbind(quaternions, -1)
23
+ # pyre-fixme[58]: `/` is not supported for operand types `float` and `Tensor`.
24
+ two_s = 2.0 / (quaternions * quaternions).sum(-1)
25
+
26
+ o = torch.stack(
27
+ (
28
+ 1 - two_s * (j * j + k * k),
29
+ two_s * (i * j - k * r),
30
+ two_s * (i * k + j * r),
31
+ two_s * (i * j + k * r),
32
+ 1 - two_s * (i * i + k * k),
33
+ two_s * (j * k - i * r),
34
+ two_s * (i * k - j * r),
35
+ two_s * (j * k + i * r),
36
+ 1 - two_s * (i * i + j * j),
37
+ ),
38
+ -1,
39
+ )
40
+ return o.reshape(quaternions.shape[:-1] + (3, 3))
41
+
42
+
43
+ def mat_to_quat(matrix: torch.Tensor) -> torch.Tensor:
44
+ """
45
+ Convert rotations given as rotation matrices to quaternions.
46
+
47
+ Args:
48
+ matrix: Rotation matrices as tensor of shape (..., 3, 3).
49
+
50
+ Returns:
51
+ quaternions with real part last, as tensor of shape (..., 4).
52
+ Quaternion order: XYZW (i.e., ijkr, scalar-last)
53
+ """
54
+ if matrix.size(-1) != 3 or matrix.size(-2) != 3:
55
+ raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
56
+
57
+ batch_dim = matrix.shape[:-2]
58
+ m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(
59
+ matrix.reshape(batch_dim + (9,)), dim=-1
60
+ )
61
+
62
+ q_abs = _sqrt_positive_part(
63
+ torch.stack(
64
+ [
65
+ 1.0 + m00 + m11 + m22,
66
+ 1.0 + m00 - m11 - m22,
67
+ 1.0 - m00 + m11 - m22,
68
+ 1.0 - m00 - m11 + m22,
69
+ ],
70
+ dim=-1,
71
+ )
72
+ )
73
+
74
+ # we produce the desired quaternion multiplied by each of r, i, j, k
75
+ quat_by_rijk = torch.stack(
76
+ [
77
+ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and
78
+ # `int`.
79
+ torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
80
+ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and
81
+ # `int`.
82
+ torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
83
+ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and
84
+ # `int`.
85
+ torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
86
+ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and
87
+ # `int`.
88
+ torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
89
+ ],
90
+ dim=-2,
91
+ )
92
+
93
+ # We floor here at 0.1 but the exact level is not important; if q_abs is small,
94
+ # the candidate won't be picked.
95
+ flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
96
+ quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))
97
+
98
+ # if not for numerical problems, quat_candidates[i] should be same (up to a sign),
99
+ # forall i; we pick the best-conditioned one (with the largest denominator)
100
+ out = quat_candidates[
101
+ F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, :
102
+ ].reshape(batch_dim + (4,))
103
+
104
+ # Convert from rijk to ijkr
105
+ out = out[..., [1, 2, 3, 0]]
106
+
107
+ out = standardize_quaternion(out)
108
+
109
+ return out
110
+
111
+ def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
112
+ """
113
+ Returns torch.sqrt(torch.max(0, x))
114
+ but with a zero subgradient where x is 0.
115
+ """
116
+ ret = torch.zeros_like(x)
117
+ positive_mask = x > 0
118
+ if torch.is_grad_enabled():
119
+ ret[positive_mask] = torch.sqrt(x[positive_mask])
120
+ else:
121
+ ret = torch.where(positive_mask, torch.sqrt(x), ret)
122
+ return ret
123
+
124
+
125
+ def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
126
+ """
127
+ Convert a unit quaternion to a standard form: one in which the real
128
+ part is non negative.
129
+
130
+ Args:
131
+ quaternions: Quaternions with real part last,
132
+ as tensor of shape (..., 4).
133
+
134
+ Returns:
135
+ Standardized quaternions as tensor of shape (..., 4).
136
+ """
137
+ return torch.where(quaternions[..., 3:4] < 0, -quaternions, quaternions)
138
+
139
+
140
+ def quat_to_mat_scipy(quaternions: np.ndarray) -> np.ndarray:
141
+ rotation = R.from_quat(quaternions)
142
+ return rotation.as_matrix()
143
+
144
+ def mat_to_quat_scipy(matrix: np.ndarray) -> np.ndarray:
145
+ rotation = R.from_matrix(matrix)
146
+ return rotation.as_quat()
147
+
148
+
149
+ if __name__ == "__main__":
150
+
151
+ num_tests = 10000 # Number of tests to run
152
+ tolerance = 1e-6 # Tolerance for floating point comparison
153
+
154
+ for _ in range(num_tests):
155
+ # Generate random quaternions
156
+ quaternions = torch.randn(1024, 4)
157
+ quaternions = quaternions / torch.norm(quaternions, dim=-1, keepdim=True) # Normalize to unit quaternions
158
+
159
+ # Convert quaternion to matrix using PyTorch
160
+ matrices_torch = quat_to_mat(quaternions)
161
+
162
+ # Convert matrices back to quaternions using PyTorch
163
+ quaternions_back = mat_to_quat(matrices_torch)
164
+
165
+ # Standardize quaternions to handle the case where quaternions = -quaternions_back
166
+ quaternions = standardize_quaternion(quaternions)
167
+ quaternions_back = standardize_quaternion(quaternions_back)
168
+
169
+ # Check if the original and converted quaternions match
170
+ if not torch.allclose(quaternions, quaternions_back, atol=tolerance):
171
+ print("Mismatch found!")
172
+ print("Original quaternions:", quaternions)
173
+ print("Converted quaternions:", quaternions_back)
174
+ max_error = torch.max(torch.abs(quaternions - quaternions_back))
175
+ print("Max error:", max_error)
176
+ else:
177
+ print("All tests passed successfully!")
178
+
179
+ # write code here
180
+
181
+ # quaternions = torch.randn(1024, 4) * 20
182
+ # # quaternions = quaternions / torch.norm(quaternions, dim=-1, keepdim=True) # Normalize to unit quaternions
183
+
184
+ # # Convert quaternion to matrix using PyTorch
185
+ # matrices_torch = quat_to_mat(quaternions).numpy()
186
+
187
+ # # Convert quaternion to matrix using SciPy
188
+ # matrices_scipy = quat_to_mat_scipy(quaternions.numpy())
189
+
190
+ # # Convert matrices back to quaternions using PyTorch
191
+ # quaternions_torch = mat_to_quat(torch.from_numpy(matrices_scipy)).numpy()
192
+
193
+ # # Convert matrices back to quaternions using SciPy
194
+ # quaternions_scipy = mat_to_quat_scipy(matrices_torch)
195
+
196
+
197
+ # reconvert_mat_diff = quat_to_mat_scipy(quaternions_torch) - quat_to_mat_scipy(quaternions_scipy)
198
+ # # Compare results
199
+ # print("Matrix conversion difference:", np.linalg.norm(matrices_torch - matrices_scipy))
200
+ # print("Quaternion conversion difference:", np.linalg.norm(reconvert_mat_diff))
viser_fn.py ADDED
@@ -0,0 +1,284 @@
+ """Visualization utilities for 3D reconstruction results using Viser.
+
+ Provides tools to visualize predicted camera poses, 3D point clouds, and confidence
+ thresholding through an interactive web interface.
+ """
+
+ import time
+ from pathlib import Path
+ from typing import List, Optional
+
+ import numpy as np
+ import tyro
+ from tqdm.auto import tqdm
+ import cv2
+ import viser
+ import viser.transforms as tf
+ import glob
+ import os
+ from scipy.spatial.transform import Rotation as R
+ # from camera import closed_form_inverse_se3
+ import torch
+ import threading
+
+
+ def viser_wrapper(
+     pred_dict: dict,
+     port: Optional[int] = None,
+     init_conf_threshold: float = 3.0,
+ ) -> None:
+     """Visualize predictions from pred_dict with an interactive viser server.
+
+     Args:
+         pred_dict: Dictionary containing predictions.
+         port: Optional port number for the viser server. If None, a random port will be used.
+         init_conf_threshold: Initial confidence threshold used to filter the point cloud.
+     """
+     print(f"Starting viser server on port {port}")  # Debug print
+
+     server = viser.ViserServer(host="0.0.0.0", port=port)
+     # server = viser.ViserServer(port=port)
+     server.gui.configure_theme(titlebar_content=None, control_layout="collapsible")
+
+     # Unpack and preprocess inputs
+     images = pred_dict["images"]
+     world_points = pred_dict["pred_world_points"]
+     conf = pred_dict["pred_world_points_conf"]
+     extrinsics = pred_dict["last_pred_extrinsic"]
+
+     # Handle batch dimension if present
+     if len(images.shape) > 4:
+         images = images[0]
+         world_points = world_points[0]
+         conf = conf[0]
+         extrinsics = extrinsics[0]
+
+     colors = images.transpose(0, 2, 3, 1)  # Convert to (S, H, W, C)
+
+     # Reshape for visualization
+     S, H, W, _ = world_points.shape
+     colors = (colors.reshape(-1, 3) * 255).astype(np.uint8)  # Convert to 0-255 range
+     conf = conf.reshape(-1)
+     world_points = world_points.reshape(-1, 3)
+
+     # Calculate camera poses in world coordinates
+     cam_to_world = closed_form_inverse_se3(extrinsics)
+     extrinsics = cam_to_world[:, :3, :]
+
+     # Center scene for better visualization
+     scene_center = np.mean(world_points, axis=0)
+     world_points -= scene_center
+     extrinsics[..., -1] -= scene_center
+
+     # Set points3d as world_points
+     points = world_points
+
+     # Per-point frame indices, used to filter the point cloud by frame
+     frame_indices = np.arange(S)
+     frame_indices = frame_indices[:, None, None]  # Shape: (S, 1, 1)
+     frame_indices = np.tile(frame_indices, (1, H, W))  # Shape: (S, H, W)
+     frame_indices = frame_indices.reshape(-1)
+
+     ############################################################
+     ############################################################
+
+     gui_points_conf = server.gui.add_slider(
+         "Confidence Thres",
+         min=0.1,
+         max=20,
+         step=0.05,
+         initial_value=init_conf_threshold,
+     )
+
+     gui_point_size = server.gui.add_slider(
+         "Point size", min=0.00001, max=0.01, step=0.0001, initial_value=0.00001
+     )
+
+     # Change from "Frame Selector" to a more descriptive name
+     gui_frame_selector = server.gui.add_dropdown(
+         "Filter by Frame",  # More action-oriented name
+         options=["All"] + [str(i) for i in range(S)],
+         initial_value="All",
+     )
+
+     # Initial mask shows all points passing the confidence threshold
+     init_conf_mask = conf > init_conf_threshold
+     point_cloud = server.scene.add_point_cloud(
+         name="viser_pcd",
+         points=points[init_conf_mask],
+         colors=colors[init_conf_mask],
+         point_size=gui_point_size.value,
+         point_shape="circle",
+     )
+
+     frames: List[viser.FrameHandle] = []
+
+     def visualize_frames(extrinsics: np.ndarray, intrinsics: np.ndarray, images: np.ndarray) -> None:
+         """Send all camera frames and frustums to viser for visualization. This could be
+         optimized a ton!"""
+         extrinsics = np.copy(extrinsics)
+         # Remove existing image frames.
+         for frame in frames:
+             frame.remove()
+         frames.clear()
+
+         def attach_callback(
+             frustum: viser.CameraFrustumHandle, frame: viser.FrameHandle
+         ) -> None:
+             @frustum.on_click
+             def _(_) -> None:
+                 for client in server.get_clients().values():
+                     client.camera.wxyz = frame.wxyz
+                     client.camera.position = frame.position
+
+         img_ids = sorted(range(S))
+         for img_id in tqdm(img_ids):
+             cam_to_world = extrinsics[img_id]
+
+             T_world_camera = tf.SE3.from_matrix(cam_to_world)
+
+             ratio = 1
+             frame = server.scene.add_frame(
+                 f"frame_{img_id}",
+                 wxyz=T_world_camera.rotation().wxyz,
+                 position=T_world_camera.translation(),
+                 axes_length=0.05 / ratio,
+                 axes_radius=0.002 / ratio,
+                 origin_radius=0.002 / ratio,
+             )
+
+             frames.append(frame)
+
+             img = images[img_id]
+             img = (img.transpose(1, 2, 0) * 255).astype(np.uint8)
+             H, W = img.shape[:2]
+             # fy = intrinsics[img_id, 1, 1] * H
+             fy = 1.1 * H
+             image = img
+             # image = image[::downsample_factor, ::downsample_factor]
+             frustum = server.scene.add_camera_frustum(
+                 f"frame_{img_id}/frustum",
+                 fov=2 * np.arctan2(H / 2, fy),
+                 aspect=W / H,
+                 scale=0.05 / ratio,
+                 image=image,
+                 line_width=1.0,
+                 # line_thickness=0.01,
+             )
+
+             attach_callback(frustum, frame)
+
+     @gui_points_conf.on_update
+     def _(_) -> None:
+         conf_mask = conf > gui_points_conf.value
+         frame_mask = np.ones_like(conf_mask)  # Default to all frames
+         if gui_frame_selector.value != "All":
+             selected_idx = int(gui_frame_selector.value)
+             frame_mask = (frame_indices == selected_idx)
+
+         combined_mask = conf_mask & frame_mask
+         point_cloud.points = points[combined_mask]
+         point_cloud.colors = colors[combined_mask]
+
+     @gui_point_size.on_update
+     def _(_) -> None:
+         point_cloud.point_size = gui_point_size.value
+
+     @gui_frame_selector.on_update
+     def _(_) -> None:
+         """Update points based on frame selection."""
+         conf_mask = conf > gui_points_conf.value
+
+         if gui_frame_selector.value == "All":
+             # Show all points passing the confidence threshold
+             point_cloud.points = points[conf_mask]
+             point_cloud.colors = colors[conf_mask]
+         else:
+             # Show only the selected frame's points
+             selected_idx = int(gui_frame_selector.value)
+             frame_mask = (frame_indices == selected_idx)
+             combined_mask = conf_mask & frame_mask
+             point_cloud.points = points[combined_mask]
+             point_cloud.colors = colors[combined_mask]
+
+             # Move camera to selected frame
+             # if 0 <= selected_idx < len(frames):
+             #     selected_frame = frames[selected_idx]
+             #     for client in server.get_clients().values():
+             #         client.camera.wxyz = selected_frame.wxyz
+             #         client.camera.position = selected_frame.position
+
+     # Initial visualization
+     visualize_frames(extrinsics, None, images)
+
+     # Start a server update loop in a background thread
+     def server_loop():
+         while True:
+             time.sleep(1e-3)  # Small sleep to prevent CPU hogging
+
+     thread = threading.Thread(target=server_loop, daemon=True)
+     thread.start()
+
+
+ def closed_form_inverse_se3(se3, R=None, T=None):
+     """
+     Compute the inverse of each 4x4 (or 3x4) SE3 matrix in a batch.
+
+     If `R` and `T` are provided, they must correspond to the rotation and translation
+     components of `se3`. Otherwise, they will be extracted from `se3`.
+
+     Args:
+         se3: Nx4x4 or Nx3x4 array or tensor of SE3 matrices.
+         R (optional): Nx3x3 array or tensor of rotation matrices.
+         T (optional): Nx3x1 array or tensor of translation vectors.
+
+     Returns:
+         Inverted SE3 matrices with the same type and device as `se3`.
+
+     Shapes:
+         se3: (N, 4, 4)
+         R: (N, 3, 3)
+         T: (N, 3, 1)
+     """
+     # Check if se3 is a numpy array or a torch tensor
+     is_numpy = isinstance(se3, np.ndarray)
+
+     # Validate shapes
+     if se3.shape[-2:] != (4, 4) and se3.shape[-2:] != (3, 4):
+         raise ValueError(f"se3 must be of shape (N,4,4) or (N,3,4), got {se3.shape}.")
+
+     # Extract R and T if not provided
+     if R is None:
+         R = se3[:, :3, :3]  # (N,3,3)
+     if T is None:
+         T = se3[:, :3, 3:]  # (N,3,1)
+
+     # Transpose R
+     if is_numpy:
+         # Compute the transpose of the rotation for NumPy
+         R_transposed = np.transpose(R, (0, 2, 1))
+         # -R^T t for NumPy
+         top_right = -np.matmul(R_transposed, T)
+         inverted_matrix = np.tile(np.eye(4), (len(R), 1, 1))
+     else:
+         R_transposed = R.transpose(1, 2)  # (N,3,3)
+         top_right = -torch.bmm(R_transposed, T)  # (N,3,1)
+         inverted_matrix = torch.eye(4, 4)[None].repeat(len(R), 1, 1)
+         inverted_matrix = inverted_matrix.to(R.dtype).to(R.device)
+
+     inverted_matrix[:, :3, :3] = R_transposed
+     inverted_matrix[:, :3, 3:] = top_right
+
+     return inverted_matrix
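For completeness, a minimal sketch of how this module might be exercised. It assumes the file is importable as `viser_fn` and that viser is installed; the `pred_dict` keys are the ones `viser_wrapper` reads, while the shapes are plausible values inferred from how they are unpacked and are only illustrative:

    import numpy as np
    from scipy.spatial.transform import Rotation as R
    from viser_fn import viser_wrapper, closed_form_inverse_se3

    # Sanity check of the closed-form SE(3) inverse: inv(se3) @ se3 should be identity.
    se3 = np.tile(np.eye(4), (4, 1, 1))
    se3[:, :3, :3] = R.random(4).as_matrix()      # random rotations
    se3[:, :3, 3] = np.random.randn(4, 3)         # random translations
    assert np.allclose(np.matmul(closed_form_inverse_se3(se3), se3), np.eye(4), atol=1e-10)

    # Illustrative predictions: S frames of H x W images, per-pixel world points and
    # confidences, and camera-from-world extrinsics (3x4).
    S, H, W = 2, 64, 64
    pred_dict = {
        "images": np.random.rand(S, 3, H, W).astype(np.float32),
        "pred_world_points": np.random.randn(S, H, W, 3).astype(np.float32),
        "pred_world_points_conf": 10 * np.random.rand(S, H, W).astype(np.float32),
        "last_pred_extrinsic": np.tile(np.eye(4)[:3], (S, 1, 1)).astype(np.float32),
    }
    # viser_wrapper(pred_dict, port=8080, init_conf_threshold=3.0)  # starts the web viewer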