EthanZyh commited on 20 days ago

Commit

8c31d70

1 Parent(s): 636d80f

copied from EthanZyh/DiffusionText2WorldGeneration

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitignore +243 -0
LICENSE +201 -0
README.md +97 -0
RELEASE.md +7 -0
aegis.py +131 -0
ar_config_tokenizer.py +137 -0
ar_configs_base_model.py +118 -0
ar_model.py +596 -0
ar_modules_attention.py +262 -0
ar_modules_embedding.py +491 -0
ar_modules_mlp.py +50 -0
ar_modules_normalization.py +88 -0
ar_networks.py +63 -0
ar_tokenizer.py +322 -0
ar_tokenizer_image_text_tokenizer.py +318 -0
ar_tokenizer_modules.py +560 -0
ar_tokenizer_patching.py +279 -0
ar_tokenizer_quantizers.py +165 -0
ar_tokenizer_text_tokenizer.py +317 -0
ar_tokenizer_utils.py +101 -0
ar_transformer.py +461 -0
ar_utils_misc.py +52 -0
attention.py +305 -0
base_world_generation_pipeline.py +362 -0
batch_ops.py +46 -0
blocklist.py +219 -0
blocks.py +545 -0
blur_utils.py +35 -0
categories.py +192 -0
checkpoint.py +76 -0
conditioner.py +323 -0
config.json +10 -0
config.py +166 -0
config_base_conditioner.py +169 -0
config_helper.py +198 -0
convert_pixtral_ckpt.py +209 -0
cosmos1/models/POST_TRAINING.md +23 -0
cosmos1/models/autoregressive/README.md +427 -0
cosmos1/models/autoregressive/__init__.py +14 -0
cosmos1/models/autoregressive/assets/nemo/finetuned_result.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/0.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/1.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/2.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/3.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/4.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/5.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/6.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/7.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/8.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/9.mp4 +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,243 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Misc
+outputs/
+checkpoints/*
+!checkpoints/README.md
+# Data types
+*.jit
+*.pt
+*.hdr
+*.webp
+*.pgm
+*.tiff
+*.tif
+*.tar
+*.tar.gz
+*.gz
+*.pkl
+*.pt
+*.bin
+# Other uncheckable file types
+*.zip
+*.exe
+*.dll
+*.swp
+*.vscode
+*.ipynb
+*.DS_Store
+*.pyc
+*Thumbs.db
+*.patch
+# Credential information that should never be checked in
+credentials
+*.secret
+# ------------------------ BELOW IS AUTO-GENERATED FOR PYTHON REPOS ------------------------
+# Byte-compiled / optimized / DLL files
+**/__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+results/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.config
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Third party
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# ruff
+.ruff_cache
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+CLIP
+.devcontainer/devcontainer.json
+# Coverage
+.coverage
+coverage.xml
+# JUnit Reports
+report.xml
+# CI-CD
+temp/
+envs.txt
+manifest.json
+# locks and t5 temp files
+*.locks*
+*.no_exist*
+*models--t5*
+# OneLogger
+wandb/
+onelogger.err
+onelogger.log

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md ADDED Viewed

	@@ -0,0 +1,97 @@

+## How to Use
+```python
+from transformers import AutoModel
+model = AutoModel.from_pretrained(
+    "EthanZyh/DiffusionText2WorldGeneration",
+    cache_dir="./cache",
+    trust_remote_code=True,
+    # turn on offloading on a low GPU memory machine:
+    # offload_network=True,
+    # offload_tokenizer=True,
+    # offload_text_encoder_model=True,
+    # offload_prompt_upsampler=True,
+    # offload_guardrail_models=True,
+)
+prompt = "Some text prompt to generate a video"
+model(prompt)
+```
+![Cosmos Logo](https://github.com/NVIDIA/Cosmos/raw/main/assets/cosmos-logo.png)
+--------------------------------------------------------------------------------
+### [Website](https://www.nvidia.com/en-us/ai/cosmos/) | [HuggingFace](https://huggingface.co/collections/nvidia/cosmos-6751e884dc10e013a0a0d8e6) | [GPU-free Preview](https://build.nvidia.com/explore/discover) | [Paper](https://arxiv.org/abs/2501.03575) | [Paper Website](https://research.nvidia.com/labs/dir/cosmos1/)
+[NVIDIA Cosmos](https://www.nvidia.com/cosmos/) is a developer-first world foundation model platform designed to help Physical AI developers build their Physical AI systems better and faster. Cosmos contains:
+1. pre-trained models, available via [Hugging Face](https://huggingface.co/collections/nvidia/cosmos-6751e884dc10e013a0a0d8e6) under the [NVIDIA Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/) that allows commercial use of the models for free
+2. training scripts under the [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0), offered through [NVIDIA Nemo Framework](https://github.com/NVIDIA/NeMo) for post-training the models for various downstream Physical AI applications
+Details of the platform are described in the [Cosmos paper](https://research.nvidia.com/publication/2025-01_cosmos-world-foundation-model-platform-physical-ai). Preview access is available at [build.nvidia.com](https://build.nvidia.com).
+## Key Features
+- [Pre-trained Diffusion-based world foundation models](cosmos1/models/diffusion/README.md) for Text2World and Video2World generation where a user can generate visual simulation based on text prompts and video prompts.
+- [Pre-trained Autoregressive-based world foundation models](cosmos1/models/autoregressive/README.md) for Video2World generation where a user can generate visual simulation based on video prompts and optional text prompts.
+- [Video tokenizers](https://github.com/NVIDIA/Cosmos-Tokenizer) for tokenizing videos into continuous tokens (latent vectors) and discrete tokens (integers) efficiently and effectively.
+- Video curation pipeline for building your own video dataset. [Coming soon]
+- [Post-training scripts](cosmos1/models/POST_TRAINING.md) via NeMo Framework to post-train the pre-trained world foundation models for various Physical AI setups.
+- Pre-training scripts via NeMo Framework for building your own world foundation model. [[Diffusion](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion)] [[Autoregressive](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/multimodal_autoregressive)] [[Tokenizer](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion/vae)].
+## Model Family
+| Model name | Description | Try it out |
+|------------|----------|----------|
+| [Cosmos-1.0-Diffusion-7B-Text2World](https://huggingface.co/nvidia/Cosmos-1.0-Diffusion-7B-Text2World) | Text to visual world generation  | [Inference](cosmos1/models/diffusion/README.md)   |
+| [Cosmos-1.0-Diffusion-14B-Text2World](https://huggingface.co/nvidia/Cosmos-1.0-Diffusion-14B-Text2World) | Text to visual world generation  | [Inference](cosmos1/models/diffusion/README.md)   |
+| [Cosmos-1.0-Diffusion-7B-Video2World](https://huggingface.co/nvidia/Cosmos-1.0-Diffusion-7B-Video2World) | Video + Text based future visual world generation  | [Inference](cosmos1/models/diffusion/README.md)   |
+| [Cosmos-1.0-Diffusion-14B-Video2World](https://huggingface.co/nvidia/Cosmos-1.0-Diffusion-14B-Video2World) | Video + Text based future visual world generation  | [Inference](cosmos1/models/diffusion/README.md)   |
+| [Cosmos-1.0-Autoregressive-4B](https://huggingface.co/nvidia/Cosmos-1.0-Autoregressive-4B) | Future visual world generation  | [Inference](cosmos1/models/autoregressive/README.md)   |
+| [Cosmos-1.0-Autoregressive-12B](https://huggingface.co/nvidia/Cosmos-1.0-Autoregressive-12B) | Future visual world generation  | [Inference](cosmos1/models/autoregressive/README.md)   |
+| [Cosmos-1.0-Autoregressive-5B-Video2World](https://huggingface.co/nvidia/Cosmos-1.0-Autoregressive-5B-Video2World) | Video + Text based future visual world generation | [Inference](cosmos1/models/autoregressive/README.md)   |
+| [Cosmos-1.0-Autoregressive-13B-Video2World](https://huggingface.co/nvidia/Cosmos-1.0-Autoregressive-13B-Video2World) | Video + Text based future visual world generation | [Inference](cosmos1/models/autoregressive/README.md)   |
+| [Cosmos-1.0-Guardrail](https://huggingface.co/nvidia/Cosmos-1.0-Guardrail) | Guardrail contains pre-Guard and post-Guard for safe use | Embedded in model inference scripts |
+## Example Usage
+### Inference
+Follow the [Cosmos Installation Guide](INSTALL.md) to set up Docker. For inference with the pretrained models, please refer to [Cosmos Diffusion Inference](cosmos1/models/diffusion/README.md) and [Cosmos Autoregressive Inference](cosmos1/models/autoregressive/README.md).
+The code snippet below provides a gist of the inference usage.
+```bash
+PROMPT="A sleek, humanoid robot stands in a vast warehouse filled with neatly stacked cardboard boxes on industrial shelves. \
+The robot's metallic body gleams under the bright, even lighting, highlighting its futuristic design and intricate joints. \
+A glowing blue light emanates from its chest, adding a touch of advanced technology. The background is dominated by rows of boxes, \
+suggesting a highly organized storage system. The floor is lined with wooden pallets, enhancing the industrial setting. \
+The camera remains static, capturing the robot's poised stance amidst the orderly environment, with a shallow depth of \
+field that keeps the focus on the robot while subtly blurring the background for a cinematic effect."
+# Example using 7B model
+PYTHONPATH=$(pwd) python cosmos1/models/diffusion/inference/text2world.py \
+    --checkpoint_dir checkpoints \
+    --diffusion_transformer_dir Cosmos-1.0-Diffusion-7B-Text2World \
+    --prompt "$PROMPT" \
+    --offload_prompt_upsampler \
+    --video_save_name Cosmos-1.0-Diffusion-7B-Text2World
+```
+<video src="https://github.com/user-attachments/assets/db7bebfe-5314-40a6-b045-4f6ce0a87f2a">
+  Your browser does not support the video tag.
+</video>
+We also offer [multi-GPU inference](cosmos1/models/diffusion/nemo/inference/README.md) support for Diffusion Text2World WFM models through NeMo Framework.
+### Post-training
+NeMo Framework provides GPU-accelerated general post-training for both [diffusion](cosmos1/models/diffusion/nemo/post_training/README.md) and [autoregressive](cosmos1/models/autoregressive/nemo/post_training/README.md) models, with other types of post-training coming soon.
+## License and Contact
+This project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use.
+NVIDIA Cosmos source code is released under the [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0).
+NVIDIA Cosmos models are released under the [NVIDIA Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). For a custom license, please contact [cosmos-license@nvidia.com](mailto:cosmos-license@nvidia.com).

RELEASE.md ADDED Viewed

	@@ -0,0 +1,7 @@

+# Release Cadence
+| Version | Description | Date |
+|------------|----------|----------|
+| [v1.0](release_notes/v0p1.md) | Initial diffusion and autoregressive WFMs release | 2025-01-06 |
+| [v0.1](release_notes/v0p1.md) | Initial tokenizer release | 2024-11-06 |

aegis.py ADDED Viewed

	@@ -0,0 +1,131 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from .log import log
+import torch
+from peft import PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from .categories import UNSAFE_CATEGORIES
+from .guardrail_core import ContentSafetyGuardrail, GuardrailRunner
+from .misc import misc, Color, timer
+# Verdict labels built with the project's Color helper (presumably terminal-colored text; confirm in .misc).
+SAFE = Color.green("SAFE")
+UNSAFE = Color.red("UNSAFE")
+# Default directory passed as `cache_dir` when the Aegis base model, tokenizer and adapter are loaded.
+DEFAULT_CHECKPOINT_DIR = "checkpoints/Cosmos-1.0-Guardrail/aegis"
+class Aegis(ContentSafetyGuardrail):
+    def __init__(
+        self,
+        checkpoint_dir: str = DEFAULT_CHECKPOINT_DIR,
+        device="cuda" if torch.cuda.is_available() else "cpu",
+    ) -> None:
+        self.checkpoint_dir = checkpoint_dir
+        self.device = device
+        self.dtype = torch.bfloat16
+        base_model_id = "meta-llama/LlamaGuard-7b"
+        aegis_adapter = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
+        base_model = AutoModelForCausalLM.from_pretrained(base_model_id, cache_dir=self.checkpoint_dir)
+        self.tokenizer = AutoTokenizer.from_pretrained(base_model_id, cache_dir=self.checkpoint_dir)
+        self.model = PeftModel.from_pretrained(base_model, aegis_adapter, cache_dir=self.checkpoint_dir)
+        self.model.to(self.device, dtype=self.dtype).eval()
+    def get_moderation_prompt(self, user_prompt: str) -> str:
+        """Create the moderation prompt for the Aegis model."""
+        full_prompt = f"""[INST] Task: Check if there is unsafe content in 'User' messages in conversations according our safety policy with the below categories.
+        <BEGIN UNSAFE CONTENT CATEGORIES>
+        {"".join([f"{category['instruction']}" for category in UNSAFE_CATEGORIES])}
+        <END UNSAFE CONTENT CATEGORIES>
+        <BEGIN CONVERSATION>
+        """
+        full_prompt += f"User: {user_prompt}\n\n"
+        full_prompt += """<END CONVERSATION>
+        Provide your safety assessment for User in the above conversation:
+        - First line must read 'safe' or 'unsafe'.
+        - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]"""
+        return full_prompt
+    def get_aegis_block_message(self, moderation_output: str) -> str:
+        """Extract the blocked category and reason from the Aegis model output."""
+        block_msg = "Prompt blocked by Aegis."
+        try:
+            # Extract blocked category and reason
+            if len(moderation_output.split("\n")) >= 2:
+                moderation_output_list = moderation_output.split("\n")
+                blocked_category = int(moderation_output_list[1].lstrip("O")) - 1
+                if 0 <= blocked_category < len(UNSAFE_CATEGORIES):
+                    blocked_reason = UNSAFE_CATEGORIES[blocked_category]["blocked_reason"]
+                    blocked_category_name = UNSAFE_CATEGORIES[blocked_category]["category"]
+                    block_msg = f"{blocked_category_name}: {blocked_reason}"
+        except Exception as e:
+            log.warning(f"Unable to extract blocked category and reason from Aegis output: {e}")
+        return block_msg
+    def filter_aegis_output(self, prompt: str) -> tuple[bool, str]:
+        """Filter the Aegis model output and return the safety status and message."""
+        full_prompt = self.get_moderation_prompt(prompt)
+        inputs = self.tokenizer([full_prompt], add_special_tokens=False, return_tensors="pt").to(self.device)
+        output = self.model.generate(**inputs, max_new_tokens=100, pad_token_id=self.tokenizer.eos_token_id)
+        prompt_len = inputs["input_ids"].shape[-1]
+        moderation_output = self.tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
+        if "unsafe" in moderation_output.lower():
+            block_msg = self.get_aegis_block_message(moderation_output)
+            return False, block_msg
+        else:
+            return True, ""
+    def is_safe(self, prompt: str) -> tuple[bool, str]:
+        """Check if the input prompt is safe according to the Aegis model."""
+        try:
+            return self.filter_aegis_output(prompt)
+        except Exception as e:
+            log.error(f"Unexpected error occurred when running Aegis guardrail: {e}")
+            return True, "Unexpected error occurred when running Aegis guardrail."
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--prompt", type=str, required=True, help="Input prompt")
+    parser.add_argument(
+        "--checkpoint_dir",
+        type=str,
+        help="Path to the Aegis checkpoint folder",
+        default=DEFAULT_CHECKPOINT_DIR,
+    )
+    return parser.parse_args()
+def main(args):
+    aegis = Aegis(checkpoint_dir=args.checkpoint_dir)
+    runner = GuardrailRunner(safety_models=[aegis])
+    with timer("aegis safety check"):
+        safety, message = runner.run_safety_check(args.prompt)
+    log.info(f"Input is: {'SAFE' if safety else 'UNSAFE'}")
+    log.info(f"Message: {message}") if not safety else None
+# Script entry point: parse the CLI arguments and run the safety check on them.
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)

ar_config_tokenizer.py ADDED Viewed

	@@ -0,0 +1,137 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional
+import attrs
+from .discrete_video import DiscreteVideoFSQStateDictTokenizer
+from .ar_networks import CausalDiscreteVideoTokenizer
+from .lazy_config_init import LazyCall as L
+from .lazy_config_init import LazyDict
+def create_discrete_video_fsq_tokenizer_state_dict_config(
+    ckpt_path, pixel_chunk_duration=33, compression_ratio=[8, 16, 16]
+) -> LazyDict:
+    CausalDiscreteFactorizedVideoTokenizerConfig: LazyDict = L(CausalDiscreteVideoTokenizer)(
+        # The new causal discrete tokenizer, that is at least 2x more efficient in memory and runtime.
+        # - It relies on fully 3D discrete wavelet transform
+        # - Uses a layer norm instead of a group norm
+        # - Factorizes full convolutions into spatial and temporal convolutions
+        # - Factorizes full attention into spatial and temporal attention
+        # - Strictly causal, with flexible temporal length at inference.
+        attn_resolutions=[32],
+        channels=128,
+        channels_mult=[2, 4, 4],
+        dropout=0.0,
+        in_channels=3,
+        num_res_blocks=2,
+        out_channels=3,
+        resolution=1024,
+        patch_size=4,
+        patch_method="haar",
+        z_channels=16,
+        z_factor=1,
+        num_groups=1,
+        legacy_mode=False,
+        spatial_compression=16,
+        temporal_compression=8,
+        embedding_dim=6,
+        levels=[8, 8, 8, 5, 5, 5],
+        name="CausalDiscreteFactorizedVideoTokenizer",
+    )
+    return L(DiscreteVideoFSQStateDictTokenizer)(
+        enc_fp=ckpt_path.replace("ema.jit", "encoder.jit"),
+        dec_fp=ckpt_path.replace("ema.jit", "decoder.jit"),
+        tokenizer_module=CausalDiscreteFactorizedVideoTokenizerConfig,
+        name="discrete_video_fsq",
+        latent_ch=6,
+        is_bf16=True,
+        pixel_chunk_duration=pixel_chunk_duration,
+        latent_chunk_duration=1 + (pixel_chunk_duration - 1) // compression_ratio[0],
+        max_enc_batch_size=8,
+        max_dec_batch_size=4,
+        levels=[8, 8, 8, 5, 5, 5],
+        compression_ratio=compression_ratio,
+    )
+@attrs.define(slots=False)
+class TextTokenizerConfig:
+    """
+    Text tokenizer config
+    Args:
+        config (LazyDict): Config that defines the text tokenizer class. Required (no default).
+        data_key (str): The input key from data_dict that will be passed to the text tokenizer. Defaults to "".
+        tokenize_here (bool): Whether to use the tokenizer to perform online tokenization. Defaults to False.
+        tokenizer_offset (int): Offset that is added to the tokens. Defaults to 0.
+        vocab_size (int): Vocabulary size of the tokenizer. Defaults to 0.
+    """
+    config: LazyDict
+    data_key: str = ""
+    tokenize_here: bool = False
+    tokenizer_offset: int = 0
+    vocab_size: int = 0
+@attrs.define(slots=False)
+class VideoTokenizerConfig:
+    """
+    Video tokenizer config
+    Args:
+        config (LazyDict): Config that defines the video tokenizer class. Required (no default).
+        data_key (str): The input key from data_dict that will be passed to the video tokenizer. Defaults to "".
+        tokenize_here (bool): Whether to use the tokenizer to perform online tokenization. Defaults to True.
+        tokenizer_offset (int): Offset that is added to the tokens. In case of joint text-video tokenizers, we
+            add an offset to make sure that video tokens and text tokens don't overlap. Defaults to 0.
+        vocab_size (int): Vocabulary size of the tokenizer. Defaults to 0.
+        max_seq_len (int): Maximum token length for an input video. Defaults to -1.
+    """
+    config: LazyDict
+    data_key: str = ""
+    tokenize_here: bool = True
+    tokenizer_offset: int = 0
+    vocab_size: int = 0
+    max_seq_len: int = -1
+@attrs.define(slots=False)
+class TokenizerConfig:
+    """
+    Joint tokenizer config
+    Args:
+        text_tokenizer (TextTokenizerConfig): Text tokenizer config file
+        class_tokenizer (ClassTokenizerConfig): Class tokenizer config file
+        video_tokenizer (VideoTokenizerConfig): Video tokenizer config file
+        image_tokenizer (ImageTokenizerConfig): Image tokenizer config file
+        seq_len (int): Final token sequence length
+        training_type (str): Type of training we use. Supports ["text_only", "text_to_video", "class_to_image", "image_text_interleaved"]
+        add_special_tokens (bool): Whether to add special tokens to the output tokens
+        pad_to_multiple_of (int): Pad the token sequence length to the nearest multiple of this number. Defaults to 64.
+    """
+    text_tokenizer: Optional[TextTokenizerConfig] = None
+    video_tokenizer: Optional[VideoTokenizerConfig] = None
+    seq_len: int = 4096
+    training_type: str = None
+    add_special_tokens: bool = True
+    pad_to_multiple_of: Optional[int] = 64

ar_configs_base_model.py ADDED Viewed

	@@ -0,0 +1,118 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional
+import attrs
+from .ar_config_tokenizer import TokenizerConfig
+@attrs.define
+class ModelConfig:
+    """
+    A class to hold model configuration arguments.
+    Args:
+        dim (int): The dimensionality of the input and output of each transformer block.
+        n_layers (int): Number of layers in the transformer.
+        n_heads (int): Number of attention heads.
+        n_kv_heads (Optional[int]): Number of key-value heads. If None, defaults to n_heads. Note: this is equivalent to
+            `num_gqa_groups` in TransformerEngine, where GQA means Grouped Query Attention.
+        head_dim (Optional[int]): Dimensionality of each head. If None, defaults to dim // n_heads.
+        vocab_size (int): Vocabulary size.
+        ffn_hidden_size (int): Hidden size for feedforward network.
+        norm_eps (float): Epsilon value for normalization.
+        rope_theta (float): Theta value for rotary positional embeddings.
+        apply_abs_pos_emb (bool): Whether to apply absolute position embeddings.
+        max_batch_size (int): Maximum batch size for inference.
+        max_seq_len (int): Maximum sequence length for input text.
+        fuse_qkv (bool): Whether to fuse QKV in attention. Defaults to True.
+        causal_mask (bool): Whether to use causal mask. Defaults to True.
+        norm_type (str): Type of normalization layer. Choices: "rmsnorm", "fused_rmsnorm", "layernorm", "np_layernorm".
+        precision (str): Data type for the model.
+        use_qk_normalization (bool): Whether to enable QK normalization.
+        ckpt_dir (str): Checkpoint directory.
+        ckpt_path (str): Checkpoint path.
+        apply_yarn (Optional[bool]): Whether to apply YaRN (long-context extension).
+        yarn_scale (Optional[float]): Scale factor for YaRN.
+        yarn_beta_fast (Optional[int]): Beta fast variable for YaRN (i.e., low_freq_factor in Llama 3.1 RoPE scaling code)
+        yarn_beta_slow (Optional[int]): Beta slow variable for YaRN (i.e., high_freq_factor in Llama 3.1 RoPE scaling code)
+        original_seq_len (Optional[int]): Original sequence length.
+        vision_encoder (Optional[str]): Vision encoder name.
+        mm_projector (Optional[str]): Multi-modal projector name.
+        vision_encoder_in_channels (Optional[int]): Number of channels in the input image for the vision encoder. Default is 3, you can specify to int larger than 3. E.g. if you have 4-channel images with the last channel as the alpha channel, set this to 4.
+        rope_dim (Optional[str]): Dimensionality of the RoPE. Choices: "1D", "3D".
+        pytorch_rope_version (Optional[str]): Version of the PyTorch RoPE implementation. Choices: "v1", "v2".
+        original_latent_shape (Optional[list]): Original shape of the latent tensor needed for rope extension.
+        pad_to_multiple_of (Optional[int]): Pad the position embedding to a multiple of this value.
+        vision_encoder_in_channels (Optional[int]): Number of channels in the input image for the vision encoder. Default is 3.
+        insert_cross_attn (bool): Whether to insert the cross-attention layers after each multi-head self-attention (MSA) layer.
+        insert_cross_attn_every_k_layers (int): Insert cross-attention layers every k TransformerLayers.
+        context_dim (Optional[int]): The dimensionality of cross-attention embedding, e.g., T5 embed feature dim.
+        num_video_frames (Optional[int]): Number of video frames.
+        video_height (Optional[int]): Raw video pixel height dimension.
+        video_width (Optional[int]): Raw video pixel width dimension.
+        video_latent_shape (Optional[list]): Video tokenizer output dimension, in (T,H,W).
+    """
+    dim: int = attrs.field(default=4096)
+    n_layers: int = attrs.field(default=32)
+    n_heads: int = attrs.field(default=32)
+    n_kv_heads: Optional[int] = attrs.field(default=8)
+    head_dim: Optional[int] = attrs.field(default=None)
+    vocab_size: int = attrs.field(default=128256)
+    ffn_hidden_size: int = attrs.field(default=14336)
+    norm_eps: float = attrs.field(default=1e-5)
+    rope_theta: float = attrs.field(default=500000)
+    apply_abs_pos_emb: bool = attrs.field(default=False)
+    max_batch_size: int = attrs.field(default=1)
+    max_seq_len: int = attrs.field(default=8192)
+    fuse_qkv: bool = attrs.field(default=False)
+    causal_mask: bool = attrs.field(default=True)
+    norm_type: str = attrs.field(default="rmsnorm")
+    precision: str = attrs.field(default="bfloat16")
+    use_qk_normalization: bool = False
+    tokenizer: Optional[TokenizerConfig] = None
+    ckpt_dir: Optional[str] = attrs.field(default=None)
+    ckpt_path: Optional[str] = attrs.field(
+        default=None
+    )  # If not None, load the model from this path instead of ckpt_dir
+    apply_yarn: Optional[bool] = attrs.field(default=False)
+    yarn_scale: Optional[float] = attrs.field(default=None)
+    yarn_beta_fast: Optional[int] = attrs.field(default=None)
+    yarn_beta_slow: Optional[int] = attrs.field(default=None)
+    original_seq_len: Optional[int] = attrs.field(default=None)
+    vision_encoder: Optional[str] = attrs.field(default=None)
+    vision_encoder_in_channels: Optional[int] = attrs.field(default=3)
+    mm_projector: Optional[str] = attrs.field(default=None)
+    rope_dim: Optional[str] = attrs.field(default="1D")
+    pytorch_rope_version: Optional[str] = attrs.field(default="v2")
+    original_latent_shape: Optional[list] = None
+    pad_to_multiple_of: Optional[int] = None
+    vision_encoder_in_channels: Optional[int] = attrs.field(default=3)
+    insert_cross_attn: bool = False
+    insert_cross_attn_every_k_layers: int = 1
+    context_dim: Optional[int] = attrs.field(default=1024)
+    # For video training
+    num_video_frames: Optional[int] = None
+    # Raw video pixel dimension
+    video_height: Optional[int] = None
+    video_width: Optional[int] = None
+    # Video tokenizer output dimension, in (T,H,W), it's computed by num_video_frames/temporal_compress_factor, video_height/spatial_compression_fact, video_width/spatial_compression_fact
+    video_latent_shape: Optional[list] = None
+    def __getitem__(self, item):
+        return getattr(self, item)

ar_model.py ADDED Viewed

	@@ -0,0 +1,596 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import os
+import time
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set
+from .log import log
+import torch
+from safetensors.torch import load_file
+from torch.nn.modules.module import _IncompatibleKeys
+from .ar_configs_base_model import ModelConfig
+from .ar_config_tokenizer import TokenizerConfig
+from .mm_projector import MultimodalProjector
+from .ar_transformer import Transformer
+from .vit import VisionTransformer, get_vit_config
+from .ar_tokenizer import DiscreteMultimodalTokenizer, update_vocab_size
+from .checkpoint import (
+    get_partial_state_dict,
+    process_state_dict,
+    substrings_to_ignore,
+)
+from .sampling import decode_n_tokens, decode_one_token, prefill
+from .misc import misc, Color, timer
+class AutoRegressiveModel(torch.nn.Module):
+    """
+    A class to build and use a AutoRegressiveModel model for text generation.
+    Methods:
+        build: Build a AutoRegressiveModel instance by initializing and loading a model checkpoint.
+        generate: Generate text sequences based on provided prompts using the language generation model.
+    """
+    def __init__(
+        self,
+        model: Transformer = None,
+        tokenizer: DiscreteMultimodalTokenizer = None,
+        config: ModelConfig = None,
+        vision_encoder: VisionTransformer = None,
+        mm_projector: MultimodalProjector = None,
+    ):
+        """
+        Initialize the AutoRegressiveModel instance by storing the given components on the instance.
+        Every argument is optional and defaults to None.
+        Args:
+            model (Transformer): The Transformer model for text generation.
+            tokenizer (DiscreteMultimodalTokenizer): The tokenizer for encoding and decoding.
+            config (ModelConfig): The configuration for the AutoRegressiveModel model.
+            vision_encoder (VisionTransformer): The vision encoder for the AutoRegressiveModel model.
+            mm_projector (MultimodalProjector): The multi-modal projector for the AutoRegressiveModel model.
+        """
+        super().__init__()
+        self.model = model
+        self.tokenizer = tokenizer
+        self.config = config
+        self.vision_encoder = vision_encoder
+        self.mm_projector = mm_projector
+    @property
+    def precision(self):
+        return self.model.precision
+    def get_num_params(
+        self,
+    ) -> int:
+        """
+        Return the number of parameters in the model.
+        """
+        n_params = sum(p.numel() for p in self.parameters())
+        return n_params
+    def load_ar_model(
+        self,
+        tokenizer_config,
+    ):
+        """
+        Load the AR model.
+        """
+        model_config = self.config
+        ckpt_path = model_config.ckpt_path
+        with timer(f"loading checkpoint from {ckpt_path}"):
+            if ckpt_path.endswith("safetensors"):
+                # Load with safetensors API
+                checkpoint = load_file(ckpt_path, device="cpu")
+            else:
+                # The pytorch version
+                checkpoint = torch.load(
+                    ckpt_path,
+                    map_location="cpu",
+                    mmap=True,  # load the checkpoint in memory-mapped mode
+                    weights_only=True,
+                )
+        llm_checkpoint = checkpoint["model"] if "model" in checkpoint else checkpoint
+        orig_precision = torch.get_default_dtype()
+        precision = getattr(torch, model_config.precision)
+        torch.set_default_dtype(precision)
+        log.debug(f"Setting torch default dtype to {precision}")
+        model = Transformer(
+            params=model_config,
+            tokenizer_config=tokenizer_config,
+        )
+        log.debug(
+            f"tokenizer tokenizer_config.video_tokenizer.vocab_size {tokenizer_config.video_tokenizer.vocab_size}"
+        )
+        vocab_size = update_vocab_size(
+            existing_vocab_size=0,
+            to_be_added_vocab_size=tokenizer_config.video_tokenizer.vocab_size,
+            training_type=tokenizer_config.training_type,
+            add_special_tokens=False,
+        )
+        log.debug(
+            f"tokenizer tokenizer_config.video_tokenizer.vocab_size {tokenizer_config.video_tokenizer.vocab_size}  vocab_size {vocab_size}"
+        )
+        # Perform vocab expansion
+        if vocab_size > model.vocab_size:
+            log.debug(f"Expanding vocab size to {vocab_size}")
+            # For text-to-video training, we only expand the embedding layer but not the output (unembedding) layer,
+            expand_output_layer = not (tokenizer_config.training_type == "text_to_video")
+            model.expand_vocab(
+                vocab_size,
+                init_method="gaussian",
+                expand_output_layer=expand_output_layer,
+            )
+        # Remove the "model." prefix in the state_dict
+        llm_checkpoint = process_state_dict(llm_checkpoint, prefix_to_remove="model.")
+        with timer("loading state_dict into model"):
+            missing_keys, _ = model.load_state_dict(llm_checkpoint, strict=True)
+        # Remove keys with "_extra_state" suffix in missing_keys (defined by TransformerEngine for FP8 usage)
+        missing_keys = [k for k in missing_keys if not k.endswith("_extra_state")]
+        assert len(missing_keys) == 0, f"Missing keys: {missing_keys}"
+        self.model = model.to(precision).to("cuda")
+        torch.set_default_dtype(orig_precision)  # Reset the default dtype to the original value
+    def load_tokenizer(self, tokenizer_config):
+        """
+        Load the tokenizer.
+        """
+        self.tokenizer = DiscreteMultimodalTokenizer(tokenizer_config)
+    @staticmethod
+    def build(
+        model_config: ModelConfig = ModelConfig(),
+        tokenizer_config: TokenizerConfig = None,
+    ) -> "AutoRegressiveModel":
+        """
+        Build a AutoRegressiveModel instance by initializing and loading a model checkpoint.
+        Args:
+            model_config (ModelConfig, optional): The model configuration for the AutoRegressiveModel instance. Defaults to ModelConfig().
+            tokenizer_config (TokenizerConfig, optional): The tokenizer configuration for the AutoRegressiveModel instance. Defaults to None.
+            download_rank_sync (bool, optional): Whether to download the checkpoint in a rank-synchronized manner. Defaults to True.
+        Returns:
+            AutoRegressiveModel: An instance of the AutoRegressiveModel class with the loaded model and tokenizer.
+        Raises:
+            AssertionError: If there are no checkpoint files in the specified directory.
+        Note:
+            This method sets the device to CUDA and loads the pre-trained model and tokenizer.
+        """
+        # Initialize model configuration parameters
+        config_params = {}
+        # Load checkpoint and model parameters
+        if model_config.ckpt_path is None:
+            # If ckpt_path is not provided, we assume the model checkpoint is saved in the ckpt_dir
+            ckpt_dir = model_config.ckpt_dir
+            # We prioritize safetensors version over the pytorch version, since the former is
+            # much faster for checkpoint loading.
+            checkpoints = sorted(Path(ckpt_dir).glob("*.safetensors"))
+            if len(checkpoints) == 0:
+                checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
+            assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
+            assert (
+                len(checkpoints) == 1
+            ), f"multiple checkpoint files found in {ckpt_dir} (currently only one is supported)"
+            ckpt_path = str(checkpoints[0])  # Assuming single checkpoint for non-parallel case
+            if os.path.exists(Path(ckpt_dir) / "config.json"):
+                with open(Path(ckpt_dir) / "config.json", "r") as f:
+                    config_params = json.loads(f.read())
+            else:
+                log.info(
+                    f"No params.json found in the checkpoint directory ({ckpt_dir}). " f"Using default model config."
+                )
+        else:
+            # If ckpt_path is provided, we load the model from the specified path,
+            # and use the default model configuration
+            ckpt_path = model_config.ckpt_path
+        for key, value in config_params.items():
+            if hasattr(model_config, key):
+                # Override the default model configuration with the parameters from the checkpoint
+                setattr(model_config, key, value)
+        with timer(f"loading checkpoint from {ckpt_path}"):
+            if ckpt_path.endswith("safetensors"):
+                # Load with safetensors API
+                checkpoint = load_file(ckpt_path, device="cpu")
+            else:
+                # The pytorch version
+                checkpoint = torch.load(
+                    ckpt_path,
+                    map_location="cpu",
+                    mmap=True,  # load the checkpoint in memory-mapped mode
+                    weights_only=True,
+                )
+        llm_checkpoint = checkpoint["model"] if "model" in checkpoint else checkpoint
+        if model_config.vision_encoder is not None:
+            # Take the LLM weights (starting with "model.") from the VLM checkpoint
+            llm_checkpoint = get_partial_state_dict(llm_checkpoint, prefix="model.")
+        if model_config.vision_encoder is not None:
+            # For vanilla VLM ckpt before fine-tuning, `checkpoint['model']` only contains LLM weights, and `checkpoint['vision_encoder']`
+            #   and `checkpoint['mm_projector']` are both for those weights
+            # For fine-tuned VLM ckpt, `checkpoint['model']` contains all LLM, mm_projector and vision_encoder weights
+            if "vision_encoder" in checkpoint:
+                log.debug("Using pretrained vision_encoder")
+                vit_checkpoint = checkpoint["vision_encoder"]
+            else:
+                log.debug("Using fine-tuned vision_encoder")
+                vit_checkpoint = get_partial_state_dict(llm_checkpoint, prefix="vision_encoder.")
+                vit_checkpoint = process_state_dict(vit_checkpoint, prefix_to_remove="vision_encoder.")
+            if "mm_projector" in checkpoint:
+                log.debug("Using pretrained mm_projector")
+                projector_checkpoint = checkpoint["mm_projector"]
+            else:
+                log.debug("Using fine-tuned mm_projector")
+                projector_checkpoint = get_partial_state_dict(llm_checkpoint, prefix="mm_projector.")
+                projector_checkpoint = process_state_dict(projector_checkpoint, prefix_to_remove="mm_projector.")
+            assert (
+                len(vit_checkpoint) > 0 and len(projector_checkpoint) > 0
+            ), "vit_checkpoint and projector_checkpoint cannot be empty. We do not support random initialization for vision_encoder and mm_projector."
+        tokenizer = DiscreteMultimodalTokenizer(tokenizer_config)
+        orig_precision = torch.get_default_dtype()
+        precision = getattr(torch, model_config.precision)
+        torch.set_default_dtype(precision)
+        log.debug(f"Setting torch default dtype to {precision}")
+        model = Transformer(
+            params=model_config,
+            tokenizer_config=tokenizer_config,
+        )
+        model_kwargs = {}
+        if model_config.vision_encoder is not None:
+            assert model_config.mm_projector is not None, "mm_projector must be provided if vision_encoder is provided."
+            vit_config = get_vit_config(model_config.vision_encoder)
+            vision_encoder = VisionTransformer.build(
+                vit_config,
+            )
+            mm_projector = MultimodalProjector(
+                mm_projector_type=model_config.mm_projector, in_dim=vit_config["dim"], out_dim=model_config["dim"]
+            )
+            model_kwargs.update({"vision_encoder": vision_encoder, "mm_projector": mm_projector})
+        # Perform vocab expansion
+        if tokenizer.vocab_size > model.vocab_size:
+            log.debug(f"Expanding vocab size to {tokenizer.vocab_size}")
+            # For text-to-video training, we only expand the embedding layer but not the output (unembedding) layer,
+            expand_output_layer = not (tokenizer.training_type == "text_to_video")
+            model.expand_vocab(
+                tokenizer.vocab_size,
+                init_method="gaussian",
+                expand_output_layer=expand_output_layer,
+            )
+        # Remove the "model." prefix in the state_dict
+        llm_checkpoint = process_state_dict(llm_checkpoint, prefix_to_remove="model.")
+        with timer("loading state_dict into model"):
+            missing_keys, unexpected_keys = model.load_state_dict(llm_checkpoint, strict=True)
+        # Remove keys with "_extra_state" suffix in missing_keys (defined by TransformerEngine for FP8 usage)
+        missing_keys = [k for k in missing_keys if not k.endswith("_extra_state")]
+        assert len(missing_keys) == 0, f"Missing keys: {missing_keys}"
+        if model_config.vision_encoder is not None:
+            vision_encoder.load_state_dict(vit_checkpoint)
+            mm_projector.load_state_dict(projector_checkpoint)
+            if model_config.vision_encoder_in_channels != 3:
+                vision_encoder.expand_in_channels(model_config.vision_encoder_in_channels)
+        model = model.to(precision)  # ensure model parameters are in the correct precision
+        log.debug(f"Model config: {model_config}")
+        model_class = AutoRegressiveModel
+        torch.set_default_dtype(orig_precision)  # Reset the default dtype to the original value
+        return model_class(model, tokenizer, model_config, **model_kwargs)
+    @torch.no_grad()
+    def generate(
+        self,
+        prompt_tokens: List[List[int]] | torch.Tensor,
+        max_gen_len: int,
+        temperature: float = 1.0,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        num_gen_seq: int = 1,
+        logprobs: bool = False,
+        echo: bool = False,
+        seed: int = None,
+        context: Optional[torch.Tensor] = None,
+        context_mask: Optional[torch.Tensor] = None,
+        compile_sampling: bool = True,
+        compile_prefill: bool = False,
+        verbose: bool = True,
+        stop_tokens: Optional[Set[int]] = None,
+        images: Optional[torch.Tensor] = None,
+    ):
+        """
+        Autoregressive generation built upon the gpt-fast implementation (https://github.com/pytorch-labs/gpt-fast).
+        Args:
+            prompt_tokens (List[List[int]] | torch.Tensor): A single prompt of shape (1, seq_len).
+            max_gen_len (int): Maximum length of the generated text sequence.
+            temperature (float, optional): Temperature value for controlling randomness in sampling. Defaults to 0.6.
+            top_k (int, optional): Top-k value for top-k sampling. Defaults to None.
+            top_p (float, optional): Top-p probability threshold for nucleus sampling. Defaults to None.
+            num_gen_seq (int, optional): Number of outputs to generate given the same prompt. Defaults to 1. When temperature == 0, num_gen_seq must be 1 because the generation is deterministic.
+            echo (bool, optional): Flag indicating whether to include prompt tokens in the generated output. Defaults to False.
+            logit_clipping_range (list, optional): Range of logits to clip. Defaults to [].
+            seed (int, optional): Random seed for reproducibility. Defaults to None.
+            compile_sampling (bool, optional): Flag indicating whether to compile the decoding function. Defaults to True.
+            compile_prefill (bool, optional): Flag indicating whether to compile the prefill function. Defaults to False.
+            verbose (bool, optional): Flag indicating whether to print the the time. Defaults to False.
+        """
+        assert top_k is None or top_p is None, f"Only one of top_k ({top_k} or top_p ({top_p} should be specified."
+        if temperature == 0:
+            top_p, top_k = None, None
+            log.debug("Setting top_p and top_k to None because temperature is 0")
+        if top_p is not None:
+            log.debug(f"Using top-p sampling with p={top_p} and temperature={temperature}")
+        elif top_k is not None:
+            log.debug(f"Using top-k sampling with k={top_k} and temperature={temperature}")
+        else:
+            log.debug("Not applying top-k or top-p sampling. Will use top-k sampling with k=None")
+        orig_precision = torch.get_default_dtype()
+        torch.set_default_dtype(self.precision)
+        torch._inductor.config.coordinate_descent_tuning = True
+        torch._inductor.config.triton.unique_kernel_names = True
+        # Experimental features to reduce compilation times, will be on by default in future
+        torch._inductor.config.fx_graph_cache = True
+        if seed is not None:
+            misc.set_random_seed(seed)
+        assert not logprobs, "logprobs are not supported for fast_generate yet"
+        # Examine if the function prefil and decode_one_token functions are compiled yet. If not, compile them based on the flags
+        if compile_sampling and not getattr(self, "inference_decode_compiled", False):
+            self.decode_one_token = torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True)
+            self.inference_decode_compiled = True
+            log.info("Compiled AR sampling function. Note: the first run will be slower due to compilation")
+        if compile_prefill and not getattr(self, "inference_prefill_compiled", False):
+            self.prefill = torch.compile(prefill, fullgraph=True, dynamic=True)
+            self.inference_prefill_compiled = True
+            log.info("Compiled prefill function. Note: the first run will be slower due to compilation")
+        if not hasattr(self, "decode_one_token"):
+            self.decode_one_token = decode_one_token
+        if not hasattr(self, "prefill"):
+            self.prefill = prefill
+        # Initialization and Assertions
+        if isinstance(self.model.params, list):
+            # During training, model.params is a list
+            log.debug(
+                f"Find self.model.params is a list, use self.config instead. Get max_batch_size={self.config.max_batch_size}, max_seq_len={self.config.max_seq_len}"
+            )
+            params = self.config
+        else:
+            params = self.model.params
+        if isinstance(prompt_tokens, list):
+            prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device="cuda")
+        if prompt_tokens.ndim == 1:
+            prompt_tokens = prompt_tokens.view(1, -1)
+        else:
+            assert prompt_tokens.ndim == 2, f"prompt_tokens has shape {prompt_tokens.shape}"
+        batch_size, prompt_len = prompt_tokens.shape
+        total_len = min(params.max_seq_len, max_gen_len + prompt_len)
+        if max_gen_len + prompt_len > params.max_seq_len:
+            log.warning(
+                f"max_gen_len + prompt_len={max_gen_len + prompt_len} exceeds max_seq_len={params.max_seq_len}, truncate max_gen_len to {params.max_seq_len - prompt_len}"
+            )
+            max_gen_len = params.max_seq_len - prompt_len
+        if context_mask is not None:
+            context_mask = context_mask.to(dtype=torch.bool)
+            if context_mask.ndim == 2:
+                assert (
+                    context_mask.shape[0] == batch_size
+                ), f"batch_size mismatch: {context_mask.shape[0]} != {batch_size}"
+                # Unsqueeze it to make it of shape [batch_size, 1, 1, context_seq_len]
+                context_mask = context_mask.view(batch_size, 1, 1, -1)
+        if num_gen_seq > 1:
+            assert (
+                batch_size == 1
+            ), f"num_gen_seq > 1 is only supported for a single prompt, got {len(prompt_tokens)} prompts"
+            log.debug(f"Generating {num_gen_seq} sequences with the same prompt")
+            assert (
+                num_gen_seq <= params.max_batch_size
+            ), f"num_gen_seq={num_gen_seq} exceeds max_batch_size={params.max_batch_size}"
+            # repeat the prompt tokens for num_gen_seq times
+            prompt_tokens = prompt_tokens.repeat(num_gen_seq, 1)
+            assert prompt_tokens.shape == (
+                num_gen_seq,
+                prompt_len,
+            ), f"prompt_tokens must be of shape (num_gen_seq, seq_len), got {prompt_tokens.shape}"
+            batch_size = len(prompt_tokens)
+        # create an empty tensor of the expected final shape and fill in the current tokens
+        empty = torch.empty(batch_size, total_len, dtype=prompt_tokens.dtype, device=prompt_tokens.device)
+        empty[:, :prompt_len] = prompt_tokens
+        seq = empty
+        input_pos = torch.arange(0, prompt_len, device="cuda")
+        if verbose:
+            prefill_start = time.time()
+        if images is not None:
+            images = images.to(device=prompt_tokens.device, dtype=torch.bfloat16)
+            prompt_token_embeddings = self.embed_vision_language_features(prompt_tokens, images)
+        else:
+            prompt_token_embeddings = None
+        if context is not None:
+            context = context.to(device=prompt_tokens.device, dtype=self.precision)
+        # Prefill stage
+        next_token = self.prefill(
+            self.model,
+            input_pos=input_pos,
+            tokens=prompt_tokens if prompt_token_embeddings is None else None,
+            token_embeddings=prompt_token_embeddings,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            context=context,
+            context_mask=context_mask,
+        )
+        if verbose:
+            prefill_time = time.time() - prefill_start
+        seq[:, [prompt_len]] = next_token.to(dtype=seq.dtype)
+        input_pos = torch.tensor([prompt_len], dtype=torch.long, device="cuda")
+        stop_tokens = self.tokenizer.stop_tokens if stop_tokens is None else stop_tokens
+        stop_tokens = torch.tensor(list(stop_tokens), dtype=torch.long, device="cuda")
+        if verbose:
+            decode_start = time.time()
+        # Decode stage
+        generated_tokens = decode_n_tokens(
+            self.model,
+            next_token.view(batch_size, -1),
+            input_pos,
+            max_gen_len - 1,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            stop_tokens=stop_tokens,
+            decode_one_token_function=self.decode_one_token,
+            context=context,
+            context_mask=context_mask,
+        )
+        gen_len = len(generated_tokens)
+        if verbose:
+            decode_time = time.time() - decode_start
+            prefill_throughput = prompt_len / prefill_time
+            decode_throughput = gen_len / decode_time
+            log.debug(f"[Prefill] Time: {prefill_time:.2f}s; Throughput: {prefill_throughput:.2f} tokens/s")
+            log.debug(f"[Decode] Time: {decode_time:.2f}s; Throughput: {decode_throughput:.2f} tokens/s")
+        generated_tokens = torch.cat(generated_tokens, dim=1)
+        log.debug(f"generated_tokens: {generated_tokens.shape}")
+        seq = seq[:, : prompt_len + 1 + gen_len]
+        seq[:, prompt_len + 1 :] = generated_tokens
+        if not echo:
+            seq = seq[:, prompt_len:]
+        torch.set_default_dtype(orig_precision)  # Reset the default dtype to the original value
+        return seq, None
+    def embed_vision_language_features(self, input_ids: torch.Tensor, images: torch.Tensor) -> torch.Tensor:
+        """
+        Embed vision and language features into a combined representation.
+        Text tokens are embedded with `self.model.tok_embeddings`; images are encoded by
+        `self.vision_encoder` and projected by `self.mm_projector`. The projected patch features are
+        then written into the positions of `input_ids` that hold the image placeholder token.
+        Args:
+            input_ids (torch.Tensor): Input token IDs of shape (B, seq_len).
+            images (torch.Tensor): Input images; moved to the device/dtype of the text embeddings
+                before being passed to the vision encoder.
+        Returns:
+            torch.Tensor: Combined vision-language features of shape (B, seq_len, D).
+        Raises:
+            AssertionError: If vision encoder or mm projector is not initialized,
+                            or if dimensions mismatch.
+        """
+        # Ensure vision encoder and mm projector are initialized
+        assert self.vision_encoder is not None
+        assert self.mm_projector is not None
+        # Get image token ID and validate it
+        image_token_id = self.vision_encoder.image_token_id
+        assert isinstance(image_token_id, int) and image_token_id >= 0, f"Invalid image_token_id: {image_token_id}"
+        # Every position is either an image placeholder or a text token
+        image_locations = input_ids == image_token_id
+        text_locations = ~image_locations
+        # Process text features (boolean indexing flattens them to (N_txt, D))
+        text_features = self.model.tok_embeddings(input_ids[text_locations])
+        # Process image features
+        images = images.to(device=text_features.device, dtype=text_features.dtype)
+        vit_outputs = self.vision_encoder(images)
+        image_features = self.mm_projector(vit_outputs)
+        # Get dimensions
+        B, seq_len = input_ids.shape
+        N_total = B * seq_len
+        N_txt, D_txt = text_features.shape
+        N_img, N_patch, D_img = image_features.shape
+        # Flatten the patches of all images into one row per placeholder token
+        image_features = image_features.reshape(N_img * N_patch, D_img)
+        # Validate dimensions
+        assert D_txt == D_img, f"Text features dim {D_txt} should be equal to image features dim {D_img}"
+        # The checked quantity is the token count over the whole batch, so report that (not seq_len alone)
+        assert N_total == N_txt + N_img * N_patch, (
+            f"Total token count {N_total} (batch {B} x seq_len {seq_len}) should be equal to "
+            f"N_txt + N_img*N_patch = {N_txt} + {N_img * N_patch}; "
+            f"found {image_locations.sum().item()} image placeholder tokens"
+        )
+        # Combine text and image features
+        combined_features = torch.empty(
+            (B, seq_len, D_txt),
+            dtype=text_features.dtype,
+            device=text_features.device,
+        )
+        combined_features[text_locations, :] = text_features
+        combined_features[image_locations, :] = image_features
+        return combined_features
+    def state_dict(self, *args, **kwargs):
+        """
+        Return the module state dict after passing it through `process_state_dict`
+        (e.g., to remove "_extra_state" keys imposed by TransformerEngine for FP8).
+        """
+        raw_state = super().state_dict(*args, **kwargs)
+        cleaned_state = process_state_dict(raw_state)
+        return cleaned_state
+    def load_state_dict(self, state_dict: Dict[str, Any], strict: bool = True, assign: bool = False):
+        """
+        Load a state dict while tolerating missing keys that contain any of `substrings_to_ignore`
+        (e.g., "_extra_state" keys imposed by TransformerEngine for FP8).
+        The underlying load always runs non-strict; when `strict` is True the remaining missing keys
+        and any unexpected keys are reported by raising ValueError.
+        """
+        cleaned = process_state_dict(state_dict)
+        missing_keys, unexpected_keys = super().load_state_dict(cleaned, strict=False, assign=assign)
+        # Keep only the missing keys that are not covered by an ignorable substring
+        actual_missing_keys = [
+            key for key in missing_keys if not any(substring in key for substring in substrings_to_ignore)
+        ]
+        if strict and (len(actual_missing_keys) > 0 or len(unexpected_keys) > 0):
+            raise ValueError(f"Missing keys: {actual_missing_keys}\n\nUnexpected keys: {unexpected_keys}")
+        return _IncompatibleKeys(actual_missing_keys, unexpected_keys)

ar_modules_attention.py ADDED Viewed

	@@ -0,0 +1,262 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import Optional, Union
+import torch
+from torch import nn
+from .ar_modules_embedding import RotaryPositionEmbedding
+from .ar_modules_normalization import create_norm
+class Attention(nn.Module):
+    """
+    Attention layer with grouped key/value heads and, for self-attention, a KV cache.
+    """
+    def __init__(
+        self,
+        n_heads: int,
+        n_kv_heads: Union[int, None],
+        dim: int,
+        max_batch_size: int,
+        max_seq_len: int,
+        context_dim: Optional[int] = None,
+        use_qk_normalization: bool = False,
+        norm_type: str = "rmsnorm",
+        norm_eps: float = 1e-5,
+        causal_mask: Optional[bool] = True,
+        head_dim: Optional[int] = None,
+        fuse_qkv: bool = False,
+        precision: str = "bfloat16",
+        attn_type: str = "self",
+    ):
+        """
+        Build the projection layers, the optional KV cache and the optional QK norms.
+        Args:
+            n_heads (int): The number of attention (query) heads.
+            n_kv_heads (int, optional): The number of key-value heads. None defaults to n_heads.
+            dim (int): The dimensionality of the input and output.
+            max_batch_size (int): The maximum batch size (first axis of the KV cache).
+            max_seq_len (int): The maximum sequence length (sequence axis of the KV cache).
+            context_dim (int, optional): The dimensionality of the context for cross-attn. None defaults to dim.
+            use_qk_normalization (bool, optional): Whether to normalize queries and keys per head. Defaults to False.
+            norm_type (str, optional): The type of normalization layer passed to `create_norm`. Defaults to "rmsnorm".
+            norm_eps (float, optional): The epsilon value for normalization. Defaults to 1e-5.
+            causal_mask (bool, optional): Stored on the module; `forward` itself always calls attention with
+                `is_causal=False` and relies on the `mask` argument. Defaults to True.
+            head_dim (int, optional): The dimensionality of each attention head. None defaults to dim // n_heads.
+            fuse_qkv (bool, optional): Whether to use one fused QKV projection. Defaults to False.
+            precision (str, optional): Name of the torch dtype the module is cast to. Defaults to "bfloat16".
+            attn_type (str, optional): One of "self", "cross" or "full". Defaults to "self".
+        """
+        super().__init__()
+        assert attn_type in ["self", "cross", "full"], f"Invalid attention type: {attn_type}"
+        # Resolve the optional arguments to their defaults
+        if context_dim is None:
+            context_dim = dim
+        if n_kv_heads is None:
+            n_kv_heads = n_heads
+        if head_dim is None:
+            head_dim = dim // n_heads
+        # Plain configuration attributes
+        self.attn_type = attn_type
+        self.dim = dim
+        self.context_dim = context_dim
+        self.n_kv_heads = n_kv_heads
+        self.n_local_kv_heads = n_kv_heads
+        self.n_local_heads = n_heads
+        self.n_rep = n_heads // n_kv_heads  # how many query heads share one KV head
+        self.head_dim = head_dim
+        self.causal_mask = causal_mask
+        self.fuse_qkv = fuse_qkv
+        self.precision = precision
+        self.max_batch_size = max_batch_size
+        self.max_seq_len = max_seq_len
+        self.use_qk_normalization = use_qk_normalization
+        # Projection layers
+        q_features = n_heads * head_dim
+        kv_features = n_kv_heads * head_dim
+        if fuse_qkv:
+            assert context_dim == dim, f"Fuse QKV requires context_dim ({context_dim}) to be equal to dim ({dim})"
+            self.total_local_head_dim = (n_heads + 2 * n_kv_heads) * head_dim
+            self.wqkv = nn.Linear(dim, self.total_local_head_dim, bias=False)
+            # Lets checkpoints that store separate wq/wk/wv weights load into the fused layer
+            self._register_load_state_dict_pre_hook(self.load_hook)
+        else:
+            self.wq = nn.Linear(dim, q_features, bias=False)
+            self.wk = nn.Linear(context_dim, kv_features, bias=False)
+            self.wv = nn.Linear(context_dim, kv_features, bias=False)
+        self.wo = nn.Linear(q_features, dim, bias=False)
+        # Only self-attention keeps a key/value cache
+        if attn_type == "self":
+            self.init_kv_cache()
+        # QK normalization layers operate on the per-head dimension
+        if use_qk_normalization:
+            self.q_norm = create_norm(norm_type, dim=head_dim, eps=norm_eps)
+            self.k_norm = create_norm(norm_type, dim=head_dim, eps=norm_eps)
+        self.to(dtype=getattr(torch, precision))
+    def load_hook(self, state_dict, prefix, *args):
+        """
+        State-dict pre-hook: if separate `wq`/`wk`/`wv` weights are present under `prefix`, remove them
+        and store their concatenation (in that order) as `wqkv.weight`.
+        """
+        if prefix + "wq.weight" not in state_dict:
+            return
+        separate_weights = [state_dict.pop(prefix + name) for name in ("wq.weight", "wk.weight", "wv.weight")]
+        state_dict[prefix + "wqkv.weight"] = torch.cat(separate_weights)
+    def init_kv_cache(self, dtype=None):
+        """
+        Allocate zero-filled key and value caches on the GPU (self-attention only).
+        Args:
+            dtype: Cache dtype; None uses the torch dtype named by `self.precision`.
+        """
+        cache_dtype = getattr(torch, self.precision) if dtype is None else dtype
+        if self.attn_type != "self":
+            return
+        # (batch, kv_heads, sequence, head_dim)
+        shape = (self.max_batch_size, self.n_local_kv_heads, self.max_seq_len, self.head_dim)
+        self.cache_k = torch.zeros(shape, dtype=cache_dtype).cuda()
+        self.cache_v = torch.zeros(shape, dtype=cache_dtype).cuda()
+    def forward(
+        self,
+        x: torch.Tensor,
+        rope: RotaryPositionEmbedding,
+        input_pos: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        context: Optional[torch.Tensor] = None,
+    ):
+        """
+        Run attention over `x` (and `context`, when given).
+        Args:
+            x: The input tensor of shape (batch_size, seq_len, dim).
+            rope: The rotary positional embedding module; called for "self" and "full" attention.
+            input_pos: Positions of the current tokens; passed to `rope` and used as the sequence-axis
+                index when writing into the KV cache.
+            mask: The attention mask tensor forwarded to the attention kernel.
+            context: Optional tensor the keys/values are projected from; None means `x` itself.
+        Returns:
+            The output tensor of shape (batch_size, seq_len, dim).
+        """
+        bsz, seqlen, _ = x.shape
+        # One module handles both self-attn and cross-attn: without a context, attend over x
+        if context is None:
+            context = x
+        context_len = context.shape[1]
+        if self.fuse_qkv:
+            # A single projection of x is split into the query, key and value slices
+            q_size = self.n_local_heads * self.head_dim
+            kv_size = self.n_local_kv_heads * self.head_dim
+            xq, xk, xv = self.wqkv(x).split([q_size, kv_size, kv_size], dim=-1)
+        else:
+            xq = self.wq(x)
+            xk = self.wk(context)
+            xv = self.wv(context)
+        # Split the feature axis into (heads, head_dim)
+        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
+        xk = xk.view(bsz, context_len, self.n_local_kv_heads, self.head_dim)
+        xv = xv.view(bsz, context_len, self.n_local_kv_heads, self.head_dim)
+        if self.use_qk_normalization:
+            xq = self.q_norm(xq)
+            xk = self.k_norm(xk)
+        # Rotary embeddings are applied for "self" and "full" attention, not for cross-attention
+        if self.attn_type in ["self", "full"]:
+            xq, xk = rope(xq, xk, input_pos, seqlen)
+        # Move the head axis in front of the sequence axis: (bs, heads, seq, head_dim)
+        xq = xq.transpose(1, 2)
+        xk = xk.transpose(1, 2)
+        xv = xv.transpose(1, 2)
+        if self.attn_type == "self":
+            # Write the new keys/values at `input_pos` and attend over the whole cache
+            assert input_pos is not None
+            self.cache_k[:bsz, :, input_pos] = xk
+            self.cache_v[:bsz, :, input_pos] = xv
+            keys = self.cache_k[:bsz, :, :]
+            values = self.cache_v[:bsz, :, :]
+        else:
+            keys, values = xk, xv
+        # Expand every KV head n_rep times so keys/values have n_local_heads heads
+        keys = keys.repeat_interleave(self.n_rep, dim=1)
+        values = values.repeat_interleave(self.n_rep, dim=1)
+        # Masking is handled by the caller through `mask`, so the kernel is never asked
+        # to apply its own causal mask here.
+        is_causal = False
+        output = scaled_dot_product_attention(
+            xq,
+            keys,
+            values,
+            head_dim=self.head_dim,
+            mask=mask,
+            is_causal=is_causal,
+            dropout_p=0.0,
+        )
+        # Merge the heads back into one feature axis before the output projection
+        output = output.view(bsz, seqlen, -1)
+        return self.wo(output)
+def scaled_dot_product_attention(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    head_dim: int,
+    mask: Optional[torch.Tensor] = None,
+    is_causal: Optional[bool] = None,
+    dropout_p: float = 0.0,
+) -> torch.Tensor:
+    """
+    Thin wrapper around `torch.nn.functional.scaled_dot_product_attention`.
+    The softmax scale is fixed to `1 / sqrt(head_dim)`. Causality is resolved as follows:
+    - `is_causal=True`: the kernel's built-in causal masking is requested.
+    - `is_causal=False`: no causal masking; only the explicit `mask` (if any) is applied.
+    - `is_causal=None` (default): causal masking is used exactly when no `mask` is given.
+    Args:
+        q (torch.Tensor): Query tensor
+        k (torch.Tensor): Key tensor
+        v (torch.Tensor): Value tensor
+        head_dim (int): Dimension of each attention head
+        mask (Optional[torch.Tensor], optional): Attention mask. Defaults to None.
+        is_causal (Optional[bool], optional): Whether to apply causal attention mask. Defaults to None.
+        dropout_p (float, optional): Dropout rate. Defaults to 0.0.
+    Returns:
+        torch.Tensor: The attention output with axes 1 and 2 swapped and made contiguous.
+    """
+    use_causal = (mask is None) if is_causal is None else is_causal
+    attn_kwargs = {
+        "attn_mask": mask,
+        "dropout_p": dropout_p,
+        "scale": 1.0 / math.sqrt(head_dim),
+        "is_causal": use_causal,
+    }
+    attended = torch.nn.functional.scaled_dot_product_attention(q, k, v, **attn_kwargs)
+    # Swap the head and sequence axes so callers can flatten the heads with a plain view
+    return attended.transpose(1, 2).contiguous()

ar_modules_embedding.py ADDED Viewed

	@@ -0,0 +1,491 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import List, Optional, Tuple
+import numpy as np
+import torch
+from einops import rearrange, repeat
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    Build 1D sine/cosine position embeddings.
+    embed_dim: output dimension for each position (must be even)
+    pos: positions to be encoded, as a list or array of any shape; flattened to size (M,)
+    out: (M, D), with the sine half in the first D/2 columns and the cosine half in the last D/2
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float64)
+    omega /= embed_dim / 2.0
+    omega = 1.0 / 10000**omega  # (D/2,)
+    # Coerce first so plain Python lists (as the contract promises) work, not only ndarrays
+    pos = np.asarray(pos).reshape(-1)  # (M,)
+    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
+    emb_sin = np.sin(out)  # (M, D/2)
+    emb_cos = np.cos(out)  # (M, D/2)
+    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    return emb
+def _rotate_half_te(x: torch.Tensor) -> torch.Tensor:
+    """
+    Split the last dimension into a first and a second half and return
+    `[-second_half, first_half]` concatenated along the last dimension.
+    Adopted from TransformerEngine.
+    Source: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py
+    """
+    half = x.shape[-1] // 2
+    halves = x.view(x.shape[:-1] + torch.Size((2, half)))
+    first_half = halves[..., 0, :]
+    second_half = halves[..., 1, :]
+    return torch.cat((-second_half, first_half), dim=-1)
+def _apply_rotary_pos_emb_te(
+    t: torch.Tensor,
+    cos_freqs: torch.Tensor,
+    sin_freqs: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Rotate the leading channels of `t` with precomputed cosine and sine tables.
+    Adopted from TransformerEngine.
+    Source: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py
+    Parameters
+    ----------
+    t: torch.Tensor
+        Input tensor of shape `[b, s, h, d]`, on which
+        rotary positional embedding will be applied.
+    cos_freqs: torch.Tensor
+        Cosine component of the rotary positional embedding; must broadcast against `t`.
+        Its last dimension sets how many channels of `t` are rotated.
+    sin_freqs: torch.Tensor
+        Sine component of the rotary positional embedding; must broadcast against `t`.
+    """
+    rot_dim = cos_freqs.shape[-1]
+    # Channels past rot_dim are passed through unchanged (ideally there are none)
+    rotary_part = t[..., :rot_dim]
+    passthrough_part = t[..., rot_dim:]
+    # cos term keeps the channels in place; sin term uses the sign-flipped, half-swapped copy
+    rotated = rotary_part * cos_freqs + _rotate_half_te(rotary_part) * sin_freqs
+    return torch.cat((rotated, passthrough_part), dim=-1)
+class RotaryPositionEmbedding(torch.nn.Module):
+    """
+    Rotary Position Embedding module as described in the paper:
+    https://arxiv.org/abs/2104.09864
+    This module precomputes the rotary frequency table for 1D (token index), 2D (height, width) or
+    3D (time, height, width) positions, optionally with YaRN frequency scaling.
+    Args:
+        dim (int): Dimensionality of the input tensor.
+        max_position_embeddings (Optional[int]): Maximum position embeddings (required for "1D").
+        original_max_position_embeddings (Optional[int]): Original maximum position embeddings
+            (required for "1D" with YaRN).
+        rope_theta (Optional[float]): Base for the frequency calculation.
+        apply_yarn (Optional[bool]): Whether to apply YaRN frequency scaling.
+        scale (Optional[int]): Scaling factor for the frequency calculation.
+        extrapolation_factor (Optional[int]): Extrapolation factor for the frequency extension.
+        attn_factor (Optional[int]): Attention factor for the frequency calculation.
+        beta_fast (Optional[int]): Fast beta value for the YaRN frequency calculation.
+        beta_slow (Optional[int]): Slow beta value for the YaRN frequency calculation.
+        rope_dim (Optional[str]): Dimensionality of the RoPE. Choices: "1D", "2D", "3D".
+        latent_shape (Optional[List[int]]): Shape of the latent tensor for video or image inputs.
+        original_latent_shape (Optional[List[int]]): Original shape of the latent tensor for video or image inputs.
+        pad_to_multiple_of (Optional[int]): Pad the position embedding to a multiple of this value.
+    """
+    def __init__(
+        self,
+        dim: int,
+        max_position_embeddings: Optional[int] = None,
+        original_max_position_embeddings: Optional[int] = None,
+        rope_theta: Optional[float] = 10000.0,
+        apply_yarn: Optional[bool] = False,
+        scale: Optional[int] = None,
+        extrapolation_factor: Optional[int] = 1,
+        attn_factor: Optional[int] = 1,
+        beta_fast: Optional[int] = 32,
+        beta_slow: Optional[int] = 1,
+        rope_dim: Optional[str] = "1D",
+        latent_shape: Optional[List[int]] = None,
+        original_latent_shape: Optional[List[int]] = None,
+        pad_to_multiple_of: Optional[int] = None,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.original_max_position_embeddings = original_max_position_embeddings
+        self.rope_theta = rope_theta
+        self.apply_yarn = apply_yarn
+        self.scale = scale
+        self.extrapolation_factor = extrapolation_factor
+        self.attn_factor = attn_factor
+        self.beta_fast = beta_fast
+        self.beta_slow = beta_slow
+        self.mscale = 1.0
+        self.rope_dim = rope_dim
+        self.latent_shape = latent_shape
+        self.original_latent_shape = original_latent_shape
+        self.pad_to_multiple_of = pad_to_multiple_of
+        # Builds the inverse frequencies and the initial frequency table on the current CUDA device
+        self.get_inv_freq(torch.cuda.current_device())
+    def get_mscale(self, scale: float = 1.0) -> float:
+        """Get the magnitude scaling factor for YaRN (1.0 when `scale <= 1`)."""
+        if scale <= 1:
+            return 1.0
+        return 0.1 * math.log(scale) + 1.0
+    def forward(self, seq_len: Optional[int] = None) -> torch.Tensor:
+        """
+        Forward pass for the rotary position embedding.
+        The frequency table is recomputed on every call. With YaRN enabled, a `seq_len` larger than
+        the cached maximum first grows the cached maximum.
+        Args:
+            seq_len (Optional[int]): Length of the sequence. None leaves the cached maximum unchanged.
+        Returns:
+            torch.Tensor: The computed frequencies for positional embedding.
+        """
+        # Guard against None: comparing None with an int would raise TypeError
+        if self.apply_yarn and seq_len is not None and seq_len > self.max_seq_len_cached:
+            self.max_seq_len_cached = seq_len
+        self.freqs = self.compute_freqs()
+        return self.freqs
+    def compute_freqs(
+        self,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Compute the position-times-frequency table for the configured RoPE dimensionality.
+        Returns a single tensor: (max_seq_len_cached, len(inv_freq)) for "1D", and
+        (num_positions, 1, 1, d) for "2D" / "3D" where num_positions is the product of `latent_shape`.
+        NOTE(review): the return annotation says tuple, but this implementation returns one tensor.
+        Raises:
+            ValueError: If `rope_dim` is not one of "1D", "2D", "3D".
+        """
+        if self.rope_dim == "1D":
+            reference = self.inv_freq
+        elif self.rope_dim in ("2D", "3D"):
+            reference = self.spatial_inv_freq
+        else:
+            raise ValueError(f"Invalid RoPE dimensionality: {self.rope_dim}")
+        # Create the position index on the same device as the inverse frequencies so the products
+        # below never mix devices (this is the CUDA device when constructed through __init__).
+        self.seq = torch.arange(self.max_seq_len_cached, dtype=torch.float, device=reference.device)
+        if self.rope_dim == "1D":
+            emb = torch.einsum("i,j->ij", self.seq, self.inv_freq)
+        elif self.rope_dim == "2D":
+            H, W = self.latent_shape
+            half_emb_h = torch.outer(self.seq[:H], self.spatial_inv_freq)
+            half_emb_w = torch.outer(self.seq[:W], self.spatial_inv_freq)
+            # [h, w] is laid out twice along the channel axis: [h, w, h, w]
+            emb = torch.cat(
+                [
+                    repeat(half_emb_h, "h d -> h w d", w=W),
+                    repeat(half_emb_w, "w d -> h w d", h=H),
+                ]
+                * 2,
+                dim=-1,
+            )
+            emb = rearrange(emb, "h w d -> (h w) 1 1 d").float()
+        else:
+            T, H, W = self.latent_shape
+            half_emb_t = torch.outer(self.seq[:T], self.temporal_inv_freq)
+            half_emb_h = torch.outer(self.seq[:H], self.spatial_inv_freq)
+            half_emb_w = torch.outer(self.seq[:W], self.spatial_inv_freq)
+            # [t, h, w] is laid out twice along the channel axis: [t, h, w, t, h, w]
+            emb = torch.cat(
+                [
+                    repeat(half_emb_t, "t d -> t h w d", h=H, w=W),
+                    repeat(half_emb_h, "h d -> t h w d", t=T, w=W),
+                    repeat(half_emb_w, "w d -> t h w d", t=T, h=H),
+                ]
+                * 2,
+                dim=-1,
+            )
+            emb = rearrange(emb, "t h w d -> (t h w) 1 1 d").float()
+        return emb
+    def get_scale_factors(self, inv_freq: torch.Tensor, original_seq_len: int) -> torch.Tensor:
+        """Get the per-frequency scale factors for YaRN."""
+        # Calculate the high and low frequency cutoffs for YaRN. Note: `beta_fast` and `beta_slow` are called
+        # `high_freq_factor` and `low_freq_factor` in the Llama 3.1 RoPE scaling code.
+        high_freq_cutoff = 2 * math.pi * self.beta_fast / original_seq_len
+        low_freq_cutoff = 2 * math.pi * self.beta_slow / original_seq_len
+        # Obtain a smooth mask that has a value of 0 for low frequencies and 1 for high frequencies, with linear
+        # interpolation in between.
+        smooth_mask = torch.clamp((inv_freq - low_freq_cutoff) / (high_freq_cutoff - low_freq_cutoff), min=0, max=1)
+        # For low frequencies, we scale the frequency by 1/self.scale. For high frequencies, we keep the frequency.
+        scale_factors = (1 - smooth_mask) / self.scale + smooth_mask
+        return scale_factors
+    def get_inv_freq(self, device: torch.device) -> None:
+        """
+        Compute and store the inverse frequencies for the configured RoPE dimensionality, then
+        refresh `self.freqs`.
+        Sets `inv_freq` ("1D") or `spatial_inv_freq` ("2D") or `spatial_inv_freq` and
+        `temporal_inv_freq` ("3D"), plus `max_seq_len_cached`; with YaRN it also sets `mscale`.
+        Args:
+            device: Device on which the inverse-frequency tensors are created.
+        Raises:
+            AssertionError: If a setting required by the chosen mode is missing.
+            ValueError: If `rope_dim` is not one of "1D", "2D", "3D".
+        """
+        if self.rope_dim == "1D":
+            assert self.max_position_embeddings is not None, "Max position embeddings required."
+            inv_freq = 1.0 / (
+                self.rope_theta ** (torch.arange(0, self.dim, 2, dtype=torch.float32, device=device) / self.dim)
+            )
+            if self.apply_yarn:
+                assert self.original_max_position_embeddings is not None, "Original max position embeddings required."
+                assert self.beta_slow is not None, "Beta slow value required."
+                assert self.beta_fast is not None, "Beta fast value required."
+                scale_factors = self.get_scale_factors(inv_freq, self.original_max_position_embeddings)
+                # Apply the scaling factors to inv_freq.
+                inv_freq = inv_freq * scale_factors
+                # Set the magnitude scaling factor.
+                self.mscale = float(self.get_mscale(self.scale) * self.attn_factor)
+            self.max_seq_len_cached = self.max_position_embeddings
+            self.inv_freq = inv_freq
+        elif self.rope_dim == "2D":
+            assert self.latent_shape is not None, "Latent shape required."
+            # Half of the channels encode height, the other half width
+            dim_h = self.dim // 2
+            # The exponent must be (index / dim_h). `**` binds tighter than `/`, so the parentheses
+            # are required; without them this computed (theta ** index) / dim_h.
+            spatial_inv_freq = 1.0 / (
+                self.rope_theta ** (torch.arange(0, dim_h, 2, dtype=torch.float32, device=device) / dim_h)
+            )
+            if self.apply_yarn:
+                assert self.original_latent_shape is not None, "Original latent shape required."
+                assert self.beta_slow is not None, "Beta slow value required."
+                assert self.beta_fast is not None, "Beta fast value required."
+                scale_factors = self.get_scale_factors(spatial_inv_freq, self.original_latent_shape[0])
+                spatial_inv_freq = spatial_inv_freq * scale_factors
+                self.mscale = float(self.get_mscale(self.scale) * self.attn_factor)
+            self.spatial_inv_freq = spatial_inv_freq
+            self.max_seq_len_cached = max(self.latent_shape)
+        elif self.rope_dim == "3D":
+            assert self.latent_shape is not None, "Latent shape required."
+            # Height and width each get dim_h channels; time gets the remainder
+            dim_h = self.dim // 6 * 2
+            dim_t = self.dim - 2 * dim_h
+            self.dim_spatial_range = torch.arange(0, dim_h, 2)[: (dim_h // 2)].float().to(device) / dim_h
+            spatial_inv_freq = 1.0 / (self.rope_theta**self.dim_spatial_range)
+            self.dim_temporal_range = torch.arange(0, dim_t, 2)[: (dim_t // 2)].float().to(device) / dim_t
+            temporal_inv_freq = 1.0 / (self.rope_theta**self.dim_temporal_range)
+            if self.apply_yarn:
+                assert self.original_latent_shape is not None, "Original latent shape required."
+                assert self.beta_slow is not None, "Beta slow value required."
+                assert self.beta_fast is not None, "Beta fast value required."
+                scale_factors_spatial = self.get_scale_factors(spatial_inv_freq, self.original_latent_shape[1])
+                spatial_inv_freq = spatial_inv_freq * scale_factors_spatial
+                scale_factors_temporal = self.get_scale_factors(temporal_inv_freq, self.original_latent_shape[0])
+                temporal_inv_freq = temporal_inv_freq * scale_factors_temporal
+                self.mscale = float(self.get_mscale(self.scale) * self.attn_factor)
+            self.spatial_inv_freq = spatial_inv_freq
+            self.temporal_inv_freq = temporal_inv_freq
+            self.max_seq_len_cached = max(self.latent_shape)
+        else:
+            raise ValueError(f"Invalid RoPE dimensionality: {self.rope_dim}")
+        self.freqs = self.compute_freqs()
+class RotaryPositionEmbeddingPytorchV2(RotaryPositionEmbedding):
+    """
+    Rotary position embedding that mirrors the TransformerEngine RoPE implementation
+    (https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py).
+    The cos/sin tables are built once at construction time and kept as non-persistent buffers.
+    """
+    def __init__(
+        self,
+        seq_len: int,
+        training_type: str = None,
+        **kwargs,
+    ):
+        """
+        Args:
+            seq_len (int): Sequence length used to build the cached tables.
+            training_type (str): Forwarded to create_rope_freqs; "text_to_video" adds a leading
+                <bov> slot for 2D/3D RoPE.
+            **kwargs: Forwarded unchanged to the RotaryPositionEmbedding constructor.
+        """
+        super().__init__(**kwargs)
+        angles = self.create_rope_freqs(seq_len=seq_len, training_type=training_type)
+        angles = angles.transpose(0, 1).contiguous()  # [seq, 1, 1, dim] -> [1, seq, 1, dim]
+        assert angles.shape[0] == 1 and angles.shape[2] == 1, f"emb shape: {angles.shape}"
+        # Take cos/sin first and convert dtype later (in forward) for better precision.
+        for buffer_name, trig_fn in (("cos_cached", torch.cos), ("sin_cached", torch.sin)):
+            self.register_buffer(buffer_name, trig_fn(angles), persistent=False)
+    def create_rope_freqs(self, seq_len: int, training_type: str = None) -> torch.Tensor:
+        """
+        Build the rotary angle table for one sample.
+        Args:
+            seq_len (int): Sequence length of a sample.
+            training_type (str): When "text_to_video" and the RoPE is 2D/3D, one all-zero row is
+                prepended for the <bov> token.
+        Returns:
+            torch.Tensor: The angle table, zero-padded along dim 0 up to a multiple of
+                pad_to_multiple_of when that attribute is set.
+        Raises:
+            ValueError: If rope_dim is not one of "1D", "2D" or "3D".
+        """
+        rope_dim = self.rope_dim
+        if rope_dim not in ("1D", "2D", "3D"):
+            raise ValueError(f"Invalid RoPE dimensionality: {self.rope_dim}")
+        emb = super().forward(seq_len=seq_len)
+        if rope_dim == "1D":
+            # Duplicate the frequencies along the feature axis, then add two singleton axes.
+            emb = torch.cat((emb, emb), dim=-1)
+            emb = emb.reshape(emb.size(0), 1, 1, emb.size(1))
+        elif training_type == "text_to_video":
+            # since we added <bov> token at the beginning of the video for text2world, we also extend the position embedding by one token in the beginning
+            leading_slot = torch.zeros((1, *emb.shape[1:]), device=emb.device)
+            emb = torch.cat((leading_slot, emb), dim=0)
+        multiple = self.pad_to_multiple_of
+        if multiple is not None:
+            remainder = emb.shape[0] % multiple
+            if remainder != 0:
+                # Round up to the nearest multiple of pad_to_multiple_of
+                filler = torch.zeros((multiple - remainder, *emb.shape[1:]), device=emb.device)
+                emb = torch.cat((emb, filler), dim=0)
+        return emb
+    def forward(
+        self, q: torch.Tensor, k: torch.Tensor, input_pos: Optional[torch.Tensor] = None, seq_len: Optional[int] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Rotate the query and key tensors with the cached cos/sin tables.
+        Args:
+            q (torch.Tensor): Query tensor.
+            k (torch.Tensor): Key tensor.
+            input_pos (Optional[torch.Tensor]): Indices selecting rows of the cached tables; takes
+                precedence over seq_len.
+            seq_len (Optional[int]): When input_pos is None, only the first seq_len rows are used.
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: Rotated query and key tensors.
+        """
+        target_dtype = q.dtype
+        if target_dtype != self.cos_cached.dtype:
+            # The cached buffers are overwritten, so the cast happens once per dtype change.
+            self.cos_cached = self.cos_cached.to(target_dtype)
+            self.sin_cached = self.sin_cached.to(target_dtype)
+        cos_table, sin_table = self.cos_cached, self.sin_cached
+        if input_pos is not None:
+            cos_table = cos_table[:, input_pos, :, :]
+            sin_table = sin_table[:, input_pos, :, :]
+        elif seq_len is not None:
+            cos_table = cos_table[:, :seq_len, :, :]
+            sin_table = sin_table[:, :seq_len, :, :]
+        rotated_q = _apply_rotary_pos_emb_te(q, cos_table, sin_table)
+        rotated_k = _apply_rotary_pos_emb_te(k, cos_table, sin_table)
+        return rotated_q, rotated_k
+class RotaryPositionEmbeddingPytorchV1(RotaryPositionEmbedding):
+    """
+    Rotary Position Embedding that works in the same way as
+    mistral_inference (https://github.com/mistralai/mistral-inference/blob/main/src/mistral_inference/rope.py)
+    or llama3 (https://github.com/meta-llama/llama3/blob/main/llama/model.py)
+    The cos/sin tables are scaled by self.mscale and stored as non-persistent buffers.
+    """
+    def __init__(
+        self,
+        **kwargs,
+    ):
+        """
+        Args:
+            **kwargs: Forwarded unchanged to the RotaryPositionEmbedding constructor.
+        Raises:
+            ValueError: If rope_dim is not one of "1D", "2D" or "3D".
+        """
+        super().__init__(
+            **kwargs,
+        )
+        if self.rope_dim == "1D":
+            # Interleave each frequency with itself so consecutive feature pairs share one angle.
+            emb = torch.stack((self.freqs, self.freqs), dim=-1).reshape(*self.freqs.shape[:-1], -1)
+        elif self.rope_dim in ["2D", "3D"]:
+            emb = rearrange(self.freqs, "s 1 1 d -> s d").float()
+        else:
+            # Same error as forward(); avoids an unbound `emb` below.
+            raise ValueError(f"Invalid RoPE dimensionality: {self.rope_dim}")
+        self.register_buffer("cos_cached", (emb.cos() * self.mscale)[None, :, None, :], persistent=False)
+        self.register_buffer("sin_cached", (emb.sin() * self.mscale)[None, :, None, :], persistent=False)
+    def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
+        """Map every consecutive feature pair (x1, x2) of the last dimension to (-x2, x1)."""
+        x_reshaped = x.reshape(*x.shape[:-1], -1, 2)
+        x1 = x_reshaped[..., 0]
+        x2 = x_reshaped[..., 1]
+        output = torch.stack((-x2, x1), dim=-1).reshape(*x.shape)
+        return output
+    def forward(
+        self, q: torch.Tensor, k: torch.Tensor, input_pos: Optional[torch.Tensor] = None, seq_len: Optional[int] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Forward pass for the rotary position embedding.
+        Args:
+            q (torch.Tensor): Query tensor.
+            k (torch.Tensor): Key tensor.
+            input_pos (Optional[torch.Tensor]): Indices used to select rows of the cached tables.
+            seq_len (Optional[int]): Length of the sequence; required when input_pos is None.
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: Rotated query and key tensors, cast back to the
+                dtypes of q and k respectively.
+        """
+        # seq_len may legitimately be None when input_pos is given; comparing None with an int
+        # would raise TypeError, so the cache is only rebuilt when a length is actually provided.
+        if self.apply_yarn and seq_len is not None and seq_len > self.max_seq_len_cached:
+            freqs = super().forward(seq_len)
+            if self.rope_dim == "1D":
+                emb = torch.stack((freqs, freqs), dim=-1).reshape(*freqs.shape[:-1], -1)
+            elif self.rope_dim in ["2D", "3D"]:
+                emb = rearrange(freqs, "s 1 1 d -> s d").float()
+            else:
+                raise ValueError(f"Invalid RoPE dimensionality: {self.rope_dim}")
+            # Replace the cached tables with ones covering the longer sequence.
+            self.register_buffer(
+                "cos_cached", (emb.cos() * self.mscale)[None, :, None, :].to(q.dtype), persistent=False
+            )
+            self.register_buffer(
+                "sin_cached", (emb.sin() * self.mscale)[None, :, None, :].to(q.dtype), persistent=False
+            )
+        if input_pos is not None:
+            cos_cached = self.cos_cached[:, input_pos]
+            sin_cached = self.sin_cached[:, input_pos]
+        else:
+            assert (
+                self.cos_cached.shape[1] >= seq_len
+            ), f"Invalid sequence length; cos_cached.shape {self.cos_cached.shape}, seq_len {seq_len}."
+            cos_cached = self.cos_cached[:, :seq_len, ...]
+            sin_cached = self.sin_cached[:, :seq_len, ...]
+        xq = q * cos_cached + self.rotate_half(q) * sin_cached
+        xk = k * cos_cached + self.rotate_half(k) * sin_cached
+        return xq.type_as(q), xk.type_as(k)
+class SinCosPosEmbAxisTE(torch.nn.Module):
+    """Per-axis sin/cos position embedding for a (T, H, W) latent grid, concatenated along features."""
+    def __init__(
+        self,
+        dim: int,
+        latent_shape: Optional[List[int]] = None,
+        pad_to_multiple_of: Optional[int] = None,
+        dtype: torch.dtype = torch.bfloat16,
+        **kwargs,
+    ):
+        """
+        Args:
+            dim (int): Total embedding width; split into two equal spatial parts and a temporal remainder.
+            latent_shape (Optional[List[int]]): (T, H, W) shape of the latent grid; unpacked
+                unconditionally, so it must be provided.
+            pad_to_multiple_of (Optional[int]): Pad the position embedding to a multiple of this value.
+            dtype (torch.dtype): Data type of the position embedding tensor.
+            **kwargs: Accepted and ignored.
+        """
+        super().__init__()
+        spatial_dim = dim // 6 * 2
+        temporal_dim = dim - 2 * spatial_dim
+        assert (
+            dim == spatial_dim + spatial_dim + temporal_dim
+        ), f"bad dim: {dim} != {spatial_dim} + {spatial_dim} + {temporal_dim}"
+        self.latent_shape = latent_shape
+        num_frames, height, width = latent_shape
+        # (buffer name, feature width, axis length); registration order is h, w, t.
+        axis_specs = (
+            ("pos_emb_h", spatial_dim, height),
+            ("pos_emb_w", spatial_dim, width),
+            ("pos_emb_t", temporal_dim, num_frames),
+        )
+        for buffer_name, axis_dim, axis_len in axis_specs:
+            table = get_1d_sincos_pos_embed_from_grid(axis_dim, pos=np.arange(axis_len))
+            # The tables are placed on the CUDA device at construction time.
+            self.register_buffer(
+                buffer_name, torch.from_numpy(table).to(dtype=dtype, device="cuda"), persistent=False
+            )
+        self.pad_to_multiple_of = pad_to_multiple_of
+    def forward(
+        self,
+        training_type: str = None,
+    ) -> torch.Tensor:
+        """
+        Assemble the flattened position embedding.
+        Args:
+            training_type (str): When "text_to_video", one all-zero row is prepended for the <bov> token.
+        Returns:
+            torch.Tensor: Tensor of shape (1, seq_len, dim), where seq_len is T*H*W plus the
+                optional leading row and any zero padding up to a multiple of pad_to_multiple_of.
+        """
+        num_frames, height, width = self.latent_shape
+        # Broadcast every per-axis table over the full grid; feature order is (t, h, w).
+        temporal = repeat(self.pos_emb_t, "t d-> t h w d", h=height, w=width)
+        vertical = repeat(self.pos_emb_h, "h d-> t h w d", t=num_frames, w=width)
+        horizontal = repeat(self.pos_emb_w, "w d-> t h w d", t=num_frames, h=height)
+        emb = torch.cat([temporal, vertical, horizontal], dim=-1)
+        # Flatten the T,H,W dimensions
+        emb = rearrange(emb, "t h w d -> (t h w) d")
+        if training_type == "text_to_video":
+            leading_slot = torch.zeros((1, *emb.shape[1:]), device=emb.device, dtype=emb.dtype)
+            emb = torch.cat((leading_slot, emb), dim=0)
+        multiple = self.pad_to_multiple_of
+        if multiple is not None:
+            remainder = emb.shape[0] % multiple
+            if remainder != 0:
+                filler = torch.zeros((multiple - remainder, *emb.shape[1:]), device=emb.device, dtype=emb.dtype)
+                emb = torch.cat((emb, filler), dim=0)
+        seq_len, dim = emb.shape
+        return emb.reshape(1, seq_len, dim)

ar_modules_mlp.py ADDED Viewed

	@@ -0,0 +1,50 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class MLP(nn.Module):
+    """Gated feed-forward block computing w2(silu(w1(x)) * w3(x)) with bias-free linear layers."""
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+    ):
+        """
+        Build the three bias-free projections.
+        Args:
+            dim: The input and output dimensionality.
+            hidden_dim: The dimensionality of the hidden layer.
+        """
+        super().__init__()
+        # Creation order (w1, w2, w3) is kept so parameter initialization stays reproducible.
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Apply the gated projection to the input.
+        Args:
+            x: The input tensor whose last dimension is dim.
+        Returns:
+            The output tensor with the same last dimension as the input.
+        """
+        gate = F.silu(self.w1(x))
+        hidden = gate * self.w3(x)
+        return self.w2(hidden)

ar_modules_normalization.py ADDED Viewed

	@@ -0,0 +1,88 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+def create_norm(norm_type: str, dim: int, eps: float = 1e-6):
+    """
+    Build the normalization layer selected by norm_type (matched case-insensitively).
+    Adapted from torchtitan: https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/norms.py
+    Args:
+        norm_type (str): The type of normalization layer to create.
+            Supported types: 1. layernorm 2. np_layernorm 3. rmsnorm 4. compiled_rmsnorm
+            ("fused_rmsnorm" is recognized but not implemented).
+        dim (int): The dimension of the normalization layer.
+        eps (float, optional): The epsilon value for numerical stability. Defaults to 1e-6.
+    Returns:
+        The created normalization layer.
+    Raises:
+        NotImplementedError: If norm_type is "fused_rmsnorm" or unknown.
+    """
+    kind = norm_type.lower()
+    if kind in ("layernorm", "np_layernorm"):
+        # "np_layernorm" is the variant without learnable affine parameters; neither has a bias.
+        return nn.LayerNorm(dim, eps=eps, elementwise_affine=(kind == "layernorm"), bias=False)
+    if kind in ("rmsnorm", "compiled_rmsnorm"):
+        return RMSNorm(dim, eps=eps, compile=(kind == "compiled_rmsnorm"))
+    if kind == "fused_rmsnorm":
+        raise NotImplementedError("Fused RMSNorm is not supported yet.")
+    raise NotImplementedError(f"Unknown norm_type: '{kind}'")
+class RMSNorm(nn.Module):
+    """
+    Root-mean-square normalization over the last dimension with a learnable per-feature scale.
+    Reference implementation: https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/norms.py
+    Args:
+        dim (int): The dimension of the input tensor.
+        eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
+        compile (bool, optional): Whether to wrap the norm computation in torch.compile. Default is False.
+    Attributes:
+        eps (float): A small value added to the denominator for numerical stability.
+        weight (nn.Parameter): Learnable scaling parameter, initialized to ones.
+        rmsnorm_fn: The (optionally compiled) function used by forward.
+    """
+    def __init__(self, dim: int, eps: float = 1e-6, compile: bool = False):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+        if compile:
+            self.rmsnorm_fn = torch.compile(self.compute_rmsnorm, fullgraph=True)
+        else:
+            self.rmsnorm_fn = self.compute_rmsnorm
+    @staticmethod
+    def compute_rmsnorm(x: torch.Tensor, weight: torch.Tensor, eps: float):
+        """Normalize x by its root mean square in float32, cast back to x's dtype, then scale by weight."""
+        x_fp32 = x.float()
+        inv_rms = torch.rsqrt(x_fp32.pow(2).mean(-1, keepdim=True) + eps)
+        normalized = (x_fp32 * inv_rms).type_as(x)
+        return normalized * weight
+    def forward(self, x: torch.Tensor):
+        """Apply RMS normalization to x using this module's weight and eps."""
+        return self.rmsnorm_fn(x, self.weight, self.eps)
+    def reset_parameters(self):
+        """Reset the scale to all ones."""
+        torch.nn.init.ones_(self.weight)

ar_networks.py ADDED Viewed

	@@ -0,0 +1,63 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections import namedtuple
+import torch
+from torch import nn
+from .ar_tokenizer_modules import CausalConv3d, DecoderFactorized, EncoderFactorized
+from .ar_tokenizer_quantizers import FSQuantizer
+from .log import log
+# Result container returned by CausalDiscreteVideoTokenizer.forward outside training mode
+# (in training mode the same three fields are returned as a plain dict).
+NetworkEval = namedtuple("NetworkEval", ["reconstructions", "quant_loss", "quant_info"])
+class CausalDiscreteVideoTokenizer(nn.Module):
+    """
+    Video tokenizer assembled from EncoderFactorized, FSQuantizer and DecoderFactorized,
+    joined by kernel-size-1 CausalConv3d projections before and after quantization.
+    """
+    def __init__(self, z_channels: int, z_factor: int, embedding_dim: int, **kwargs) -> None:
+        """
+        Args:
+            z_channels (int): Channel count fed to the decoder; the encoder emits z_factor * z_channels.
+            z_factor (int): Multiplier applied to z_channels on the encoder side.
+            embedding_dim (int): Channel count of the tensor passed to the quantizer.
+            **kwargs: Forwarded to the encoder, decoder and quantizer; "name" overrides the model name.
+        """
+        super().__init__()
+        encoder_channels = z_factor * z_channels
+        self.name = kwargs.get("name", "CausalDiscreteVideoTokenizer")
+        self.embedding_dim = embedding_dim
+        # Sub-module creation order is kept as-is (encoder, decoder, convs, quantizer).
+        self.encoder = EncoderFactorized(z_channels=encoder_channels, **kwargs)
+        self.decoder = DecoderFactorized(z_channels=z_channels, **kwargs)
+        self.quant_conv = CausalConv3d(encoder_channels, embedding_dim, kernel_size=1, padding=0)
+        self.post_quant_conv = CausalConv3d(embedding_dim, z_channels, kernel_size=1, padding=0)
+        self.quantizer = FSQuantizer(**kwargs)
+        num_parameters = sum(param.numel() for param in self.parameters())
+        log.debug(f"model={self.name}, num_parameters={num_parameters:,}")
+        log.debug(f"z_channels={z_channels}, embedding_dim={self.embedding_dim}.")
+    def to(self, *args, **kwargs):
+        """
+        Move/cast the module like nn.Module.to, first recording the target dtype on the quantizer.
+        Only the `dtype` keyword argument is inspected; when it is absent (including when a dtype
+        is passed positionally) the quantizer's dtype attribute is set to torch.bfloat16.
+        """
+        self.quantizer.dtype = kwargs.get("dtype", torch.bfloat16)
+        return super().to(*args, **kwargs)
+    def encode(self, x):
+        """Run the encoder and the pre-quantization projection, then return the quantizer's output."""
+        projected = self.quant_conv(self.encoder(x))
+        return self.quantizer(projected)
+    def decode(self, quant):
+        """Project quantized codes back to z_channels and run the decoder."""
+        return self.decoder(self.post_quant_conv(quant))
+    def forward(self, input):
+        """
+        Encode and reconstruct the input.
+        Returns:
+            A dict with keys reconstructions/quant_loss/quant_info in training mode,
+            otherwise a NetworkEval namedtuple with the same fields.
+        """
+        quant_info, quant_codes, quant_loss = self.encode(input)
+        outputs = dict(
+            reconstructions=self.decode(quant_codes),
+            quant_loss=quant_loss,
+            quant_info=quant_info,
+        )
+        if self.training:
+            return outputs
+        return NetworkEval(**outputs)

ar_tokenizer.py ADDED Viewed

	@@ -0,0 +1,322 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections import defaultdict
+from typing import Optional
+import torch
+from einops import rearrange
+from .ar_config_tokenizer import TokenizerConfig
+from .lazy_config_init import instantiate as lazy_instantiate
+def update_vocab_size(
+    existing_vocab_size,
+    to_be_added_vocab_size,
+    training_type,
+    add_special_tokens,
+    video_special_tokens=None,
+):
+    """
+    Return the vocabulary size after adding a new tokenizer's vocabulary.
+    Args:
+        existing_vocab_size (int): Vocabulary size before the addition.
+        to_be_added_vocab_size (int): Size of the vocabulary being added.
+        training_type (str): Training mode; "text_to_video" reserves one extra <bov> token
+            when special tokens are not added explicitly.
+        add_special_tokens (bool): Whether every entry of video_special_tokens gets its own id.
+        video_special_tokens (Optional[dict]): Special tokens to account for; treated as empty
+            when None.
+    Returns:
+        int: The updated vocabulary size.
+    """
+    # A None default replaces the former shared mutable `{}` default.
+    if video_special_tokens is None:
+        video_special_tokens = {}
+    # New vocab size
+    if add_special_tokens:
+        return existing_vocab_size + to_be_added_vocab_size + len(video_special_tokens)
+    # For text_to_video, we add one <bov> special token at the beginning of the video
+    if training_type == "text_to_video":
+        return existing_vocab_size + to_be_added_vocab_size + 1
+    return existing_vocab_size + to_be_added_vocab_size
+class DiscreteMultimodalTokenizer:
+    """
+    Tokenizer that combines text, class-label and video tokens into one padded token sequence.
+    The text and video tokenizers are instantiated from the given TokenizerConfig. Video token ids
+    are shifted by the configured tokenizer_offset, and three video special tokens are assigned
+    the ids directly after the video vocabulary.
+    """
+    def __init__(self, tokenizer_config: TokenizerConfig):
+        """
+        Args:
+            tokenizer_config (TokenizerConfig): Configuration providing seq_len, pad_to_multiple_of,
+                training_type and the per-modality tokenizer settings.
+        """
+        self.tokenizer_config = tokenizer_config
+        self.vocab_size = 0
+        self.total_seq_len = tokenizer_config.seq_len
+        self.pad_to_multiple_of = tokenizer_config.pad_to_multiple_of
+        self.training_type = tokenizer_config.training_type
+        assert self.training_type in [
+            "text_only",
+            "text_to_video",
+            "video_to_video",
+            "image_text_interleaved",
+        ], f"{self.training_type} not supported"
+        # Text first: the video builder adds its vocabulary on top of self.vocab_size.
+        self._build_text_tokenizer()
+        self._build_video_tokenizer()
+    def _build_text_tokenizer(self):
+        r"""Function to initialize the text tokenizer model."""
+        if self.tokenizer_config.text_tokenizer is not None:
+            self.text_tokenizer = lazy_instantiate(self.tokenizer_config.text_tokenizer.config)
+            self.vocab_size += self.tokenizer_config.text_tokenizer.vocab_size
+        else:
+            self.text_tokenizer = None
+    def _build_video_tokenizer(self):
+        r"""Function to initialize the video tokenizer model."""
+        if self.tokenizer_config.video_tokenizer is not None:
+            self.video_tokenizer = lazy_instantiate(self.tokenizer_config.video_tokenizer.config)
+            self.video_tokenizer = self.video_tokenizer.to("cuda")
+            self.video_vocab_size = self.tokenizer_config.video_tokenizer.vocab_size
+            # Special tokens take the ids directly after the (offset) video vocabulary.
+            special_token_offset = (
+                self.tokenizer_config.video_tokenizer.tokenizer_offset
+                + self.tokenizer_config.video_tokenizer.vocab_size
+            )
+            self.video_special_tokens = {
+                "<|begin_of_video|>": special_token_offset,
+                "<|end_of_video|>": special_token_offset + 1,
+                "<|pad_token_video|>": special_token_offset + 2,
+            }
+            self.vocab_size = update_vocab_size(
+                existing_vocab_size=self.vocab_size,
+                to_be_added_vocab_size=self.tokenizer_config.video_tokenizer.vocab_size,
+                training_type=self.training_type,
+                add_special_tokens=self.tokenizer_config.add_special_tokens,
+                video_special_tokens=self.video_special_tokens,
+            )
+        else:
+            self.video_tokenizer = None
+    @property
+    def pad_id(self):
+        r"""Returns the pad_id: the text tokenizer's for text modes, <|pad_token_video|> for video modes."""
+        if self.training_type == "text_only" or self.training_type == "image_text_interleaved":
+            pad_id = self.text_tokenizer.pad_id
+        elif self.training_type in ["text_to_video", "video_to_video"]:
+            pad_id = self.video_special_tokens["<|pad_token_video|>"]
+        else:
+            raise ValueError(f"training_type {self.training_type} not defined")
+        return pad_id
+    @property
+    def ignore_index(self):
+        r"""Returns which token should be ignored during loss computation."""
+        if self.training_type == "text_only" or self.training_type == "image_text_interleaved":
+            if self.text_tokenizer.pad_id == self.text_tokenizer.eos_id:
+                # If the PAD token is the same as the EOS token, we do not ignore it during loss
+                # computation, since we want the model to be able to predict EOS tokens in inference.
+                # The PyTorch default ignore_index for the cross-entropy loss is -100.
+                ignore_index = -100
+            else:
+                ignore_index = self.text_tokenizer.pad_id
+        elif self.training_type in ["text_to_video", "video_to_video"]:
+            ignore_index = self.pad_id
+        else:
+            raise ValueError(f"training_type {self.training_type} not defined")
+        return ignore_index
+    @property
+    def stop_tokens(self):
+        r"""Returns the stop tokens: the text tokenizer's for text modes, {<|end_of_video|>} for video modes."""
+        if self.training_type == "text_only" or self.training_type == "image_text_interleaved":
+            stop_tokens = self.text_tokenizer.stop_tokens
+        elif self.training_type in ["text_to_video", "video_to_video"]:
+            stop_tokens = {self.video_special_tokens["<|end_of_video|>"]}
+        else:
+            raise ValueError(f"training_type {self.training_type} not defined")
+        return stop_tokens
+    def _tokenize_text(self, raw_text: list[str], max_text_seq_len: int = -1):
+        r"""Function to tokenize text.
+        Args:
+            raw_text (list[str]): List of input strings
+            max_text_seq_len (int): Maximum sequence length returned by text tokenizer;
+                -1 (the default) disables clipping.
+        Returns:
+            text_tokens (list[list[int]]): List of text tokens
+        """
+        text_tokens = [self.text_tokenizer.encode(text, bos=True, eos=True) for text in raw_text]
+        # Clipping the text tokens so that the sequence length does not exceed max_text_seq_len
+        if max_text_seq_len > -1:
+            for i, token_ids in enumerate(text_tokens):
+                if len(token_ids) > max_text_seq_len:
+                    if max_text_seq_len == 0:
+                        # No room for text at all. A slice end of -1 would keep almost the
+                        # whole sequence, so this case is handled explicitly.
+                        text_tokens[i] = []
+                    else:
+                        # Simply clip and add end of seq token
+                        text_tokens[i] = token_ids[: max_text_seq_len - 1] + [self.text_tokenizer.eos_id]
+        return text_tokens
+    def _tokenize_class(self, cls_labels: list[str]):
+        r"""Function to tokenize the class label.
+        Args:
+            cls_labels (list[str]): List of class indices
+        Returns:
+            class_tokens (list[list[int]]): List of class tokens (one single-token list per label)
+        """
+        # tokenizer_offset tells what offset should be added to the tokens.
+        # This is needed for vocab expansion.
+        class_tokens = [[int(x) + self.tokenizer_config.class_tokenizer.tokenizer_offset] for x in cls_labels]
+        return class_tokens
+    def _tokenize_video(self, videos: torch.Tensor, pixel_chunk_duration: Optional[int] = None):
+        r"""Function to tokenize video.
+        Args:
+            videos (torch.Tensor): Input video data tensor
+            pixel_chunk_duration (Optional[int]): Pixel chunk duration, passed through to the video tokenizer.
+        Returns:
+            video_tokens (list[list[int]]): List of video tokens
+        """
+        video_tokens = []
+        batch_size = videos.shape[0]
+        quantized_out, _ = self.video_tokenizer.encode(videos, pixel_chunk_duration=pixel_chunk_duration)
+        # Move the channel axis last before converting codes to indices.
+        indices = self.video_tokenizer.fsq_quantizer.codes_to_indices(quantized_out.permute(0, 2, 3, 4, 1))
+        # Flatten the indices
+        indices = rearrange(indices, "B T H W -> B (T H W)")
+        # tokenizer_offset tells what offset should be added to the tokens.
+        # This is needed for vocab expansion.
+        indices += self.tokenizer_config.video_tokenizer.tokenizer_offset
+        # Add begin and end of video tokens
+        bov_token = self.video_special_tokens["<|begin_of_video|>"]
+        eov_token = self.video_special_tokens["<|end_of_video|>"]
+        # Append bov and eov tokens
+        if self.tokenizer_config.add_special_tokens:
+            for i in range(batch_size):
+                video_tokens.append([bov_token] + indices[i].tolist() + [eov_token])
+        else:
+            if self.training_type == "text_to_video":
+                for i in range(batch_size):
+                    video_tokens.append([bov_token] + indices[i].tolist())
+            else:
+                for i in range(batch_size):
+                    video_tokens.append(indices[i].tolist())
+                    assert (
+                        len(video_tokens[-1]) == self.tokenizer_config.video_tokenizer.max_seq_len
+                    ), f"Expected {self.tokenizer_config.video_tokenizer.max_seq_len} tokens, got {len(video_tokens[-1])}; video shape: {videos.shape}"
+        return video_tokens
+    def tokenize(self, data_batch: dict):
+        r"""Function to tokenize data_dict.
+        Args:
+            data_batch (dict): Input data dict
+        Returns:
+            tokens (torch.LongTensor): Padded token tensor (or data_batch["tokens"] unchanged when
+                text tokens are pre-computed).
+            token_boundaries (dict | None): Maps "text"/"class"/"video" to a per-sample list of
+                (start, end) positions; None when tokens are pre-computed.
+        """
+        if (
+            self.training_type in ["text_only", "image_text_interleaved"]
+            and not self.tokenizer_config.text_tokenizer.tokenize_here
+        ):
+            # In case of pre-computed tokens, just return the data_batch
+            return data_batch["tokens"], None
+        # Online tokenization
+        tokens = []
+        token_boundaries = defaultdict(list)
+        # Obtain maximum sequence length
+        max_text_seq_len = -1
+        max_visual_seq_len = -1
+        if self.training_type in ["text_to_video", "video_to_video"]:
+            max_visual_seq_len = self.tokenizer_config.video_tokenizer.max_seq_len
+        # If max visual sequence length is specified, make sure that text is clipped so that
+        # the full video/image is always seen.
+        if max_visual_seq_len > -1:
+            if self.tokenizer_config.add_special_tokens:
+                max_visual_seq_len = max_visual_seq_len + 2  # Two special tokens is for [bov, eov] or [boi, eoi] token
+            elif self.training_type == "text_to_video":
+                max_visual_seq_len = max_visual_seq_len + 1
+            assert (
+                max_visual_seq_len <= self.total_seq_len
+            ), f"max_visual_seq_len ({max_visual_seq_len}) is greater that total sequence length ({self.total_seq_len})"
+            max_text_seq_len = self.total_seq_len - max_visual_seq_len
+        # Tokenize the text
+        if (
+            "text" in self.training_type
+            and self.text_tokenizer is not None
+            and self.tokenizer_config.text_tokenizer.tokenize_here
+        ):
+            key = self.tokenizer_config.text_tokenizer.data_key
+            # Validate before the first access so a missing key produces this message, not a KeyError.
+            assert key in data_batch, f"Key {key} should be present in data for text tokenizer"
+            batch_size = len(data_batch[key])
+            # Read the configured key (previously hard-coded to "caption").
+            tokens = self._tokenize_text(data_batch[key], max_text_seq_len)
+            for i in range(batch_size):
+                token_boundaries["text"].append((0, len(tokens[i])))
+        else:
+            tokens = []
+            batch_size = None
+        # Tokenize the class label
+        # NOTE: none of the training types accepted in __init__ contains "class", so this branch
+        # is currently not taken.
+        if "class" in self.training_type and self.tokenizer_config.class_tokenizer is not None:
+            key = self.tokenizer_config.class_tokenizer.data_key
+            assert key in data_batch, f"Key {key} should be present in data for class tokenizer"
+            batch_size = len(data_batch[key]) if batch_size is None else batch_size
+            tokens_class = self._tokenize_class(data_batch[key])
+            if len(tokens) == 0:
+                tokens = tokens_class
+                for i in range(batch_size):
+                    token_boundaries["class"].append((0, len(tokens[i])))
+            else:
+                for i in range(batch_size):
+                    token_boundaries["class"].append((len(tokens[i]), len(tokens[i]) + len(tokens_class[i])))
+                    tokens[i] = tokens[i] + tokens_class[i]
+        # Tokenize the video
+        if self.video_tokenizer is not None and self.tokenizer_config.video_tokenizer.tokenize_here:
+            key = self.tokenizer_config.video_tokenizer.data_key
+            assert key in data_batch, f"Key {key} should be present in data for video tokenizer"
+            batch_size = len(data_batch[key]) if batch_size is None else batch_size
+            pixel_chunk_duration = (
+                None  # If not specified, we assume it's a video dataset and use the default chunk duration
+            )
+            dataset_name = data_batch.get("dataset_name", None)
+            if dataset_name is not None and dataset_name.startswith("image"):
+                # If it's an image dataset, we use a pixel chunk duration of 1
+                pixel_chunk_duration = 1
+            tokens_video = self._tokenize_video(data_batch[key], pixel_chunk_duration=pixel_chunk_duration)
+            if len(tokens) == 0:
+                tokens = tokens_video
+                for i in range(batch_size):
+                    token_boundaries["video"].append((0, len(tokens[i])))
+                    # [B,] each entry is ((0, len(tokens[i])))
+            else:
+                for i in range(batch_size):
+                    token_boundaries["video"].append((len(tokens[i]), len(tokens[i]) + len(tokens_video[i])))
+                    tokens[i] = tokens[i] + tokens_video[i]
+        # Combine the tokens and do padding
+        max_seq_len_in_batch = max(len(token) for token in tokens)
+        if self.pad_to_multiple_of is not None:
+            # Pad the sequence length to the nearest multiple of pad_to_multiple_of
+            max_seq_len_in_batch = ((max_seq_len_in_batch - 1) // self.pad_to_multiple_of + 1) * self.pad_to_multiple_of
+        # Never exceed the configured total sequence length; longer samples are truncated below.
+        pad_to_len = min(max_seq_len_in_batch, self.total_seq_len)
+        for i in range(len(tokens)):
+            if len(tokens[i]) < pad_to_len:
+                tokens[i] = tokens[i] + [self.pad_id] * (pad_to_len - len(tokens[i]))
+            else:
+                tokens[i] = tokens[i][0:pad_to_len]
+        # Convert it to long tensor
+        tokens = torch.LongTensor(tokens)
+        return tokens, token_boundaries

ar_tokenizer_image_text_tokenizer.py ADDED Viewed

	@@ -0,0 +1,318 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, List, Optional, Union
+import numpy as np
+import torch
+import transformers
+from transformers import AutoImageProcessor
+from transformers.image_utils import ImageInput, is_valid_image, load_image
+from .ar_tokenizer_text_tokenizer import TextTokenizer
+from .log import log
+# Configuration for different vision-language models, keyed by model family.
+# ImageTextTokenizer reads one entry in __init__ and uses it in encode():
+#   patch_size        - side length used to turn an image size into a token grid
+#                       (height // patch_size rows of width // patch_size image tokens)
+#   image_token       - placeholder in the text that is expanded into the per-patch tokens
+#   image_break_token - token appended after every row of image tokens
+#   image_end_token   - token that replaces the final row-break token of an image
+IMAGE_CONFIGS = {
+    "pixtral": {
+        "patch_size": 16,
+        "image_token": "[IMG]",
+        "image_break_token": "[IMG_BREAK]",
+        "image_end_token": "[IMG_END]",
+    }
+}
+# Chat template (Jinja) for Pixtral-12B-Instruct.
+# ImageTextTokenizer.__init__ installs it on the tokenizer only when the tokenizer has no chat template of its own.
+# The template accepts an optional leading system message, requires user/assistant turns to alternate,
+# and emits "[IMG]" for every content chunk of type "image".
+PIXTRAL_CHAT_TEMPLATE = '{%- if messages[0]["role"] == "system" %}\n    {%- set system_message = messages[0]["content"] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if (message[\'role\'] == \'user\') != (loop.index0 % 2 == 0) %}\n        {{- raise_exception(\'After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\') }}\n    {%- endif %}\n    {%- if message["role"] == "user" %}\n        {%- if loop.last and system_message is defined %}\n            {{- "[INST]" + system_message + "\n\n" }}\n        {%- else %}\n            {{- "[INST]" }}\n        {%- endif %}\n        {%- if message["content"] is not string %}\n            {%- for chunk in message["content"] %}\n                {%- if chunk["type"] == "text" %}\n                    {{- chunk["content"] }}\n                {%- elif chunk["type"] == "image" %}\n                    {{- "[IMG]" }}\n                {%- else %}\n                    {{- raise_exception("Unrecognized content type!") }}\n                {%- endif %}\n            {%- endfor %}\n        {%- else %}\n            {{- message["content"] }}\n        {%- endif %}\n        {{- "[/INST]" }}\n    {%- elif message["role"] == "assistant" %}\n        {{- message["content"] + eos_token}}\n    {%- else %}\n        {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }}\n    {%- endif %}\n{%- endfor %}'
+# Copied from transformers.models.pixtral.processing_pixtral.is_url
+def is_url(val) -> bool:
+    """Check if the given value is a URL."""
+    return isinstance(val, str) and val.startswith("http")
+# Copied from transformers.models.pixtral.processing_pixtral.is_image_or_image_url
+def is_image_or_image_url(elem):
+    """Check if the given element is an image or an image URL."""
+    return is_url(elem) or is_valid_image(elem)
+def load_image_list(
+    image_list: List[Union[str, "PIL.Image.Image"]], timeout: Optional[float] = None
+) -> List["PIL.Image.Image"]:
+    """
+    Load a list of images.
+    Args:
+        image_list (List[Union[str, PIL.Image.Image]]): The list of images to load.
+        timeout (Optional[float]): The timeout for loading the image.
+    Returns:
+        List[PIL.Image.Image]: The list of loaded images.
+    """
+    return [load_image(image, timeout=timeout) for image in image_list]
+class ImageTextTokenizer(TextTokenizer):
+    """
+    Image-text tokenizer class that extends the text tokenizer to support vision tokens as well.
+    """
+    def __init__(
+        self,
+        model_family: str,
+        is_instruct_model: bool,
+        tokenizer_path: str,
+        image_processor_path: str,
+    ):
+        """
+        Initialize the ImageTextTokenizer.
+        Args:
+            model_family (str): The model family.
+            is_instruct_model (bool): Whether the model is an instruct model.
+            s3_credential_path (str): The path to the s3 credential file. Defaults to "credentials/pbss_dir.secret".
+        Raises:
+            AssertionError: If the model family is not supported or if the transformers version is incompatible.
+        """
+        super().__init__(
+            model_family=model_family,
+            is_instruct_model=is_instruct_model,
+            local_path=tokenizer_path,
+        )
+        assert model_family in ["pixtral"], f"Unsupported model family: {model_family}"
+        if model_family == "pixtral":
+            # Need transformers>=4.45.0
+            assert transformers.__version__ >= "4.45.0", "Pixtral requires transformers>=4.45.0"
+            assert is_instruct_model, "Pixtral requires is_instruct_model=True"
+            if not hasattr(self.tokenizer, "chat_template") or self.tokenizer.chat_template is None:
+                setattr(self.tokenizer, "chat_template", PIXTRAL_CHAT_TEMPLATE)
+                log.debug(f"Pixtral tokenizer chat template set to: {PIXTRAL_CHAT_TEMPLATE}")
+        # Set up image-specific configurations
+        image_config = IMAGE_CONFIGS[model_family]
+        self.patch_size = image_config["patch_size"]
+        self.image_token = image_config["image_token"]
+        self.image_break_token = image_config["image_break_token"]
+        self.image_end_token = image_config["image_end_token"]
+        # Initialize the image processor
+        self.image_processor = AutoImageProcessor.from_pretrained(image_processor_path)
+    def encode(
+        self,
+        text: Union[str, List[str], List[int]],
+        *,  # Enforce keyword-only arguments
+        images: Optional[ImageInput] = None,
+        image_kwargs: Optional[Dict[str, Any]] = None,
+        **text_kwargs,
+    ) -> List[int]:
+        """
+        Process the images and return the tokenized images and text.
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded.
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared.
+            image_kwargs (Optional[Dict[str, Any]]): Additional keyword arguments for image processing.
+            **text_kwargs: Additional keyword arguments for text processing.
+        Returns:
+            A dictionary with the following fields:
+            - **input_ids** -- List of token ids to be fed to a model.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
+            - **pixel_values** -- Pixel values to be fed to a model.
+        Raises:
+            ValueError: If the input images are in an invalid format.
+        """
+        output_dict, image_inputs = {}, {}
+        if images is not None:
+            # Preprocess images
+            if is_image_or_image_url(images):
+                images = [[images]]
+            elif isinstance(images, list) and is_image_or_image_url(images[0]):
+                images = [images]
+            elif (
+                not isinstance(images, list)
+                and not isinstance(images[0], list)
+                and not is_image_or_image_url(images[0][0])
+            ):
+                raise ValueError(
+                    "Invalid input images. Please provide a single image or a list of images or a list of list of images."
+                )
+            # Load and process images
+            images = [load_image_list(sample) for sample in images]
+            image_kwargs = image_kwargs or {}
+            image_inputs = self.image_processor(images, patch_size=self.patch_size, return_tensors="np", **image_kwargs)
+            # Validate image inputs
+            assert "pixel_values" in image_inputs, "pixel_values not found in image_inputs"
+            assert "image_sizes" in image_inputs, "image_sizes not found in image_inputs"
+            assert len(image_inputs.keys()) == 2, "Only one key is allowed in image_inputs, got {}".format(
+                image_inputs.keys()
+            )
+            # Extract pixel values and image sizes
+            pixel_values = image_inputs["pixel_values"][0]
+            image_sizes = image_inputs["image_sizes"][0]
+            unique_sizes = np.unique(image_sizes, axis=0)
+            assert len(unique_sizes) == 1, "All images must have the same size, got {}".format(unique_sizes)
+            # Convert pixel values to PyTorch tensor
+            pixel_values = np.asarray(pixel_values)
+            pixel_values = torch.from_numpy(pixel_values)
+            output_dict["pixel_values"] = pixel_values
+            output_dict["image_sizes"] = image_sizes
+        # Expand image tokens in text
+        if image_inputs.get("pixel_values") is not None:
+            replace_strings = []
+            # Calculate the number of tokens needed for each image and create a placeholder
+            for image_size in image_sizes:
+                height, width = image_size
+                num_height_tokens = height // self.patch_size
+                num_width_tokens = width // self.patch_size
+                replace_tokens = [[self.image_token] * num_width_tokens + [self.image_break_token]] * num_height_tokens
+                # Flatten list
+                replace_tokens = [item for sublist in replace_tokens for item in sublist]
+                replace_tokens[-1] = self.image_end_token
+                replace_str = "".join(replace_tokens)
+                replace_strings.append(replace_str)
+                text = text.replace(self.image_token, "<placeholder>", 1)
+            # Replace placeholders with actual image token sequences
+            while "<placeholder>" in text:
+                replace_str = replace_strings.pop(0)
+                text = text.replace("<placeholder>", replace_str, 1)
+        # Encode the text
+        text_inputs = super(ImageTextTokenizer, self).encode(text, **text_kwargs)
+        output_dict["input_ids"] = text_inputs
+        return output_dict
+    def apply_chat_template(
+        self,
+        conversation: List[Dict[str, Any]] | List[List[Dict[str, Any]]],
+        *,
+        images: Optional[ImageInput] = None,
+        image_kwargs: Optional[Dict[str, Any]] = None,
+        add_generation_prompt: bool = False,
+        tokenize: bool = True,
+        padding: bool = False,
+        truncation: bool = False,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[str] = None,
+        return_dict: bool = True,
+        return_assistant_tokens_mask: bool = False,
+        generation_prefix: str = "",
+        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ):
+        """
+        Apply the chat template to the conversation.
+        Args:
+            conversation (List[Dict[str, Any]] | List[List[Dict[str, Any]]]): The conversation to process.
+            images (Optional[ImageInput]): Images to include in the conversation.
+            image_kwargs (Optional[Dict[str, Any]]): Additional keyword arguments for image processing.
+            add_generation_prompt (bool): Whether to add a generation prompt.
+            tokenize (bool): Whether to tokenize the output.
+            padding (bool): Whether to pad the output.
+            truncation (bool): Whether to truncate the output.
+            max_length (Optional[int]): Maximum length of the output.
+            return_tensors (Optional[str]): The type of tensors to return.
+            return_dict (bool): Whether to return a dictionary.
+            return_assistant_tokens_mask (bool): Whether to return the assistant tokens mask.
+            generation_prefix (str): Prefix to add before asking model to generate. Helpful to guide the generation. Defaults to "".
+            tokenizer_kwargs (Optional[Dict[str, Any]]): Additional keyword arguments for the tokenizer.
+            **kwargs: Additional keyword arguments.
+        Returns:
+            The processed conversation with applied chat template.
+        Raises:
+            AssertionError: If return_dict is False or if the conversation format is invalid.
+        """
+        assert return_dict, "return_dict must be True for ImageTextTokenizer"
+        assert isinstance(conversation, list), "conversation must be a list"
+        if isinstance(conversation[0], list):
+            assert len(conversation) == 1, "Only support single-conversation input, got {}".format(conversation)
+            conversation = conversation[0]
+        # Extract images from the conversation if not provided
+        if images is None:
+            images = []
+            for msg in conversation:
+                if msg.get("images", None) is not None:
+                    images = images + (msg["images"])
+            images = load_image_list(images)
+        # In case the input does not have images, will ignore
+        # Useful in feeding VLM inputs with and without images
+        if isinstance(images, list) and len(images) == 0:
+            images = None
+        # Apply the chat template to the text
+        text = super().apply_chat_template(
+            conversation,
+            tokenize=False,
+            add_generation_prompt=add_generation_prompt,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            return_tensors=return_tensors,
+            return_dict=False,
+            return_assistant_tokens_mask=return_assistant_tokens_mask,
+            generation_prefix=generation_prefix,
+            tokenizer_kwargs=tokenizer_kwargs,
+            **kwargs,
+        )
+        if tokenizer_kwargs is None:
+            tokenizer_kwargs = {}
+        # Encode the text and images
+        output = self.encode(
+            text,
+            images=images,
+            image_kwargs=image_kwargs,
+            tokenize=tokenize,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            add_special_tokens=False,
+            return_tensors=return_tensors,
+            **tokenizer_kwargs,
+        )
+        return output
+    @property
+    def model_input_names(self):
+        """
+        Get the combined model input names from both the text tokenizer and image processor.
+        Returns:
+            List[str]: A list of unique input names.
+        """
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

ar_tokenizer_modules.py ADDED Viewed

	@@ -0,0 +1,560 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""The model definition for 3D layers
+Adapted from: https://github.com/lucidrains/magvit2-pytorch/blob/9f49074179c912736e617d61b32be367eb5f993a/
+magvit2_pytorch/magvit2_pytorch.py#L889
+[MIT License Copyright (c) 2023 Phil Wang]
+https://github.com/lucidrains/magvit2-pytorch/blob/9f49074179c912736e617d61b32be367eb5f993a/LICENSE
+"""
+import math
+from typing import Tuple, Union
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .ar_tokenizer_patching import Patcher3D, UnPatcher3D
+from .ar_tokenizer_utils import (
+    CausalNormalize,
+    batch2space,
+    batch2time,
+    cast_tuple,
+    is_odd,
+    nonlinearity,
+    replication_pad,
+    space2batch,
+    time2batch,
+)
+from .log import log
+class CausalConv3d(nn.Module):
+    def __init__(
+        self,
+        chan_in: int = 1,
+        chan_out: int = 1,
+        kernel_size: Union[int, Tuple[int, int, int]] = 3,
+        pad_mode: str = "constant",
+        **kwargs,
+    ):
+        super().__init__()
+        kernel_size = cast_tuple(kernel_size, 3)
+        time_kernel_size, height_kernel_size, width_kernel_size = kernel_size
+        assert is_odd(height_kernel_size) and is_odd(width_kernel_size)
+        dilation = kwargs.pop("dilation", 1)
+        stride = kwargs.pop("stride", 1)
+        time_stride = kwargs.pop("time_stride", 1)
+        time_dilation = kwargs.pop("time_dilation", 1)
+        padding = kwargs.pop("padding", 1)
+        self.pad_mode = pad_mode
+        time_pad = time_dilation * (time_kernel_size - 1) + (1 - time_stride)
+        self.time_pad = time_pad
+        self.spatial_pad = (padding, padding, padding, padding)
+        stride = (time_stride, stride, stride)
+        dilation = (time_dilation, dilation, dilation)
+        self.conv3d = nn.Conv3d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs)
+    def _replication_pad(self, x: torch.Tensor) -> torch.Tensor:
+        x_prev = x[:, :, :1, ...].repeat(1, 1, self.time_pad, 1, 1)
+        x = torch.cat([x_prev, x], dim=2)
+        padding = self.spatial_pad + (0, 0)
+        return F.pad(x, padding, mode=self.pad_mode, value=0.0)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self._replication_pad(x)
+        return self.conv3d(x)
+class CausalHybridUpsample3d(nn.Module):
+    def __init__(self, in_channels: int, spatial_up: bool = True, temporal_up: bool = True, **ignore_kwargs) -> None:
+        super().__init__()
+        self.conv1 = (
+            CausalConv3d(in_channels, in_channels, kernel_size=(3, 1, 1), stride=1, time_stride=1, padding=0)
+            if temporal_up
+            else nn.Identity()
+        )
+        self.conv2 = (
+            CausalConv3d(in_channels, in_channels, kernel_size=(1, 3, 3), stride=1, time_stride=1, padding=1)
+            if spatial_up
+            else nn.Identity()
+        )
+        self.conv3 = (
+            CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, time_stride=1, padding=0)
+            if spatial_up or temporal_up
+            else nn.Identity()
+        )
+        self.spatial_up = spatial_up
+        self.temporal_up = temporal_up
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if not self.spatial_up and not self.temporal_up:
+            return x
+        # hybrid upsample temporally.
+        if self.temporal_up:
+            time_factor = 1.0 + 1.0 * (x.shape[2] > 1)
+            if isinstance(time_factor, torch.Tensor):
+                time_factor = time_factor.item()
+            x = x.repeat_interleave(int(time_factor), dim=2)
+            x = x[..., int(time_factor - 1) :, :, :]
+            x = self.conv1(x) + x
+        # hybrid upsample spatially.
+        if self.spatial_up:
+            x = x.repeat_interleave(2, dim=3).repeat_interleave(2, dim=4)
+            x = self.conv2(x) + x
+        # final 1x1x1 conv.
+        x = self.conv3(x)
+        return x
+class CausalHybridDownsample3d(nn.Module):
+    def __init__(
+        self, in_channels: int, spatial_down: bool = True, temporal_down: bool = True, **ignore_kwargs
+    ) -> None:
+        super().__init__()
+        self.conv1 = (
+            CausalConv3d(in_channels, in_channels, kernel_size=(1, 3, 3), stride=2, time_stride=1, padding=0)
+            if spatial_down
+            else nn.Identity()
+        )
+        self.conv2 = (
+            CausalConv3d(in_channels, in_channels, kernel_size=(3, 1, 1), stride=1, time_stride=2, padding=0)
+            if temporal_down
+            else nn.Identity()
+        )
+        self.conv3 = (
+            CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, time_stride=1, padding=0)
+            if spatial_down or temporal_down
+            else nn.Identity()
+        )
+        self.spatial_down = spatial_down
+        self.temporal_down = temporal_down
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if not self.spatial_down and not self.temporal_down:
+            return x
+        # hybrid downsample spatially.
+        if self.spatial_down:
+            pad = (0, 1, 0, 1, 0, 0)
+            x = F.pad(x, pad, mode="constant", value=0)
+            x1 = self.conv1(x)
+            x2 = F.avg_pool3d(x, kernel_size=(1, 2, 2), stride=(1, 2, 2))
+            x = x1 + x2
+        # hybrid downsample temporally.
+        if self.temporal_down:
+            x = replication_pad(x)
+            x1 = self.conv2(x)
+            x2 = F.avg_pool3d(x, kernel_size=(2, 1, 1), stride=(2, 1, 1))
+            x = x1 + x2
+        # final 1x1x1 conv.
+        x = self.conv3(x)
+        return x
+class CausalResnetBlockFactorized3d(nn.Module):
+    def __init__(self, *, in_channels: int, out_channels: int = None, dropout: float, num_groups: int) -> None:
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.norm1 = CausalNormalize(in_channels, num_groups=1)
+        self.conv1 = nn.Sequential(
+            CausalConv3d(in_channels, out_channels, kernel_size=(1, 3, 3), stride=1, padding=1),
+            CausalConv3d(out_channels, out_channels, kernel_size=(3, 1, 1), stride=1, padding=0),
+        )
+        self.norm2 = CausalNormalize(out_channels, num_groups=num_groups)
+        self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = nn.Sequential(
+            CausalConv3d(out_channels, out_channels, kernel_size=(1, 3, 3), stride=1, padding=1),
+            CausalConv3d(out_channels, out_channels, kernel_size=(3, 1, 1), stride=1, padding=0),
+        )
+        self.nin_shortcut = (
+            CausalConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+            if in_channels != out_channels
+            else nn.Identity()
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        h = x
+        h = self.norm1(h)
+        h = nonlinearity(h)
+        h = self.conv1(h)
+        h = self.norm2(h)
+        h = nonlinearity(h)
+        h = self.dropout(h)
+        h = self.conv2(h)
+        x = self.nin_shortcut(x)
+        return x + h
+class CausalAttnBlock(nn.Module):
+    def __init__(self, in_channels: int, num_groups: int) -> None:
+        super().__init__()
+        self.norm = CausalNormalize(in_channels, num_groups=num_groups)
+        self.q = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.k = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.v = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.proj_out = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        h_ = x
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        # compute attention
+        q, batch_size = time2batch(q)
+        k, batch_size = time2batch(k)
+        v, batch_size = time2batch(v)
+        b, c, h, w = q.shape
+        q = q.reshape(b, c, h * w)
+        q = q.permute(0, 2, 1)
+        k = k.reshape(b, c, h * w)
+        w_ = torch.bmm(q, k)
+        w_ = w_ * (int(c) ** (-0.5))
+        w_ = F.softmax(w_, dim=2)
+        # attend to values
+        v = v.reshape(b, c, h * w)
+        w_ = w_.permute(0, 2, 1)
+        h_ = torch.bmm(v, w_)
+        h_ = h_.reshape(b, c, h, w)
+        h_ = batch2time(h_, batch_size)
+        h_ = self.proj_out(h_)
+        return x + h_
+class CausalTemporalAttnBlock(nn.Module):
+    def __init__(self, in_channels: int, num_groups: int) -> None:
+        super().__init__()
+        self.norm = CausalNormalize(in_channels, num_groups=num_groups)
+        self.q = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.k = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.v = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.proj_out = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        h_ = x
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        # compute attention
+        q, batch_size, height = space2batch(q)
+        k, _, _ = space2batch(k)
+        v, _, _ = space2batch(v)
+        bhw, c, t = q.shape
+        q = q.permute(0, 2, 1)  # (bhw, t, c)
+        k = k.permute(0, 2, 1)  # (bhw, t, c)
+        v = v.permute(0, 2, 1)  # (bhw, t, c)
+        w_ = torch.bmm(q, k.permute(0, 2, 1))  # (bhw, t, t)
+        w_ = w_ * (int(c) ** (-0.5))
+        # Apply causal mask
+        mask = torch.tril(torch.ones_like(w_))
+        w_ = w_.masked_fill(mask == 0, float("-inf"))
+        w_ = F.softmax(w_, dim=2)
+        # attend to values
+        h_ = torch.bmm(w_, v)  # (bhw, t, c)
+        h_ = h_.permute(0, 2, 1).reshape(bhw, c, t)  # (bhw, c, t)
+        h_ = batch2space(h_, batch_size, height)
+        h_ = self.proj_out(h_)
+        return x + h_
+# Encoder: patchify -> factorized input conv -> per-level residual (and optional attention) blocks with
+# hybrid downsampling between levels -> middle block -> norm, nonlinearity and factorized output conv.
+# Optional settings "patch_size" (default 1) and "patch_method" (default "rearrange") are read from
+# ignore_kwargs; any other extra keyword argument is ignored.
+class EncoderFactorized(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        channels: int,
+        channels_mult: list[int],
+        num_res_blocks: int,
+        attn_resolutions: list[int],
+        dropout: float,
+        resolution: int,
+        z_channels: int,
+        spatial_compression: int,
+        temporal_compression: int,
+        **ignore_kwargs,
+    ) -> None:
+        super().__init__()
+        # One resolution level per entry of channels_mult.
+        self.num_resolutions = len(channels_mult)
+        self.num_res_blocks = num_res_blocks
+        # Patcher.
+        patch_size = ignore_kwargs.get("patch_size", 1)
+        self.patcher3d = Patcher3D(patch_size, ignore_kwargs.get("patch_method", "rearrange"))
+        # Channel count expected after patching: the input channels times patch_size cubed.
+        in_channels = in_channels * patch_size * patch_size * patch_size
+        # calculate the number of downsample operations
+        # (log2 of the requested compression minus log2 of the patch size)
+        self.num_spatial_downs = int(math.log2(spatial_compression)) - int(math.log2(patch_size))
+        assert (
+            self.num_spatial_downs <= self.num_resolutions
+        ), f"Spatially downsample {self.num_resolutions} times at most"
+        self.num_temporal_downs = int(math.log2(temporal_compression)) - int(math.log2(patch_size))
+        assert (
+            self.num_temporal_downs <= self.num_resolutions
+        ), f"Temporally downsample {self.num_resolutions} times at most"
+        # downsampling
+        # Input projection factorized into a (1, 3, 3) conv followed by a (3, 1, 1) conv.
+        self.conv_in = nn.Sequential(
+            CausalConv3d(in_channels, channels, kernel_size=(1, 3, 3), stride=1, padding=1),
+            CausalConv3d(channels, channels, kernel_size=(3, 1, 1), stride=1, padding=0),
+        )
+        curr_res = resolution // patch_size
+        # Input multiplier of level i is the output multiplier of level i - 1 (1 for the first level).
+        in_ch_mult = (1,) + tuple(channels_mult)
+        self.in_ch_mult = in_ch_mult
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = channels * in_ch_mult[i_level]
+            block_out = channels * channels_mult[i_level]
+            for _ in range(self.num_res_blocks):
+                block.append(
+                    CausalResnetBlockFactorized3d(
+                        in_channels=block_in, out_channels=block_out, dropout=dropout, num_groups=1
+                    )
+                )
+                block_in = block_out
+                # One attention pair per residual block when this level's resolution is listed.
+                if curr_res in attn_resolutions:
+                    attn.append(
+                        nn.Sequential(
+                            CausalAttnBlock(block_in, num_groups=1), CausalTemporalAttnBlock(block_in, num_groups=1)
+                        )
+                    )
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            # Every level except the last gets a downsample module; the leading levels downsample for real.
+            if i_level != self.num_resolutions - 1:
+                spatial_down = i_level < self.num_spatial_downs
+                temporal_down = i_level < self.num_temporal_downs
+                down.downsample = CausalHybridDownsample3d(
+                    block_in, spatial_down=spatial_down, temporal_down=temporal_down
+                )
+                # NOTE(review): curr_res is halved even when spatial_down is False for this level -
+                # confirm this is intended for the attn_resolutions lookup.
+                curr_res = curr_res // 2
+            self.down.append(down)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = CausalResnetBlockFactorized3d(
+            in_channels=block_in, out_channels=block_in, dropout=dropout, num_groups=1
+        )
+        self.mid.attn_1 = nn.Sequential(
+            CausalAttnBlock(block_in, num_groups=1), CausalTemporalAttnBlock(block_in, num_groups=1)
+        )
+        self.mid.block_2 = CausalResnetBlockFactorized3d(
+            in_channels=block_in, out_channels=block_in, dropout=dropout, num_groups=1
+        )
+        # end
+        self.norm_out = CausalNormalize(block_in, num_groups=1)
+        # Output projection to z_channels, factorized the same way as conv_in.
+        self.conv_out = nn.Sequential(
+            CausalConv3d(block_in, z_channels, kernel_size=(1, 3, 3), stride=1, padding=1),
+            CausalConv3d(z_channels, z_channels, kernel_size=(3, 1, 1), stride=1, padding=0),
+        )
+    # Run the encoder on x and return the z_channels-channel output of conv_out.
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.patcher3d(x)
+        # downsampling
+        h = self.conv_in(x)
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](h)
+                # attn is either empty or holds one entry per residual block of this level.
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+            if i_level != self.num_resolutions - 1:
+                h = self.down[i_level].downsample(h)
+        # middle
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+class DecoderFactorized(nn.Module):
+    """Decoder that maps a latent tensor back to the output tensor.
+    The forward pass runs `conv_in`, a middle stage (resnet block, attention, resnet block),
+    one stage per entry of `channels_mult` (visited from the last level down to level 0, with an
+    upsample after every level except level 0), an output head, and finally `UnPatcher3D`.
+    Args:
+        out_channels: Output channel count before un-patching; `conv_out` produces
+            `out_channels * patch_size**3` channels.
+        channels: Base channel count; level `i` uses `channels * channels_mult[i]` channels.
+        channels_mult: Per-level channel multipliers. Its length sets the number of levels.
+        num_res_blocks: Each level builds `num_res_blocks + 1` resnet blocks.
+        attn_resolutions: A level gets attention blocks when its tracked `curr_res` is in this list.
+        dropout: Forwarded to every `CausalResnetBlockFactorized3d`.
+        resolution: Used with `patch_size` and the number of levels to compute the starting `curr_res`.
+        z_channels: Input channel count of `conv_in`.
+        spatial_compression: `log2` of this minus `log2(patch_size)` gives the number of spatial upsamples.
+        temporal_compression: `log2` of this minus `log2(patch_size)` gives the number of temporal upsamples.
+        **ignore_kwargs: Only `patch_size` (default 1), `patch_method` (default "rearrange") and
+            `legacy_mode` (default False) are read; any other key is ignored.
+    """
+    def __init__(
+        self,
+        out_channels: int,
+        channels: int,
+        channels_mult: list[int],
+        num_res_blocks: int,
+        attn_resolutions: list[int],
+        dropout: float,
+        resolution: int,
+        z_channels: int,
+        spatial_compression: int,
+        temporal_compression: int,
+        **ignore_kwargs,
+    ):
+        super().__init__()
+        self.num_resolutions = len(channels_mult)
+        self.num_res_blocks = num_res_blocks
+        # UnPatcher.
+        patch_size = ignore_kwargs.get("patch_size", 1)
+        self.unpatcher3d = UnPatcher3D(patch_size, ignore_kwargs.get("patch_method", "rearrange"))
+        # conv_out emits one channel per element of a patch_size^3 patch, for every output channel.
+        out_ch = out_channels * patch_size * patch_size * patch_size
+        # calculate the number of upsample operations
+        self.num_spatial_ups = int(math.log2(spatial_compression)) - int(math.log2(patch_size))
+        assert self.num_spatial_ups <= self.num_resolutions, f"Spatially upsample {self.num_resolutions} times at most"
+        self.num_temporal_ups = int(math.log2(temporal_compression)) - int(math.log2(patch_size))
+        assert (
+            self.num_temporal_ups <= self.num_resolutions
+        ), f"Temporally upsample {self.num_resolutions} times at most"
+        # Start from the channel count and tracked resolution of the last (deepest) level.
+        block_in = channels * channels_mult[self.num_resolutions - 1]
+        curr_res = (resolution // patch_size) // 2 ** (self.num_resolutions - 1)
+        # z_shape is only used by the debug log below within this class.
+        self.z_shape = (1, z_channels, curr_res, curr_res)
+        log.debug("Working with z of shape {} = {} dimensions.".format(self.z_shape, np.prod(self.z_shape)))
+        # z to block_in
+        self.conv_in = nn.Sequential(
+            CausalConv3d(z_channels, block_in, kernel_size=(1, 3, 3), stride=1, padding=1),
+            CausalConv3d(block_in, block_in, kernel_size=(3, 1, 1), stride=1, padding=0),
+        )
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = CausalResnetBlockFactorized3d(
+            in_channels=block_in, out_channels=block_in, dropout=dropout, num_groups=1
+        )
+        self.mid.attn_1 = nn.Sequential(
+            CausalAttnBlock(block_in, num_groups=1), CausalTemporalAttnBlock(block_in, num_groups=1)
+        )
+        self.mid.block_2 = CausalResnetBlockFactorized3d(
+            in_channels=block_in, out_channels=block_in, dropout=dropout, num_groups=1
+        )
+        legacy_mode = ignore_kwargs.get("legacy_mode", False)
+        # upsampling
+        # Levels are built from the deepest one to level 0, in the order forward() visits them.
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = channels * channels_mult[i_level]
+            for _ in range(self.num_res_blocks + 1):
+                block.append(
+                    CausalResnetBlockFactorized3d(
+                        in_channels=block_in, out_channels=block_out, dropout=dropout, num_groups=1
+                    )
+                )
+                block_in = block_out
+                # When attention is enabled for this level, one attention pair follows every resnet block.
+                if curr_res in attn_resolutions:
+                    attn.append(
+                        nn.Sequential(
+                            CausalAttnBlock(block_in, num_groups=1), CausalTemporalAttnBlock(block_in, num_groups=1)
+                        )
+                    )
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:
+                # The layer index for temporal/spatial downsampling performed in the encoder should correspond
+                # to the layer index, in reverse order, where upsampling is performed in the decoder.
+                # If you've a pre-trained model, you can simply finetune.
+                # For example:
+                #   Input tensor = (1, 3, 17, 32, 32)
+                #   Patch size = 4 for 3D wavelet transform
+                #   Compression rate = (8x16x16)
+                #
+                # We expect successive downsampling in the encoder and upsampling in the decoder to be mirrored.
+                # ENCODER: `(...,5,8,8) -> (...,3,4,4) -> (...,3,2,2)`
+                # DECODER: `(...,3,2,2) -> (...,3,4,4) -> (...,5,8,8)`
+                #
+                # if legacy_mode is True, the temporal upsampling is not perfectly mirrored.
+                # ENCODER: `(...,5,8,8) -> (...,3,4,4) -> (...,3,2,2)`
+                # DECODER: `(...,3,2,2) -> (...,5,4,4) -> (...,5,8,8)`
+                #
+                # Most of the CV and DV tokenizers were trained before 09/01/2024 with upsampling that's not mirrored.
+                # Going forward, new CV/DV tokenizers will adopt `legacy_mode=False`, i.e. use mirrored upsampling.
+                # i_level_reverse counts decoder stages starting at 0 for the deepest level.
+                i_level_reverse = self.num_resolutions - i_level - 1
+                if legacy_mode:
+                    # Legacy: the first `num_temporal_ups` decoder stages upsample in time.
+                    temporal_up = i_level_reverse < self.num_temporal_ups
+                else:
+                    # Mirrored: the same number of stages, shifted by one so the deepest stage is skipped.
+                    temporal_up = 0 < i_level_reverse < self.num_temporal_ups + 1
+                # Spatial upsampling accompanies every temporal upsample; additional spatial-only stages
+                # are enabled only when there are more spatial than temporal upsamples.
+                spatial_up = temporal_up or (
+                    i_level_reverse < self.num_spatial_ups and self.num_spatial_ups > self.num_temporal_ups
+                )
+                up.upsample = CausalHybridUpsample3d(block_in, spatial_up=spatial_up, temporal_up=temporal_up)
+                # The tracked resolution doubles on every level transition, independent of `spatial_up`.
+                curr_res = curr_res * 2
+            self.up.insert(0, up)  # prepend to get consistent order
+        # end
+        self.norm_out = CausalNormalize(block_in, num_groups=1)
+        self.conv_out = nn.Sequential(
+            CausalConv3d(block_in, out_ch, kernel_size=(1, 3, 3), stride=1, padding=1),
+            CausalConv3d(out_ch, out_ch, kernel_size=(3, 1, 1), stride=1, padding=0),
+        )
+    def forward(self, z: torch.Tensor) -> torch.Tensor:
+        """Decode the latent `z` and return the un-patched result of `self.unpatcher3d`."""
+        h = self.conv_in(z)
+        # middle block.
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # decoder blocks.
+        # self.up is indexed by level, so walk it from the deepest level down to level 0.
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                h = self.up[i_level].block[i_block](h)
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            # Level 0 has no upsample module (see __init__).
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        h = self.unpatcher3d(h)
+        return h

ar_tokenizer_patching.py ADDED Viewed

	@@ -0,0 +1,279 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""The patcher and unpatcher implementation for 2D and 3D data."""
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+# 1-D filter taps, keyed by patch method. Each patcher/unpatcher registers the entry for its
+# `patch_method` as the `wavelets` buffer. Only the wavelet ("haar") code path reads that buffer;
+# the "rearrange" path goes through einops and never touches it.
+_WAVELETS = {
+    # Both taps equal 1/sqrt(2).
+    "haar": torch.tensor([0.7071067811865476, 0.7071067811865476]),
+    "rearrange": torch.tensor([1.0, 1.0]),
+}
+# Passed as `persistent=` to every `register_buffer` call in this module; with False the
+# buffers are left out of the modules' state_dict.
+_PERSISTENT = False
+class Patcher(torch.nn.Module):
+    """A module to convert image tensors into patches using torch operations.
+    The main difference from `class Patching` is that this module implements
+    all operations using torch, rather than python or numpy, for efficiency purpose.
+    It's bit-wise identical to the Patching module outputs, with the added
+    benefit of being torch.jit scriptable.
+    """
+    def __init__(self, patch_size=1, patch_method="haar"):
+        super().__init__()
+        self.patch_size = patch_size
+        self.patch_method = patch_method
+        self.register_buffer("wavelets", _WAVELETS[patch_method], persistent=_PERSISTENT)
+        self.range = range(int(torch.log2(torch.tensor(self.patch_size)).item()))
+        self.register_buffer("_arange", torch.arange(_WAVELETS[patch_method].shape[0]), persistent=_PERSISTENT)
+        for param in self.parameters():
+            param.requires_grad = False
+    def forward(self, x):
+        if self.patch_method == "haar":
+            return self._haar(x)
+        elif self.patch_method == "rearrange":
+            return self._arrange(x)
+        else:
+            raise ValueError("Unknown patch method: " + self.patch_method)
+    def _dwt(self, x, mode="reflect", rescale=False):
+        dtype = x.dtype
+        h = self.wavelets
+        n = h.shape[0]
+        g = x.shape[1]
+        hl = h.flip(0).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = (h * ((-1) ** self._arange)).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = hh.to(dtype=dtype)
+        hl = hl.to(dtype=dtype)
+        x = F.pad(x, pad=(n - 2, n - 1, n - 2, n - 1), mode=mode).to(dtype)
+        xl = F.conv2d(x, hl.unsqueeze(2), groups=g, stride=(1, 2))
+        xh = F.conv2d(x, hh.unsqueeze(2), groups=g, stride=(1, 2))
+        xll = F.conv2d(xl, hl.unsqueeze(3), groups=g, stride=(2, 1))
+        xlh = F.conv2d(xl, hh.unsqueeze(3), groups=g, stride=(2, 1))
+        xhl = F.conv2d(xh, hl.unsqueeze(3), groups=g, stride=(2, 1))
+        xhh = F.conv2d(xh, hh.unsqueeze(3), groups=g, stride=(2, 1))
+        out = torch.cat([xll, xlh, xhl, xhh], dim=1)
+        if rescale:
+            out = out / 2
+        return out
+    def _haar(self, x):
+        for _ in self.range:
+            x = self._dwt(x, rescale=True)
+        return x
+    def _arrange(self, x):
+        x = rearrange(x, "b c (h p1) (w p2) -> b (c p1 p2) h w", p1=self.patch_size, p2=self.patch_size).contiguous()
+        return x
+class Patcher3D(Patcher):
+    """A 3D discrete wavelet transform for video data, expects 5D tensor, i.e. a batch of videos."""
+    def __init__(self, patch_size=1, patch_method="haar"):
+        super().__init__(patch_method=patch_method, patch_size=patch_size)
+        self.register_buffer(
+            "patch_size_buffer", patch_size * torch.ones([1], dtype=torch.int32), persistent=_PERSISTENT
+        )
+    def _dwt(self, x, mode="reflect", rescale=False):
+        dtype = x.dtype
+        h = self.wavelets
+        n = h.shape[0]
+        g = x.shape[1]
+        hl = h.flip(0).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = (h * ((-1) ** self._arange)).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = hh.to(dtype=dtype)
+        hl = hl.to(dtype=dtype)
+        # Handles temporal axis.
+        x = F.pad(x, pad=(max(0, n - 2), n - 1, n - 2, n - 1, n - 2, n - 1), mode=mode).to(dtype)
+        xl = F.conv3d(x, hl.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1))
+        xh = F.conv3d(x, hh.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1))
+        # Handles spatial axes.
+        xll = F.conv3d(xl, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        xlh = F.conv3d(xl, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        xhl = F.conv3d(xh, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        xhh = F.conv3d(xh, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        xlll = F.conv3d(xll, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xllh = F.conv3d(xll, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xlhl = F.conv3d(xlh, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xlhh = F.conv3d(xlh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhll = F.conv3d(xhl, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhlh = F.conv3d(xhl, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhhl = F.conv3d(xhh, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhhh = F.conv3d(xhh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        out = torch.cat([xlll, xllh, xlhl, xlhh, xhll, xhlh, xhhl, xhhh], dim=1)
+        if rescale:
+            out = out / (2 * torch.sqrt(torch.tensor(2.0)))
+        return out
+    def _haar(self, x):
+        xi, xv = torch.split(x, [1, x.shape[2] - 1], dim=2)
+        x = torch.cat([xi.repeat_interleave(self.patch_size, dim=2), xv], dim=2)
+        for _ in self.range:
+            x = self._dwt(x, rescale=True)
+        return x
+    def _arrange(self, x):
+        xi, xv = torch.split(x, [1, x.shape[2] - 1], dim=2)
+        x = torch.cat([xi.repeat_interleave(self.patch_size, dim=2), xv], dim=2)
+        x = rearrange(
+            x,
+            "b c (t p1) (h p2) (w p3) -> b (c p1 p2 p3) t h w",
+            p1=self.patch_size,
+            p2=self.patch_size,
+            p3=self.patch_size,
+        ).contiguous()
+        return x
+class UnPatcher(torch.nn.Module):
+    """A module to convert patches into image tensorsusing torch operations.
+    The main difference from `class Unpatching` is that this module implements
+    all operations using torch, rather than python or numpy, for efficiency purpose.
+    It's bit-wise identical to the Unpatching module outputs, with the added
+    benefit of being torch.jit scriptable.
+    """
+    def __init__(self, patch_size=1, patch_method="haar"):
+        super().__init__()
+        self.patch_size = patch_size
+        self.patch_method = patch_method
+        self.register_buffer("wavelets", _WAVELETS[patch_method], persistent=_PERSISTENT)
+        self.range = range(int(torch.log2(torch.tensor(self.patch_size)).item()))
+        self.register_buffer("_arange", torch.arange(_WAVELETS[patch_method].shape[0]), persistent=_PERSISTENT)
+        for param in self.parameters():
+            param.requires_grad = False
+    def forward(self, x):
+        if self.patch_method == "haar":
+            return self._ihaar(x)
+        elif self.patch_method == "rearrange":
+            return self._iarrange(x)
+        else:
+            raise ValueError("Unknown patch method: " + self.patch_method)
+    def _idwt(self, x, rescale=False):
+        dtype = x.dtype
+        h = self.wavelets
+        n = h.shape[0]
+        g = x.shape[1] // 4
+        hl = h.flip([0]).reshape(1, 1, -1).repeat([g, 1, 1])
+        hh = (h * ((-1) ** self._arange)).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = hh.to(dtype=dtype)
+        hl = hl.to(dtype=dtype)
+        xll, xlh, xhl, xhh = torch.chunk(x.to(dtype), 4, dim=1)
+        # Inverse transform.
+        yl = torch.nn.functional.conv_transpose2d(xll, hl.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0))
+        yl += torch.nn.functional.conv_transpose2d(xlh, hh.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0))
+        yh = torch.nn.functional.conv_transpose2d(xhl, hl.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0))
+        yh += torch.nn.functional.conv_transpose2d(xhh, hh.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0))
+        y = torch.nn.functional.conv_transpose2d(yl, hl.unsqueeze(2), groups=g, stride=(1, 2), padding=(0, n - 2))
+        y += torch.nn.functional.conv_transpose2d(yh, hh.unsqueeze(2), groups=g, stride=(1, 2), padding=(0, n - 2))
+        if rescale:
+            y = y * 2
+        return y
+    def _ihaar(self, x):
+        for _ in self.range:
+            x = self._idwt(x, rescale=True)
+        return x
+    def _iarrange(self, x):
+        x = rearrange(x, "b (c p1 p2) h w -> b c (h p1) (w p2)", p1=self.patch_size, p2=self.patch_size)
+        return x
+class UnPatcher3D(UnPatcher):
+    """A 3D inverse discrete wavelet transform for video wavelet decompositions."""
+    def __init__(self, patch_size=1, patch_method="haar"):
+        super().__init__(patch_method=patch_method, patch_size=patch_size)
+    def _idwt(self, x, rescale=False):
+        dtype = x.dtype
+        h = self.wavelets
+        g = x.shape[1] // 8  # split into 8 spatio-temporal filtered tesnors.
+        hl = h.flip([0]).reshape(1, 1, -1).repeat([g, 1, 1])
+        hh = (h * ((-1) ** self._arange)).reshape(1, 1, -1).repeat(g, 1, 1)
+        hl = hl.to(dtype=dtype)
+        hh = hh.to(dtype=dtype)
+        xlll, xllh, xlhl, xlhh, xhll, xhlh, xhhl, xhhh = torch.chunk(x, 8, dim=1)
+        # Height height transposed convolutions.
+        xll = F.conv_transpose3d(xlll, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xll += F.conv_transpose3d(xllh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xlh = F.conv_transpose3d(xlhl, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xlh += F.conv_transpose3d(xlhh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhl = F.conv_transpose3d(xhll, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhl += F.conv_transpose3d(xhlh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhh = F.conv_transpose3d(xhhl, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhh += F.conv_transpose3d(xhhh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        # Handles width transposed convolutions.
+        xl = F.conv_transpose3d(xll, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        xl += F.conv_transpose3d(xlh, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        xh = F.conv_transpose3d(xhl, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        xh += F.conv_transpose3d(xhh, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        # Handles time axis transposed convolutions.
+        x = F.conv_transpose3d(xl, hl.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1))
+        x += F.conv_transpose3d(xh, hh.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1))
+        if rescale:
+            x = x * (2 * torch.sqrt(torch.tensor(2.0)))
+        return x
+    def _ihaar(self, x):
+        for _ in self.range:
+            x = self._idwt(x, rescale=True)
+        x = x[:, :, self.patch_size - 1 :, ...]
+        return x
+    def _iarrange(self, x):
+        x = rearrange(
+            x,
+            "b (c p1 p2 p3) t h w -> b c (t p1) (h p2) (w p3)",
+            p1=self.patch_size,
+            p2=self.patch_size,
+            p3=self.patch_size,
+        )
+        x = x[:, :, self.patch_size - 1 :, ...]
+        return x

ar_tokenizer_quantizers.py ADDED Viewed

	@@ -0,0 +1,165 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Quantizers for discrete image and video tokenization."""
+from typing import Optional
+import torch
+import torch.nn as nn
+from einops import rearrange
+from .ar_tokenizer_utils import default, pack_one, round_ste, unpack_one
+class FSQuantizer(nn.Module):
+    """Finite Scalar Quantization: VQ-VAE Made Simple - https://arxiv.org/abs/2309.15505
+    Adapted from: https://github.com/lucidrains/vector-quantize-pytorch/blob/9502a1f447876d53fd37685b226bf28f250dc4a3/
+    vector_quantize_pytorch/finite_scalar_quantization.py
+    [Copyright (c) 2020 Phil Wang]
+    https://github.com/lucidrains/vector-quantize-pytorch/blob/9502a1f447876d53fd37685b226bf28f250dc4a3/LICENSE
+    Args:
+        levels: Number of quantization levels for each code dimension; `len(levels)` is the
+            per-codebook code dimension and the product of the levels is the codebook size.
+        dim: Feature dimension of the input. Defaults to `len(levels) * num_codebooks`; linear
+            input/output projections are created only when it differs from that value.
+        num_codebooks: Number of codebooks.
+        keep_num_codebooks_dim: Whether indices keep a codebook axis. Defaults to
+            `num_codebooks > 1`, and must not be False when `num_codebooks > 1`.
+        scale: Stored on the instance; not read anywhere else in this class.
+        **ignore_kwargs: Only `dtype` (default `torch.float32`) is read; it is the dtype the
+            returned codes are cast to.
+    """
+    def __init__(
+        self,
+        levels: list[int],
+        dim: Optional[int] = None,
+        num_codebooks: int = 1,
+        keep_num_codebooks_dim: Optional[bool] = None,
+        scale: Optional[float] = None,
+        **ignore_kwargs,
+    ):
+        super().__init__()
+        self.dtype = ignore_kwargs.get("dtype", torch.float32)
+        _levels = torch.tensor(levels, dtype=torch.int32)
+        self.register_buffer("_levels", _levels, persistent=False)
+        # Mixed-radix place values: basis[i] is the product of levels[:i].
+        _basis = torch.cumprod(torch.tensor([1] + levels[:-1]), dim=0, dtype=torch.int32)
+        self.register_buffer("_basis", _basis, persistent=False)
+        self.scale = scale
+        codebook_dim = len(levels)
+        self.codebook_dim = codebook_dim
+        effective_codebook_dim = codebook_dim * num_codebooks
+        self.num_codebooks = num_codebooks
+        self.effective_codebook_dim = effective_codebook_dim
+        # `default` comes from ar_tokenizer_utils; presumably returns the first argument unless it is None.
+        keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1)
+        assert not (num_codebooks > 1 and not keep_num_codebooks_dim)
+        self.keep_num_codebooks_dim = keep_num_codebooks_dim
+        self.dim = default(dim, len(_levels) * num_codebooks)
+        has_projections = self.dim != effective_codebook_dim
+        self.project_in = nn.Linear(self.dim, effective_codebook_dim) if has_projections else nn.Identity()
+        self.project_out = nn.Linear(effective_codebook_dim, self.dim) if has_projections else nn.Identity()
+        self.has_projections = has_projections
+        self.codebook_size = self._levels.prod().item()
+        # Table of the code for every index, built without the output projection.
+        implicit_codebook = self.indices_to_codes(torch.arange(self.codebook_size), project_out=False)
+        self.register_buffer("implicit_codebook", implicit_codebook, persistent=False)
+    def bound(self, z: torch.Tensor, eps: float = 1e-3) -> torch.Tensor:
+        """Bound `z`, an array of shape (..., d)."""
+        half_l = (self._levels - 1) * (1 + eps) / 2
+        # Dimensions with an even number of levels get a 0.5 offset.
+        offset = torch.where(self._levels % 2 == 0, 0.5, 0.0)
+        shift = (offset / half_l).atanh()
+        return (z + shift).tanh() * half_l - offset
+    def quantize(self, z: torch.Tensor) -> torch.Tensor:
+        """Quantizes z, returns quantized zhat, same shape as z."""
+        # `round_ste` comes from ar_tokenizer_utils; presumably rounding with a straight-through gradient.
+        quantized = round_ste(self.bound(z))
+        half_width = self._levels // 2  # Renormalize to [-1, 1].
+        return quantized / half_width
+    def _scale_and_shift(self, zhat_normalized: torch.Tensor) -> torch.Tensor:
+        """Map normalized codes to non-centered values: `zhat * half_width + half_width`."""
+        half_width = self._levels // 2
+        return (zhat_normalized * half_width) + half_width
+    def _scale_and_shift_inverse(self, zhat: torch.Tensor) -> torch.Tensor:
+        """Inverse of `_scale_and_shift`: `(zhat - half_width) / half_width`."""
+        half_width = self._levels // 2
+        return (zhat - half_width) / half_width
+    def codes_to_indices(self, zhat: torch.Tensor) -> torch.Tensor:
+        """Converts a `code` to an index in the codebook."""
+        assert zhat.shape[-1] == self.codebook_dim
+        zhat = self._scale_and_shift(zhat).float()
+        # Weighted sum of the per-dimension digits with the mixed-radix basis, cast to int32.
+        return (zhat * self._basis).sum(dim=-1).to(torch.int32)
+    def indices_to_codes(self, indices: torch.Tensor, project_out: bool = True) -> torch.Tensor:
+        """Inverse of `codes_to_indices`."""
+        is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim))
+        indices = rearrange(indices, "... -> ... 1")
+        # Extract each dimension's digit from the mixed-radix index.
+        codes_non_centered = (indices // self._basis) % self._levels
+        codes = self._scale_and_shift_inverse(codes_non_centered)
+        if self.keep_num_codebooks_dim:
+            codes = rearrange(codes, "... c d -> ... (c d)")
+        if project_out:
+            codes = self.project_out(codes)
+        # Move the feature axis to position 1 for image/video shaped indices.
+        if is_img_or_video:
+            codes = rearrange(codes, "b ... d -> b d ...")
+        return codes.to(self.dtype)
+    def forward(self, z: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Quantize `z` and return `(indices, quantized, dummy_loss)`.
+        `quantized` is cast to `self.dtype`; `dummy_loss` is an all-zeros tensor.
+        einstein notation
+        b - batch
+        n - sequence (or flattened spatial dimensions)
+        d - feature dimension (`self.dim` on input, `len(levels)` per codebook after projection)
+        c - number of codebook dim
+        """
+        is_img_or_video = z.ndim >= 4
+        # standardize image or video into (batch, seq, dimension)
+        if is_img_or_video:
+            z = rearrange(z, "b d ... -> b ... d")
+            # `pack_one` / `unpack_one` come from ar_tokenizer_utils; presumably they flatten the
+            # middle axes and later restore them from `ps` -- TODO confirm.
+            z, ps = pack_one(z, "b * d")
+        assert z.shape[-1] == self.dim, f"expected dimension of {self.dim} but found dimension of {z.shape[-1]}"
+        z = self.project_in(z)
+        z = rearrange(z, "b n (c d) -> b n c d", c=self.num_codebooks)
+        codes = self.quantize(z)
+        indices = self.codes_to_indices(codes)
+        codes = rearrange(codes, "b n c d -> b n (c d)")
+        out = self.project_out(codes)
+        # reconstitute image or video dimensions
+        if is_img_or_video:
+            out = unpack_one(out, ps, "b * d")
+            out = rearrange(out, "b ... d -> b d ...")
+            indices = unpack_one(indices, ps, "b * c")
+            dummy_loss = torch.zeros_like(out.mean(dim=[1, 2, 3], keepdim=True))
+        else:
+            dummy_loss = torch.zeros_like(out.mean(dim=[1, 2], keepdim=True)).unsqueeze(1)
+        # Drop the trailing codebook axis from the indices when it is not kept.
+        if not self.keep_num_codebooks_dim:
+            indices = rearrange(indices, "... 1 -> ...")
+        return (indices, out.to(self.dtype), dummy_loss)

ar_tokenizer_text_tokenizer.py ADDED Viewed

	@@ -0,0 +1,317 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, List, Optional, Union
+import numpy as np
+import torch
+from transformers import AutoTokenizer
+from .log import log
+def get_tokenizer_path(model_family: str, is_instruct_model: bool = False):
+    """
+    Get the tokenizer path from the model family and instruct model flag.
+    Args:
+        model_family (str): The model family.
+        is_instruct_model (bool): Whether the model is an instruct model.
+    Returns:
+        str: The tokenizer path in s3.
+    """
+    model_family = model_family.lower()
+    if model_family == "mistral":
+        return "mistralai/Mistral-Nemo-Instruct-2407"
+    else:
+        assert model_family in ["llama3", "llama3.1"]
+        if model_family == "llama3":
+            model_path = "meta-llama/Meta-Llama-3-8B"
+        elif model_family == "llama3.1":
+            model_path = "meta-llama/Llama-3.1-8B"
+        else:
+            raise ValueError(f"Unsupported model family: {model_family}")
+        suffix = "-Instruct" if is_instruct_model else ""
+        model_path = f"{model_path}{suffix}"
+        return model_path
+class TextTokenizer:
+    """
+    Text tokenizer class built on HuggingFace's Fast Tokenizer (Rust based).
+    """
+    def __init__(
+        self,
+        model_family: str,
+        is_instruct_model: bool,
+        local_path: Optional[str] = None,
+    ):
+        """
+        Initialize the TextTokenizer.
+        Args:
+            model_family (str): The model family.
+            is_instruct_model (bool): Whether the model is an instruct model.
+            local_path (Optional[str]): The local path to the tokenizer. If not provided, the tokenizer will be downloaded from the remote path.
+        """
+        if local_path is None:
+            tokenizer_path = get_tokenizer_path(model_family, is_instruct_model)
+        else:
+            tokenizer_path = local_path
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
+        self.stop_tokens = {
+            self.tokenizer.eos_token_id,
+        }
+        self.model_family = model_family
+        self.is_instruct_model = is_instruct_model
+        self.eos_id = self.tokenizer.eos_token_id
+        if self.tokenizer.pad_token is None:
+            if model_family.startswith("llama"):
+                self.pad_id = 128004  # "<|finetune_right_pad_id|>"
+            elif model_family == "mistral":
+                self.pad_id = 10  # "<pad>"
+            elif model_family == "pixtral":
+                self.pad_id = 11  # "<pad>"
+            else:
+                raise ValueError(f"pad_id not defined for model_family {model_family}")
+        else:
+            self.pad_id = self.tokenizer.pad_token_id
+    def tokenize(self, text: str, *, add_special_tokens: bool = False, **kwargs) -> List[str]:
+        """
+        Converts a string into a sequence of tokens, replacing unknown tokens with the `unk_token`.
+        Args:
+            text (`str`):
+                The sequence to be encoded.
+            add_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not to add the special tokens associated with the corresponding model.
+        Returns:
+            `List[str]`: The list of tokens.
+        """
+        return self.tokenizer.tokenize(text, add_special_tokens=add_special_tokens, **kwargs)
+    def encode(
+        self,
+        text: Union[str, List[str], List[int]],
+        *,  # Enforce keyword-only arguments
+        add_special_tokens: bool = True,
+        padding: Union[bool, str] = False,
+        truncation: Union[bool, str] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        return_tensors: Optional[str] = None,
+        **kwargs,
+    ) -> List[int]:
+        """
+        Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.
+        Args:
+            text (`str`, `List[str]` or `List[int]`):
+                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
+                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
+                method).
+            add_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to add special tokens when encoding the sequences. This will use the underlying
+                `PretrainedTokenizerBase.build_inputs_with_special_tokens` function, which defines which tokens are
+                automatically added to the input ids. This is usefull if you want to add `bos` or `eos` tokens
+                automatically.
+            padding (`bool`, `str`, *optional*, defaults to `False`):
+                Activates and controls padding. Accepts the following values:
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, `str`, *optional*, defaults to `False`):
+                Activates and controls truncation. Accepts the following values:
+                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
+                  to the maximum acceptable input length for the model if that argument is not provided. This will
+                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                  sequences (or a batch of pairs) is provided.
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+                  greater than the model maximum admissible input size).
+            max_length (`int`, *optional*):
+                Controls the maximum length to use by one of the truncation/padding parameters.
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
+                is required by one of the truncation/padding parameters. If the model has no specific maximum input
+                length (like XLNet) truncation/padding to a maximum length will be deactivated.
+            stride (`int`, *optional*, defaults to 0):
+                If set to a number along with `max_length`, the overflowing tokens returned when
+                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
+                returned to provide some overlap between truncated and overflowing sequences. The value of this
+                argument defines the number of overlapping tokens.
+            is_split_into_words (`bool`, *optional*, defaults to `False`):
+                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
+                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
+                which it will tokenize. This is useful for NER or token classification.
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated.
+                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+        """
+        return self.tokenizer.encode(
+            text,
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            stride=stride,
+            return_tensors=return_tensors,
+        )
+    def decode(
+        self,
+        token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor"],
+        *,  # Enforce keyword-only arguments
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: bool = None,
+        **kwargs,
+    ) -> str:
+        """
+        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
+        tokens and clean up tokenization spaces.
+        Args:
+            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
+                List of tokenized input ids. Can be obtained using the `__call__` method.
+            skip_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not to remove special tokens in the decoding.
+            clean_up_tokenization_spaces (`bool`, *optional*):
+                Whether or not to clean up the tokenization spaces. If `None`, will default to
+                `self.clean_up_tokenization_spaces`.
+            kwargs (additional keyword arguments, *optional*):
+                Will be passed to the underlying model specific decode method.
+        Returns:
+            `str`: The decoded sentence.
+        """
+        return self.tokenizer.decode(
+            token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+    def apply_chat_template(
+        self,
+        conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]]],
+        *,
+        add_generation_prompt: bool = False,
+        tokenize: bool = True,
+        padding: bool = False,
+        truncation: bool = False,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[str] = None,
+        return_dict: bool = False,
+        return_assistant_tokens_mask: bool = False,
+        generation_prefix: str = "",
+        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ):
+        """
+        Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token
+        ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to determine the format and control tokens to use when converting.
+        More details can be found at https://huggingface.co/docs/transformers/main/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template
+        Args:
+            conversation (Union[List[Dict[str, str]], List[List[Dict[str, str]]]]): A list of dicts
+                with "role" and "content" keys, representing the chat history so far.
+            add_generation_prompt (bool, *optional*):
+                If this is set, a prompt with the token(s) that indicate
+                the start of an assistant message will be appended to the formatted output. This is useful when you want to generate a response from the model.
+                Note that this argument will be passed to the chat template, and so it must be supported in the
+                template for this argument to have any effect.
+            continue_final_message (bool, *optional*):
+                If this is set, the chat will be formatted so that the final
+                message in the chat is open-ended, without any EOS tokens. The model will continue this message
+                rather than starting a new one. This allows you to "prefill" part of
+                the model's response for it. Cannot be used at the same time as `add_generation_prompt`.
+            tokenize (`bool`, defaults to `True`):
+                Whether to tokenize the output. If `False`, the output will be a string.
+            padding (`bool`, defaults to `False`):
+                Whether to pad sequences to the maximum length. Has no effect if tokenize is `False`.
+            truncation (`bool`, defaults to `False`):
+                Whether to truncate sequences at the maximum length. Has no effect if tokenize is `False`.
+            max_length (`int`, *optional*):
+                Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is `False`. If
+                not specified, the tokenizer's `max_length` attribute will be used as a default.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Has no effect if tokenize is `False`. Acceptable
+                values are:
+                - `'tf'`: Return TensorFlow `tf.Tensor` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+            return_dict (`bool`, defaults to `False`):
+                Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`.
+            generation_prefix (str): Prefix to add before asking model to generate. Helpful to guide the generation. Defaults to "".
+            tokenizer_kwargs (`Dict[str: Any]`, *optional*): Additional kwargs to pass to the tokenizer.
+            return_assistant_tokens_mask (`bool`, defaults to `False`):
+                Whether to return a mask of the assistant generated tokens. For tokens generated by the assistant,
+                the mask will contain 1. For user and system tokens, the mask will contain 0.
+                This functionality is only available for chat templates that support it via the `{% generation %}` keyword.
+            **kwargs: Additional kwargs to pass to the template renderer. Will be accessible by the chat template.
+        Returns:
+            `Union[List[int], Dict]`: A list of token ids representing the tokenized chat so far, including control tokens. This
+            output is ready to pass to the model, either directly or via methods like `generate()`. If `return_dict` is
+            set, will return a dict of tokenizer outputs instead.
+        """
+        if not self.is_instruct_model:
+            raise ValueError(
+                "apply_chat_template is only supported for instruct models. You should pass argument is_instruct_model=True to the TextTokenizer constructor."
+            )
+        # Since generation_prefix is added to the text in the end, ensure that the setting is correct
+        if generation_prefix:
+            assert not tokenize, "tokenize must be False when generation_prefix is provided."
+            assert add_generation_prompt, "add_generation_prompt must be set when generation_prefix is provided."
+        formatted_text: Union[str, List[int]] = self.tokenizer.apply_chat_template(
+            conversation,
+            add_generation_prompt=add_generation_prompt,
+            tokenize=tokenize,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            return_tensors=return_tensors,
+            return_dict=return_dict,
+            return_assistant_tokens_mask=return_assistant_tokens_mask,
+            tokenizer_kwargs=tokenizer_kwargs,
+            **kwargs,
+        )
+        if generation_prefix:
+            formatted_text: str = formatted_text + generation_prefix
+            log.debug(
+                f"Adding generation prefix: {generation_prefix} to the formatted text\n"
+                f"Formatted text: {formatted_text}"
+            )
+        return formatted_text

ar_tokenizer_utils.py ADDED Viewed

	@@ -0,0 +1,101 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any
+import torch
+from einops import pack, rearrange, unpack
def time2batch(x: torch.Tensor) -> tuple[torch.Tensor, int]:
    """Fold the time axis of a ``b c t h w`` tensor into the batch axis.

    Returns the ``(b t) c h w`` tensor together with the original batch size, which
    ``batch2time`` needs to undo the fold.
    """
    original_batch = x.shape[0]
    folded = rearrange(x, "b c t h w -> (b t) c h w")
    return folded, original_batch
def batch2time(x: torch.Tensor, batch_size: int) -> torch.Tensor:
    """Split a ``(b t) c h w`` tensor back into ``b c t h w`` using the given batch size."""
    restored = rearrange(x, "(b t) c h w -> b c t h w", b=batch_size)
    return restored
def space2batch(x: torch.Tensor) -> tuple[torch.Tensor, int, int]:
    """Fold both spatial axes of a ``b c t h w`` tensor into the batch axis.

    Returns:
        A 3-tuple ``(folded, batch_size, height)`` where ``folded`` has layout ``(b h w) c t``.
        ``batch_size`` and ``height`` are the values ``batch2space`` needs to undo the fold.
    """
    batch_size, height = x.shape[0], x.shape[-2]
    return rearrange(x, "b c t h w -> (b h w) c t"), batch_size, height
def batch2space(x: torch.Tensor, batch_size: int, height: int) -> torch.Tensor:
    """Split a ``(b h w) c t`` tensor back into ``b c t h w`` from the batch size and height."""
    known_axes = {"b": batch_size, "h": height}
    return rearrange(x, "(b h w) c t -> b c t h w", **known_axes)
def cast_tuple(t: Any, length: int = 1) -> Any:
    """Return ``t`` unchanged when it is a tuple, otherwise a tuple holding ``length`` copies of it."""
    if isinstance(t, tuple):
        return t
    return (t,) * length
def replication_pad(x):
    """Prepend a copy of the first slice along dimension 2, lengthening that dimension by one."""
    leading_slice = x[:, :, :1, ...]
    return torch.cat([leading_slice, x], dim=2)
def divisible_by(num: int, den: int) -> bool:
    """Return True when ``num`` leaves no remainder on division by ``den``."""
    remainder = num % den
    return remainder == 0
def is_odd(n: int) -> bool:
    """Return True when ``n`` is not divisible by two."""
    is_even = divisible_by(n, 2)
    return not is_even
def nonlinearity(x):
    """Return ``x`` multiplied elementwise by ``sigmoid(x)``."""
    gate = torch.sigmoid(x)
    return x * gate
class CausalNormalize(torch.nn.Module):
    """GroupNorm wrapper that normalizes every frame on its own when ``num_groups == 1``."""

    def __init__(self, in_channels, num_groups=1):
        """Create the underlying affine ``GroupNorm`` (eps 1e-6) over ``in_channels`` channels."""
        super().__init__()
        self.norm = torch.nn.GroupNorm(
            num_groups=num_groups,
            num_channels=in_channels,
            eps=1e-6,
            affine=True,
        )
        self.num_groups = num_groups

    def forward(self, x):
        """Normalize ``x``; with a single group the time axis is folded into the batch first."""
        if self.num_groups != 1:
            # Backward-compatibility path: the norm runs over the whole spatio-temporal input.
            # New models should use num_groups=1, otherwise causality is not guaranteed.
            return self.norm(x)
        # Fold time into the batch so each frame is normalized without seeing other frames.
        frames, batch_size = time2batch(x)
        normalized = self.norm(frames)
        return batch2time(normalized, batch_size)
def exists(v):
    """Return True for every value except ``None``."""
    missing = v is None
    return not missing
def default(*args):
    """Return the first argument that is not ``None``; ``None`` when there is no such argument."""
    return next((candidate for candidate in args if exists(candidate)), None)
def pack_one(t, pattern):
    """Pack the single tensor ``t`` with einops ``pack`` and return that call's result unchanged."""
    packed_result = pack([t], pattern)
    return packed_result
def unpack_one(t, ps, pattern):
    """Undo a single-tensor pack: run einops ``unpack`` and return its first element."""
    unpacked = unpack(t, ps, pattern)
    return unpacked[0]
def round_ste(z: torch.Tensor) -> torch.Tensor:
    """Round with straight through gradients.

    The rounding offset is detached, so the forward value is the rounded tensor while the
    gradient flows to ``z`` as if no rounding had happened.
    """
    rounded = z.round()
    offset = (rounded - z).detach()
    return z + offset
def log(t, eps=1e-5):
    """Natural log of ``t`` after clamping it from below at ``eps``."""
    floored = t.clamp(min=eps)
    return floored.log()

ar_transformer.py ADDED Viewed

	@@ -0,0 +1,461 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional
+import torch
+import torch.nn as nn
+from torch.nn.modules.module import _IncompatibleKeys
+from .ar_modules_attention import Attention
+from .ar_modules_embedding import (
+    RotaryPositionEmbeddingPytorchV1,
+    RotaryPositionEmbeddingPytorchV2,
+    SinCosPosEmbAxisTE,
+)
+from .ar_modules_mlp import MLP
+from .ar_modules_normalization import create_norm
+from .checkpoint import process_state_dict, substrings_to_ignore
+from .ar_utils_misc import maybe_convert_to_namespace
+from .log import log
class TransformerBlock(nn.Module):
    """
    One transformer layer: self-attention, an optional cross-attention step and a feed-forward
    network, each applied to a normalized input and added back through a residual connection.
    """

    def __init__(self, layer_id: int, args=None):
        """
        Builds the sub-modules of the block.
        Args:
            layer_id: The index of this block. Together with `insert_cross_attn_every_k_layers` it
                decides whether the block receives a cross-attention layer.
            args: The model hyperparameters. They are passed through `maybe_convert_to_namespace`
                and then read both by key and by attribute.
        """
        super().__init__()
        args = maybe_convert_to_namespace(args)
        attention_args = dict(
            n_heads=args["n_heads"],
            n_kv_heads=args["n_kv_heads"],
            dim=args["dim"],
            context_dim=None,
            max_batch_size=args["max_batch_size"],
            max_seq_len=args["max_seq_len"],
            use_qk_normalization=args["use_qk_normalization"],
            causal_mask=args["causal_mask"],
            head_dim=args["head_dim"],
            # The remaining entries are optional in the config and fall back to these defaults.
            fuse_qkv=getattr(args, "fuse_qkv", False),
            precision=getattr(args, "precision", "bfloat16"),
            attn_type=getattr(args, "attn_type", "self"),
        )
        self.attention = Attention(**attention_args)

        # Cross-attention is only added when enabled, and then only on every k-th layer.
        self.has_cross_attention = False
        self.cross_attention = None
        self.cross_attention_norm = None
        if args["insert_cross_attn"] and layer_id % args["insert_cross_attn_every_k_layers"] == 0:
            self.has_cross_attention = True
            # Same settings as self-attention, except for the context width, QKV fusion and type.
            cross_attention_args = {
                **attention_args,
                "context_dim": args["context_dim"],
                "fuse_qkv": False,
                "attn_type": "cross",
            }
            self.cross_attention = Attention(**cross_attention_args)
            self.cross_attention_norm = create_norm(args["norm_type"], dim=args["dim"], eps=args["norm_eps"])

        self.feed_forward = MLP(dim=args["dim"], hidden_dim=args["ffn_hidden_size"])
        self.layer_id = layer_id
        self.attention_norm = create_norm(args["norm_type"], dim=args["dim"], eps=args["norm_eps"])
        self.ffn_norm = create_norm(args["norm_type"], dim=args["dim"], eps=args["norm_eps"])

    def forward(
        self,
        x: torch.Tensor,
        rope: RotaryPositionEmbeddingPytorchV2,
        input_pos: Optional[torch.Tensor] = None,
        mask: Optional[torch.Tensor] = None,
        context: Optional[torch.Tensor] = None,
        context_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Runs the block on `x`.
        Args:
            x: The input tensor.
            rope: The rotary position embedding object handed on to the attention layers.
            input_pos: The position of the current sequence. Used in inference (with KV cache) only.
            mask: The attention mask tensor for self-attention.
            context (Optional[torch.Tensor]): The context tensor added via cross-attn.
            context_mask (Optional[torch.Tensor]): The context cross-attn mask tensor.
        Returns:
            The output tensor after attention, optional cross-attention and the feed-forward network.
        """
        # Self-attention on the normalized input, added back onto the input.
        self_attn_out = self.attention(self.attention_norm(x), rope=rope, input_pos=input_pos, mask=mask)
        h = x + self_attn_out

        # Cross-attention (only on blocks that have it), again with a residual connection.
        if self.has_cross_attention:
            cross_attn_out = self.cross_attention(
                self.cross_attention_norm(h), rope=rope, input_pos=input_pos, mask=context_mask, context=context
            )
            h = h + cross_attn_out

        # Feed-forward network with residual connection.
        return h + self.feed_forward(self.ffn_norm(h))

    def init_weights(self):
        """
        Re-initializes the norms, the attention layer(s) and the feed-forward network.
        NOTE(review): `self.weight_init_std` is read here but never assigned in this class; it
        presumably has to be set on the instance before this method is called -- confirm.
        """
        self.attention_norm.reset_parameters()
        self.ffn_norm.reset_parameters()
        self.attention.init_weights(self.weight_init_std)
        self.feed_forward.init_weights(self.weight_init_std)
        if not self.has_cross_attention:
            return
        self.cross_attention_norm.reset_parameters()
        self.cross_attention.init_weights(self.weight_init_std)
        # Zero-initializing the cross-attention output projection (`wo`) is currently disabled.
class Transformer(nn.Module):
    """
    The Transformer network consisting of transformer blocks.

    It owns the token embedding, the stack of `TransformerBlock` layers, the final normalization,
    the rotary position embedding, an optional absolute position embedding and the output
    projection onto the vocabulary.
    """

    def __init__(self, params, tokenizer_config=None, init_weights: bool = True):
        """
        Initializes the Transformer module.
        Args:
            params: The model parameters containing hyperparameters. They are normalized with
                `maybe_convert_to_namespace` and stored as `self.params`.
            tokenizer_config: The model tokenizer configuration. When given, its `training_type` is
                forwarded to the position-embedding modules.
            init_weights (bool): Whether to initialize the weights of the transformer following
                TorchTitan's Llama3 initialization scheme. Kept for interface compatibility; this
                constructor does not read it.
        """
        super().__init__()
        # Normalize the config container once (the original note mentions OmegaConf DictConfig
        # inputs) and read every hyperparameter from the converted object.
        self.params = maybe_convert_to_namespace(params)
        self.vocab_size = self.params["vocab_size"]
        self.n_layers = self.params["n_layers"]
        self.precision = getattr(torch, self.params["precision"])
        self.tokenizer_config = tokenizer_config
        self.num_video_frames = self.params["num_video_frames"]
        # Token embeddings
        self.tok_embeddings = self._create_token_embeddings()
        self.rope_config = self._create_rope_config()
        # Transformer layers
        self.layers = nn.ModuleList(
            [TransformerBlock(layer_id, self.params).to(self.precision) for layer_id in range(self.n_layers)]
        )
        # Final layer normalization
        self.norm = create_norm(self.params["norm_type"], dim=self.params["dim"], eps=self.params["norm_eps"]).to(
            self.precision
        )
        # Rotary position embeddings
        if self.params["pytorch_rope_version"] == "v1":
            self.rope = RotaryPositionEmbeddingPytorchV1(**self.rope_config)
        elif self.params["pytorch_rope_version"] == "v2":
            training_type = self.tokenizer_config.training_type if self.tokenizer_config is not None else None
            self.rope = RotaryPositionEmbeddingPytorchV2(
                seq_len=self.params["max_seq_len"], training_type=training_type, **self.rope_config
            )
        else:
            raise ValueError(f"Invalid PyTorch RoPE version: {self.params['pytorch_rope_version']}")
        # Causal mask (lower-triangular, max_seq_len x max_seq_len).
        # NOTE(review): the mask is placed on CUDA unconditionally and is a plain attribute, not a
        # registered buffer, so constructing the model requires a CUDA device.
        self.causal_mask = torch.tril(
            torch.ones(self.params["max_seq_len"], self.params["max_seq_len"], dtype=torch.bool)
        ).cuda()
        # Output projection
        self.output = self._create_output_projection()
        # Record whether the configuration inserts cross-attention layers. Read from the converted
        # params so the flag is also correct when a plain dict was passed in.
        self.has_cross_attention = getattr(self.params, "insert_cross_attn", False)
        # Absolute position embeddings
        if self.params["apply_abs_pos_emb"]:
            self.pos_emb_config = self._create_abs_pos_emb_config()
            self.pos_emb, self.abs_pos_emb = self._initialize_abs_pos_emb()

    def _resolve_latent_shape(self):
        """Return `video_latent_shape` when `rope_dim` is "3D", otherwise `None`."""
        shape_map = {
            "3D": self.params["video_latent_shape"],
            "1D": None,
        }
        return shape_map.get(self.params["rope_dim"], None)

    def _create_rope_config(self) -> Dict:
        """Collect the keyword arguments shared by both rotary position embedding versions."""
        latent_shape = self._resolve_latent_shape()
        head_dim = self.params["head_dim"]
        if head_dim is None:
            # Fall back to an even split of the model width across the attention heads.
            head_dim = self.params["dim"] // self.params["n_heads"]
        return {
            "dim": head_dim,
            "max_position_embeddings": self.params["max_seq_len"],
            "original_max_position_embeddings": self.params["original_seq_len"],
            "rope_theta": self.params["rope_theta"],
            "apply_yarn": self.params["apply_yarn"],
            "scale": self.params["yarn_scale"],
            "beta_fast": self.params["yarn_beta_fast"],
            "beta_slow": self.params["yarn_beta_slow"],
            "rope_dim": self.params["rope_dim"],
            "latent_shape": latent_shape,
            "original_latent_shape": self.params["original_latent_shape"],
            "pad_to_multiple_of": self.params["pad_to_multiple_of"],
        }

    def _create_abs_pos_emb_config(self):
        """Collect the keyword arguments for the absolute position embedding module."""
        return {
            "dim": self.params["dim"],
            "latent_shape": self._resolve_latent_shape(),
            "pad_to_multiple_of": self.params["pad_to_multiple_of"],
        }

    def _create_token_embeddings(self, vocab_size: Optional[int] = None):
        """
        Create token embeddings.
        Args:
            vocab_size (int, optional): Vocabulary size (to override the configured vocab size).
        Returns:
            nn.Module: Token embeddings module, cast to the model precision.
        """
        if vocab_size is None:
            vocab_size = self.params["vocab_size"]
        return nn.Embedding(vocab_size, self.params["dim"]).to(self.precision)

    def _create_output_projection(self, vocab_size: Optional[int] = None):
        """
        Create the output projection layer.
        Args:
            vocab_size (int, optional): Vocabulary size (to override the configured vocab size).
        Returns:
            nn.Linear: Bias-free output projection layer, cast to the model precision.
        """
        if vocab_size is None:
            vocab_size = self.params["vocab_size"]
        return nn.Linear(self.params["dim"], vocab_size, bias=False).to(self.precision)

    def _initialize_abs_pos_emb(self):
        """Build the absolute position embedding module and evaluate it once up front."""
        pos_emb = SinCosPosEmbAxisTE(**self.pos_emb_config)
        training_type = self.tokenizer_config.training_type if self.tokenizer_config is not None else None
        abs_pos_emb = pos_emb.forward(training_type=training_type)
        return pos_emb, abs_pos_emb

    def forward(
        self,
        tokens: Optional[torch.Tensor] = None,
        input_pos: Optional[torch.Tensor] = None,
        token_embeddings: Optional[torch.Tensor] = None,
        context: Optional[torch.Tensor] = None,
        context_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Performs the forward pass of the Transformer module.
        Args:
            tokens (torch.Tensor, optional): The input tensor of token IDs.
            input_pos (Optional[torch.Tensor]): The position of the current sequence. Used in inference with KV cache.
                Required, because the attention mask is looked up from it.
            token_embeddings (torch.Tensor, optional): Precomputed token embeddings. If provided, tokens should be None.
            context (Optional[torch.Tensor]): The context tensor added via cross-attn.
            context_mask (Optional[torch.Tensor]): The context cross-attn mask tensor.
        Returns:
            The output tensor after applying the transformer layers.
        Raises:
            ValueError: If both or neither of `tokens` and `token_embeddings` are given, or if
                `input_pos` is missing.
        """
        # Exactly one input source is allowed. Raise explicitly so the check survives `python -O`.
        if tokens is not None and token_embeddings is not None:
            raise ValueError("Either tokens or token_embeddings should be provided, not both.")
        if tokens is None and token_embeddings is None:
            raise ValueError("One of tokens or token_embeddings must be provided.")
        # Token embeddings
        h = self.tok_embeddings(tokens) if token_embeddings is None else token_embeddings
        # Create attention mask
        mask = self._create_attention_mask(input_pos=input_pos)
        # Prepare layer arguments
        layer_kwargs = self._prepare_layer_kwargs(
            input_pos=input_pos,
            mask=mask,
            context=context,
            context_mask=context_mask,
        )
        # Apply transformer layers. When enabled, the absolute position embedding is added
        # before every layer, not just once at the input.
        for layer in self.layers:
            if self.params["apply_abs_pos_emb"]:
                h = self.apply_abs_pos_emb(h, input_pos=input_pos)
            h = layer(h, **layer_kwargs)
        # Apply final layer normalization
        h = self.norm(h)
        # Output linear projection
        return self.output(h)

    def _create_attention_mask(self, input_pos: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
        """
        Creates an attention mask for the transformer layers.
        Args:
            input_pos[torch.Tensor]: The position of input sequence (used for inference only).
        Returns:
            torch.Tensor: The rows of the precomputed causal mask selected by `input_pos`.
        Raises:
            ValueError: If `input_pos` is None.
        """
        if input_pos is None:
            # Indexing with None would silently add a dimension instead of selecting rows.
            raise ValueError("input_pos must be provided for inference")
        return self.causal_mask[input_pos]

    def _prepare_layer_kwargs(
        self,
        input_pos: Optional[torch.Tensor],
        mask: Optional[torch.Tensor],
        context: Optional[torch.Tensor],
        context_mask: Optional[torch.Tensor],
    ) -> Dict[str, Any]:
        """
        Prepares the keyword arguments for transformer layers.
        Args:
            input_pos (Optional[torch.Tensor]): The position of the current sequence.
            mask (Optional[torch.Tensor]): The attention mask.
            context (Optional[torch.Tensor]): The context tensor added via cross-attn.
            context_mask (Optional[torch.Tensor]): The context cross-attn mask tensor.
        Returns:
            Dict[str, Any]: A dictionary of keyword arguments for the transformer layers.
        """
        if context is not None:
            context = context.to(self.precision)
        # 2-D masks get two leading singleton dimensions so they broadcast in the attention layers.
        if isinstance(mask, torch.Tensor) and mask.ndim == 2:
            mask = mask[None, None, :, :]
        if isinstance(context_mask, torch.Tensor) and context_mask.ndim == 2:
            context_mask = context_mask[None, None, :, :]
        return {
            "mask": mask,
            "context": context,
            "context_mask": context_mask,
            "input_pos": input_pos,
            "rope": self.rope,
        }

    def apply_abs_pos_emb(self, x: torch.Tensor, input_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Applies the absolute position embeddings to the input tensor.
        When `input_pos` is given, only the embedding entries at those positions (along
        dimension 1) are added; otherwise the full embedding is added.
        """
        abs_pos_emb = self.abs_pos_emb
        if input_pos is not None:
            abs_pos_emb = abs_pos_emb[:, input_pos, :]
        return x + abs_pos_emb

    @torch.no_grad()
    def expand_vocab(
        self, new_vocab_size: int, init_method: str = "gaussian", multiple_of=64, expand_output_layer=True
    ):
        """
        Expands the vocabulary of the model to the new size.
        Args:
            new_vocab_size (int): The new vocabulary size.
            init_method (str): The initialization method for new embeddings.
                               Can be "zero" or "gaussian". Default is "gaussian".
            multiple_of (int): The new vocabulary size must be a multiple of this value. Defaults to 64 to fully
                leverage the power of NVIDIA TensorCore (source 1: https://x.com/karpathy/status/1621578354024677377,
                source 2: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc)
            expand_output_layer (bool): Whether to also expand the output layer. Defaults to True.
                When False the existing output layer is left untouched.
        Returns:
            None
        Raises:
            ValueError: If `new_vocab_size` is not larger than the current size or `init_method`
                is unknown. Both are checked before the model is modified.
        """
        if new_vocab_size <= self.vocab_size:
            raise ValueError(
                f"New vocabulary size ({new_vocab_size}) must be " f"larger than current size ({self.vocab_size})"
            )
        # Validate before touching any module so a bad argument cannot leave the model half-resized.
        if init_method not in ["zero", "gaussian"]:
            raise ValueError(f"Unknown initialization method: {init_method}")
        if new_vocab_size % multiple_of != 0:
            log.debug(f"New vocabulary size must be a multiple of {multiple_of}. Obtained {new_vocab_size}.")
            new_vocab_size = (new_vocab_size // multiple_of + 1) * multiple_of
            log.debug(f"Rounded vocabulary size to {new_vocab_size}.")
        # Resize token embeddings, keeping the device and dtype of the old ones
        old_embeddings = self.tok_embeddings
        tensor_kwargs = {"device": old_embeddings.weight.device, "dtype": old_embeddings.weight.dtype}
        self.tok_embeddings = self._create_token_embeddings(vocab_size=new_vocab_size).to(**tensor_kwargs)
        # The default initialization of nn.Embedding is Gaussian, so we don't need to do anything
        # if init_method == "gaussian". Only if init_method == "zero", we need to zero out the new embeddings.
        if init_method == "zero":
            self.tok_embeddings.weight.data[self.vocab_size :].zero_()
        # Copy old embeddings
        log.debug(
            f"old_embeddings: {old_embeddings.weight.data.shape}, new_embeddings: {self.tok_embeddings.weight.data.shape}, vocab_size: {self.vocab_size}"
        )
        self.tok_embeddings.weight.data[: self.vocab_size] = old_embeddings.weight.data
        # Resize output layer (only when requested), again keeping the old device and dtype so the
        # new layer does not end up on a different device from the rest of the model.
        if expand_output_layer:
            old_output = self.output
            output_kwargs = {"device": old_output.weight.device, "dtype": old_output.weight.dtype}
            self.output = self._create_output_projection(vocab_size=new_vocab_size).to(**output_kwargs)
            # New vocabulary rows start at zero; existing rows are copied over.
            self.output.weight.data[self.vocab_size :].zero_()
            self.output.weight.data[: self.vocab_size] = old_output.weight.data
        # Update vocab size
        self.vocab_size = new_vocab_size
        log.debug(f"Expanded vocabulary size to {new_vocab_size}")

    def state_dict(self, *args, **kwargs):
        """
        Return the module state dict after passing it through `process_state_dict`
        (e.g., to remove "_extra_state" keys imposed by TransformerEngine for FP8).
        """
        state_dict = super().state_dict(*args, **kwargs)
        return process_state_dict(state_dict)

    def load_state_dict(self, state_dict: Dict[str, Any], strict: bool = True, assign: bool = False):
        """
        Load a state dict, ignoring missing keys that contain any of `substrings_to_ignore`
        (e.g., "_extra_state" keys imposed by TransformerEngine for FP8).
        Args:
            state_dict: The state dict to load; it is first passed through `process_state_dict`.
            strict: If True, raise when keys are unexpected or missing (after the filtering above).
            assign: Forwarded to `nn.Module.load_state_dict`.
        Returns:
            _IncompatibleKeys: The missing keys (filtered when `strict` is True) and unexpected keys.
        Raises:
            ValueError: In strict mode, if any non-ignored key is missing or any key is unexpected.
        """
        state_dict = process_state_dict(state_dict)
        # Always load non-strictly first so the ignorable keys can be filtered before deciding to fail.
        missing_keys, unexpected_keys = super().load_state_dict(state_dict, strict=False, assign=assign)
        if strict:
            actual_missing_keys = [
                key for key in missing_keys if not any(substring in key for substring in substrings_to_ignore)
            ]
            if len(actual_missing_keys) > 0 or len(unexpected_keys) > 0:
                raise ValueError(f"Missing keys: {actual_missing_keys}\n\nUnexpected keys: {unexpected_keys}")
            missing_keys = actual_missing_keys
        return _IncompatibleKeys(missing_keys, unexpected_keys)

ar_utils_misc.py ADDED Viewed

	@@ -0,0 +1,52 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from omegaconf import DictConfig, OmegaConf
+class CustomSimpleNamespace:
+    """
+    A simple namespace class that supports both attribute-style and dictionary-style access.
+    """
+    def __init__(self, d):
+        self._d = d
+    def __getattr__(self, attr):
+        # Attribute-style access: config.key
+        try:
+            return self._d[attr]
+        except KeyError:
+            raise AttributeError(f"'CustomSimpleNamespace' object has no attribute '{attr}'")
+    def __getitem__(self, key):
+        # Dictionary-style access: config['key']
+        return self._d[key]
+def maybe_convert_to_namespace(config):
+    """
+    This function cast a OmegaConf's DictConfig or a standard dict to CustomSimpleNamespace, which supports both
+    attribute-style and dictionary-style access.
+    Note: We need to convert OmegaConf's DictConfig since it is not compatible with torch.compile.
+    """
+    # If input is OmegaConf's DictConfig, convert to a standard dict
+    if isinstance(config, DictConfig):
+        config = OmegaConf.to_container(config, resolve=True)
+    if isinstance(config, dict):
+        return CustomSimpleNamespace(config)
+    else:
+        return config

attention.py ADDED Viewed

	@@ -0,0 +1,305 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Optional
+import numpy as np
+import torch
+import transformer_engine as te
+from einops import rearrange
+from torch import nn
+from torch.utils.checkpoint import checkpoint
+from transformer_engine.pytorch.attention import DotProductAttention, apply_rotary_pos_emb
+# ---------------------- Feed Forward Network -----------------------
+class FeedForward(nn.Module):
+    """
+    Transformer FFN with optional gating
+    Parameters:
+        d_model (int): Dimensionality of input features.
+        d_ff (int): Dimensionality of the hidden layer.
+        dropout (float, optional): Dropout rate applied after the activation function. Defaults to 0.1.
+        activation (callable, optional): The activation function applied after the first linear layer.
+                                         Defaults to nn.ReLU().
+        is_gated (bool, optional): If set to True, incorporates gating mechanism to the feed-forward layer.
+                                   Defaults to False.
+        bias (bool, optional): If set to True, adds a bias to the linear layers. Defaults to True.
+    Example:
+        >>> ff = FeedForward(d_model=512, d_ff=2048)
+        >>> x = torch.randn(64, 10, 512)  # Example input tensor
+        >>> output = ff(x)
+        >>> print(output.shape)  # Expected shape: (64, 10, 512)
+    """
+    def __init__(
+        self,
+        d_model: int,
+        d_ff: int,
+        dropout: float = 0.1,
+        activation=nn.ReLU(),
+        is_gated: bool = False,
+        bias: bool = False,
+    ) -> None:
+        super().__init__()
+        self.layer1 = nn.Linear(d_model, d_ff, bias=bias)
+        self.layer2 = nn.Linear(d_ff, d_model, bias=bias)
+        self.dropout = nn.Dropout(dropout)
+        self.activation = activation
+        self.is_gated = is_gated
+        if is_gated:
+            self.linear_gate = nn.Linear(d_model, d_ff, bias=False)
+    def forward(self, x: torch.Tensor):
+        g = self.activation(self.layer1(x))
+        if self.is_gated:
+            x = g * self.linear_gate(x)
+        else:
+            x = g
+        assert self.dropout.p == 0.0, "we skip dropout"
+        return self.layer2(x)
+class GPT2FeedForward(FeedForward):
+    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1, bias: bool = False):
+        super().__init__(
+            d_model=d_model,
+            d_ff=d_ff,
+            dropout=dropout,
+            activation=nn.GELU(),
+            is_gated=False,
+            bias=bias,
+        )
+    def forward(self, x: torch.Tensor):
+        assert self.dropout.p == 0.0, "we skip dropout"
+        x = self.layer1(x)
+        def activation_layer2_forward(x):
+            x = self.activation(x)
+            x = self.layer2(x)
+            return x
+        x = checkpoint(activation_layer2_forward, x, use_reentrant=False)
+        return x
+# ---------------------- Normalization Layer -----------------------
+def normalize(x: torch.Tensor, dim: Optional[List[int]] = None, eps: float = 0) -> torch.Tensor:
+    """
+    Normalizes the input tensor along specified dimensions such that the average square norm of elements is adjusted.
+    Args:
+        x (torch.Tensor): The input tensor to normalize.
+        dim (list, optional): The dimensions over which to normalize. If None, normalizes over all dimensions except the first.
+        eps (float, optional): A small constant to ensure numerical stability during division.
+    Returns:
+        torch.Tensor: The normalized tensor.
+    """
+    if dim is None:
+        dim = list(range(1, x.ndim))
+    norm = torch.linalg.vector_norm(x, dim=dim, keepdim=True, dtype=torch.float32)
+    norm = torch.add(eps, norm, alpha=np.sqrt(norm.numel() / x.numel()))
+    return x / norm.to(x.dtype)
+def get_normalization(name: str, channels: int):
+    if name == "I":
+        return nn.Identity()
+    elif name == "R":
+        return te.pytorch.RMSNorm(channels, eps=1e-6)
+    else:
+        raise ValueError(f"Normalization {name} not found")
+class BaseAttentionOp(nn.Module):
+    """Empty nn.Module base class, used as the type annotation for the `attn_op` accepted by `Attention`."""
+    def __init__(self):
+        super().__init__()
+class Attention(nn.Module):
+    """
+    Generalized attention impl.
+    Allowing for both self-attention and cross-attention configurations depending on whether a `context_dim` is provided.
+    If `context_dim` is None, self-attention is assumed.
+    Parameters:
+        query_dim (int): Dimension of each query vector.
+        context_dim (int, optional): Dimension of each context vector. If None, self-attention is assumed.
+        heads (int, optional): Number of attention heads. Defaults to 8.
+        dim_head (int, optional): Dimension of each head. Defaults to 64.
+        dropout (float, optional): Dropout rate applied to the output of the attention block. Defaults to 0.0.
+        attn_op (BaseAttentionOp, optional): Custom attention operation to be used instead of the default.
+        qkv_bias (bool, optional): If True, adds a learnable bias to query, key, and value projections. Defaults to False.
+        out_bias (bool, optional): If True, adds a learnable bias to the output projection. Defaults to False.
+        qkv_norm (str, optional): A three-character string; characters 0, 1 and 2 select the normalization
+                                  (via `get_normalization`) for query, key and value respectively.
+                                  Defaults to "SSI".
+                                  NOTE(review): `get_normalization` in this file only accepts "I" and "R", so the
+                                  default "SSI" raises ValueError at construction; callers must pass e.g. "RRI".
+        qkv_norm_mode (str, optional): A string representing normalization mode for query, key, and value projections.
+                                        Defaults to 'per_head'. Only support 'per_head'.
+        backend (str, optional): Attention backend. Defaults to "transformer_engine". When no `attn_op` is given,
+                                 any other value raises ValueError; the "torch" branch of `cal_attn` is therefore
+                                 only reachable with a caller-supplied `attn_op`.
+        qkv_format (str, optional): Layout string passed to the TransformerEngine op and to the rotary embedding
+                                    helper. Defaults to "bshd". It must contain "s" (used to locate the sequence axis).
+    Examples:
+        >>> attn = Attention(query_dim=128, context_dim=256, heads=4, dim_head=32, dropout=0.1)
+        >>> query = torch.randn(10, 128)  # Batch size of 10
+        >>> context = torch.randn(10, 256)  # Batch size of 10
+        >>> output = attn(query, context)  # Perform the attention operation
+    Note:
+        https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
+    """
+    def __init__(
+        self,
+        query_dim: int,
+        context_dim=None,
+        heads=8,
+        dim_head=64,
+        dropout=0.0,
+        attn_op: Optional[BaseAttentionOp] = None,
+        qkv_bias: bool = False,
+        out_bias: bool = False,
+        qkv_norm: str = "SSI",
+        qkv_norm_mode: str = "per_head",
+        backend: str = "transformer_engine",
+        qkv_format: str = "bshd",
+    ) -> None:
+        super().__init__()
+        self.is_selfattn = context_dim is None  # self attention
+        inner_dim = dim_head * heads
+        # In self-attention the keys/values are projected from the query features.
+        context_dim = query_dim if context_dim is None else context_dim
+        self.heads = heads
+        self.dim_head = dim_head
+        self.qkv_norm_mode = qkv_norm_mode
+        self.qkv_format = qkv_format
+        if self.qkv_norm_mode == "per_head":
+            # Normalization layers act on a single head's channels.
+            norm_dim = dim_head
+        else:
+            raise ValueError(f"Normalization mode {self.qkv_norm_mode} not found, only support 'per_head'")
+        self.backend = backend
+        # Each of to_q/to_k/to_v is Sequential(projection, normalization); cal_qkv calls the two stages
+        # separately (index 0, then index 1) so the per-head reshape can happen in between.
+        self.to_q = nn.Sequential(
+            nn.Linear(query_dim, inner_dim, bias=qkv_bias),
+            get_normalization(qkv_norm[0], norm_dim),
+        )
+        self.to_k = nn.Sequential(
+            nn.Linear(context_dim, inner_dim, bias=qkv_bias),
+            get_normalization(qkv_norm[1], norm_dim),
+        )
+        self.to_v = nn.Sequential(
+            nn.Linear(context_dim, inner_dim, bias=qkv_bias),
+            get_normalization(qkv_norm[2], norm_dim),
+        )
+        self.to_out = nn.Sequential(
+            nn.Linear(inner_dim, query_dim, bias=out_bias),
+            nn.Dropout(dropout),
+        )
+        if attn_op:  # use what is given
+            self.attn_op = attn_op
+        elif self.backend == "transformer_engine":
+            sequence_parallel = False
+            # Default op: TransformerEngine dot-product attention with no mask, no attention dropout,
+            # and as many GQA groups as heads.
+            self.attn_op: BaseAttentionOp = DotProductAttention(
+                self.heads,
+                self.dim_head,
+                num_gqa_groups=self.heads,
+                attention_dropout=0,
+                qkv_format=qkv_format,
+                attn_mask_type="no_mask",
+                tp_size=1,
+                tp_group=None,
+                sequence_parallel=sequence_parallel,
+            )
+        else:
+            raise ValueError(f"Backend {backend} not found")
+    def cal_qkv(
+        self, x, context=None, mask=None, rope_emb=None, **kwargs
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        # Project x/context to per-head q, k, v, normalize them, and (self-attention only) apply rotary
+        # position embeddings to q and k. `mask` and any extra keyword arguments are not used here.
+        # NOTE(review): the string below follows `del kwargs`, so Python does not treat it as this
+        # method's docstring; it is a plain expression statement.
+        del kwargs
+        """
+        self.to_q, self.to_k, self.to_v are nn.Sequential with projection + normalization layers.
+        Before 07/24/2024, these modules normalize across all heads.
+        After 07/24/2024, to support tensor parallelism and follow the common practice in the community,
+        we support to normalize per head.
+        To keep the checkpoint copatibility with the previous code,
+        we keep the nn.Sequential but call the projection and the normalization layers separately.
+        We use a flag `self.qkv_norm_mode` to control the normalization behavior.
+        The default value of `self.qkv_norm_mode` is "per_head", which means we normalize per head.
+        """
+        if self.qkv_norm_mode == "per_head":
+            q = self.to_q[0](x)
+            context = x if context is None else context
+            k = self.to_k[0](context)
+            v = self.to_v[0](context)
+            # Split the trailing feature axis into (heads, dim_head).
+            q, k, v = map(
+                lambda t: rearrange(t, "b ... (n c) -> b ... n c", n=self.heads, c=self.dim_head),
+                (q, k, v),
+            )
+        else:
+            raise ValueError(f"Normalization mode {self.qkv_norm_mode} not found, only support 'per_head'")
+        # Second stage of each Sequential: the per-head normalization.
+        q = self.to_q[1](q)
+        k = self.to_k[1](k)
+        v = self.to_v[1](v)
+        if self.is_selfattn and rope_emb is not None:  # only apply to self-attention!
+            q = apply_rotary_pos_emb(q, rope_emb, tensor_format=self.qkv_format, fused=True)
+            k = apply_rotary_pos_emb(k, rope_emb, tensor_format=self.qkv_format, fused=True)
+        return q, k, v
+    def cal_attn(self, q, k, v, mask=None):
+        # Run the attention op and the output projection. `mask` is only forwarded on the "torch" backend.
+        if self.backend == "transformer_engine":
+            # Locate the sequence axis from the layout string (e.g. index 1 for "bshd").
+            seq_dim = self.qkv_format.index("s")
+            assert (
+                q.shape[seq_dim] > 1 and k.shape[seq_dim] > 1
+            ), "Seqlen must be larger than 1 for TE Attention starting with 1.8 TE version."
+            out = self.attn_op(q, k, v, core_attention_bias_type="no_bias", core_attention_bias=None)  # [B, Mq, H, V]
+            return self.to_out(out)
+        elif self.backend == "torch":
+            out = self.attn_op(q, k, v, mask=mask)  # [B, Mq, H, V]
+            # Merge the head and channel axes back before the output projection.
+            return self.to_out(rearrange(out, " b ... n c -> b ... (n c)"))
+        else:
+            raise ValueError(f"Backend {self.backend} not found")
+    def forward(
+        self,
+        x,
+        context=None,
+        mask=None,
+        rope_emb=None,
+        **kwargs,
+    ):
+        """
+        Args:
+            x (Tensor): The query tensor of shape [B, Mq, K]
+            context (Optional[Tensor]): The key tensor of shape [B, Mk, K] or use x as context [self attention] if None
+            mask: Forwarded to `cal_qkv` (unused there) and `cal_attn` (used only by the "torch" backend).
+            rope_emb: Optional rotary embedding, applied to q and k in self-attention only.
+            **kwargs: Accepted and discarded by `cal_qkv`.
+        """
+        q, k, v = self.cal_qkv(x, context, mask, rope_emb=rope_emb, **kwargs)
+        return self.cal_attn(q, k, v, mask)

base_world_generation_pipeline.py ADDED Viewed

	@@ -0,0 +1,362 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import gc
+import os
+from abc import ABC
+from typing import Any
+import numpy as np
+import torch
+from .t5_text_encoder import CosmosT5TextEncoder
+from .presets import presets as guardrail_presets
+class BaseWorldGenerationPipeline(ABC):
+    def __init__(
+        self,
+        inference_type: str | None = None,
+        checkpoint_dir: str | None = None,
+        checkpoint_name: str | None = None,
+        enable_text_guardrail: bool = False,
+        enable_video_guardrail: bool = False,
+        offload_network: bool = False,
+        offload_tokenizer: bool = False,
+        offload_text_encoder_model: bool = False,
+        offload_guardrail_models: bool = False,
+    ):
+        """Initialize base world generation pipeline.
+        This abstract base class provides core functionality for world generation models including:
+        - Model loading and initialization
+        - Text encoding and embedding
+        - Safety checks and content filtering
+        - Memory management through model offloading
+        Args:
+            inference_type: The type of inference pipeline ("text2world" or "video2world")
+            checkpoint_dir: Root directory containing model checkpoints
+            checkpoint_name: Name of the specific checkpoint file to load
+            enable_text_guardrail: If True, validates input prompts for safety
+            enable_video_guardrail: If True, validates generated videos for safety
+            offload_network: If True, moves main model to CPU after inference
+            offload_tokenizer: If True, moves tokenizer to CPU after use
+            offload_text_encoder_model: If True, moves T5 encoder to CPU after encoding
+            offload_guardrail_models: If True, moves safety models to CPU after checks
+        """
+        self.inference_type = inference_type
+        self.checkpoint_dir = checkpoint_dir
+        self.checkpoint_name = checkpoint_name
+        self.guardrail_dir = "Cosmos-1.0-Guardrail"
+        self.enable_text_guardrail = enable_text_guardrail
+        self.enable_video_guardrail = enable_video_guardrail
+        # Add offloading flags
+        self.offload_network = offload_network
+        self.offload_tokenizer = offload_tokenizer
+        self.offload_text_encoder_model = offload_text_encoder_model
+        self.offload_guardrail_models = offload_guardrail_models
+        # Initialize model instances
+        self.text_guardrail = None
+        self.video_guardrail = None
+        self.text_encoder = None
+        self.model = None
+        self._load_model()
+        if not self.offload_text_encoder_model:
+            self._load_text_encoder_model()
+        if not self.offload_guardrail_models:
+            if self.enable_text_guardrail:
+                self._load_text_guardrail()
+            if self.enable_video_guardrail:
+                self._load_video_guardrail()
+        if not self.offload_network:
+            self._load_network()
+        if not self.offload_tokenizer:
+            self._load_tokenizer()
+    def _load_tokenizer(self):
+        pass
+    def _load_network(self):
+        pass
+    def _load_model(self, checkpoint_name: str) -> Any:
+        """Load the world generation model from a checkpoint.
+        This abstract method must be implemented by subclasses to load their specific
+        model architecture and weights.
+        Args:
+            checkpoint_name: Path to the model checkpoint file
+        Returns:
+            The loaded model instance
+        Raises:
+            NotImplementedError: Must be implemented by subclasses
+        """
+        pass
+    def _load_text_encoder_model(self):
+        """Load the T5 text encoder model.
+        Initializes and loads the T5 encoder model used for converting text prompts
+        into embeddings that condition the world generation model.
+        Returns:
+            Loaded T5 text encoder model instance
+        """
+        self.text_encoder = CosmosT5TextEncoder(cache_dir=self.checkpoint_dir)
+    def _load_text_guardrail(self):
+        """Load text safety classifier models.
+        Initializes models used for checking input prompts against safety policies.
+        Models are loaded from the specified guardrail directory.
+        """
+        self.text_guardrail = guardrail_presets.create_text_guardrail_runner(
+            checkpoint_dir=os.path.join(self.checkpoint_dir, self.guardrail_dir)
+        )
+    def _load_video_guardrail(self):
+        """Load video safety classifier models.
+        Initializes models used for validating generated video content against
+        safety policies. Models are loaded from the specified guardrail directory.
+        """
+        self.video_guardrail = guardrail_presets.create_video_guardrail_runner(
+            checkpoint_dir=os.path.join(self.checkpoint_dir, self.guardrail_dir)
+        )
+    def _offload_network(self):
+        if self.model.model:
+            del self.model.model
+            self.model.model = None
+            gc.collect()
+            torch.cuda.empty_cache()
+    def _offload_tokenizer(self):
+        if self.model.tokenizer:
+            del self.model.tokenizer
+            self.model.tokenizer = None
+            gc.collect()
+            torch.cuda.empty_cache()
+    def _offload_guardrail_models(self):
+        """Offload safety classifier models to reduce memory usage.
+        Moves safety models to CPU and clears GPU memory if they are no longer needed.
+        This helps manage memory when processing multiple inputs sequentially.
+        """
+        if self.text_guardrail:
+            del self.text_guardrail
+            self.text_guardrail = None
+        if self.video_guardrail:
+            del self.video_guardrail
+            self.video_guardrail = None
+        gc.collect()
+        torch.cuda.empty_cache()
+    def _offload_text_encoder_model(self):
+        """Offload T5 text encoder to reduce memory usage.
+        Moves the T5 encoder to CPU and clears GPU memory after text encoding is complete.
+        This helps manage memory when processing multiple inputs sequentially.
+        """
+        if self.text_encoder:
+            del self.text_encoder
+            self.text_encoder = None
+            gc.collect()
+            torch.cuda.empty_cache()
+    def _run_model(self, *args: Any, **kwargs: Any) -> torch.Tensor:
+        """Generate world latents using the model.
+        This abstract method must be implemented by subclasses to define their specific
+        generation process.
+        Args:
+            *args: Variable positional arguments for model inference
+            **kwargs: Variable keyword arguments for model inference
+        Returns:
+            torch.Tensor: Generated world representation tensor
+        """
+        pass
+    def _run_model_with_offload(self, *args: Any, **kwargs: Any) -> torch.Tensor:
+        """Generate world representation with memory management.
+        Handles loading the model before inference and offloading afterward if enabled.
+        This helps minimize GPU memory usage during inference.
+        Args:
+            *args: Arguments passed to _run_model
+            **kwargs: Keyword arguments passed to _run_model
+        Returns:
+            np.ndarray: Generated world representation as numpy array
+        """
+        pass
+    def _run_guardrail_on_prompt(self, prompt: str) -> bool:
+        """Check if prompt meets safety requirements.
+        Validates the input prompt against safety policies using loaded guardrail models.
+        Args:
+            prompt: Raw text prompt to validate
+        Returns:
+            bool: True if prompt passes all safety checks, False otherwise
+        """
+        return guardrail_presets.run_text_guardrail(prompt, self.text_guardrail)
+    def _run_guardrail_on_prompt_with_offload(self, prompt: str) -> bool:
+        """Check prompt safety with memory management.
+        Validates prompt safety while handling model loading/offloading to manage memory.
+        Args:
+            prompt: Raw text prompt to validate
+        Returns:
+            bool: True if prompt passes all safety checks, False otherwise
+        """
+        if self.offload_guardrail_models:
+            self._load_text_guardrail()
+        is_safe = self._run_guardrail_on_prompt(prompt)
+        if self.offload_guardrail_models:
+            self._offload_guardrail_models()
+        return is_safe
+    def _run_guardrail_on_video(self, video: np.ndarray) -> np.ndarray | None:
+        """Check if video meets safety requirements.
+        Validates generated video content against safety policies using guardrail models.
+        Args:
+            video: Video frames to validate
+        Returns:
+            np.ndarray: Processed video if safe, None if unsafe
+        """
+        return guardrail_presets.run_video_guardrail(video, self.video_guardrail)
+    def _run_guardrail_on_video_with_offload(self, video: np.ndarray) -> np.ndarray | None:
+        """Check if generated video meets safety requirements.
+        Args:
+            video: Video frames to validate
+        Returns:
+            np.ndarray: Processed video frames if safe, None otherwise
+        Note:
+            Guardrail models are offloaded after checks if enabled.
+        """
+        if self.offload_guardrail_models:
+            self._load_video_guardrail()
+        video = self._run_guardrail_on_video(video)
+        if self.offload_guardrail_models:
+            self._offload_guardrail_models()
+        return video
+    def _run_text_embedding_on_prompt(
+        self, prompts: list[str], **kwargs: Any
+    ) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
+        """Convert text prompts to embeddings.
+        Processes text prompts into embedding tensors that condition the generation model.
+        Args:
+            prompts: List of text prompts to encode
+            **kwargs: Additional arguments for text encoding
+        Returns:
+            tuple containing:
+                - List of text embedding tensors for each prompt
+                - List of attention masks for each embedding
+        """
+        embeddings = []
+        masks = []
+        for prompt in prompts:
+            embedding, mask = self.text_encoder.encode_prompts(
+                [prompt],
+                **kwargs,
+            )
+            embeddings.append(embedding)
+            masks.append(mask)
+        return embeddings, masks
+    def _run_text_embedding_on_prompt_with_offload(
+        self, prompts: list[str], **kwargs: Any
+    ) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
+        """Convert text prompt into embeddings using T5 encoder.
+        Args:
+            prompt: Processed and validated text prompt
+        Returns:
+            Text embedding tensor to condition diffusion model
+        Note:
+            T5 model is offloaded after encoding if enabled.
+        """
+        if self.offload_text_encoder_model:
+            self._load_text_encoder_model()
+        embeddings, masks = self._run_text_embedding_on_prompt(prompts, **kwargs)
+        if self.offload_text_encoder_model:
+            self._offload_text_encoder_model()
+        return embeddings, masks
+    def _run_tokenizer_decoding(self, samples: torch.Tensor) -> np.ndarray:
+        """Decode model outputs into final world representation.
+        This abstract method must be implemented by subclasses to convert raw model
+        outputs into their specific world representation format.
+        Args:
+            samples: Raw output tensor from the generation model
+        Returns:
+            np.ndarray: Decoded world representation
+        """
+        pass
+    def generate(self, *args: Any, **kwargs: Any):
+        """Generate world representation.
+        This abstract method must be implemented by subclasses to convert raw model
+        outputs into their specific world representation format.
+        Args:
+            *args: Variable positional arguments for model inference
+            **kwargs: Variable keyword arguments for model inference
+        """
+        pass

batch_ops.py ADDED Viewed

	@@ -0,0 +1,46 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Functions for performing operations with broadcasting to the right axis
+#
+# Example
+# input1: tensor of size (N1, N2)
+# input2: tensor of size (N1, N2, N3, N4)
+# batch_mul(input1, input2) = input1[:, :, None, None] * input2
+#
+# If the common dimensions don't match, we raise an assertion error.
+from torch import Tensor
+def common_broadcast(x: Tensor, y: Tensor) -> tuple[Tensor, Tensor]:
+    ndims1 = x.ndim
+    ndims2 = y.ndim
+    common_ndims = min(ndims1, ndims2)
+    for axis in range(common_ndims):
+        assert x.shape[axis] == y.shape[axis], "Dimensions not equal at axis {}".format(axis)
+    if ndims1 < ndims2:
+        x = x.reshape(x.shape + (1,) * (ndims2 - ndims1))
+    elif ndims2 < ndims1:
+        y = y.reshape(y.shape + (1,) * (ndims1 - ndims2))
+    return x, y
+def batch_mul(x: Tensor, y: Tensor) -> Tensor:
+    x, y = common_broadcast(x, y)
+    return x * y

blocklist.py ADDED Viewed

	@@ -0,0 +1,219 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+import re
+import string
+from difflib import SequenceMatcher
+from .log import log
+import nltk
+from better_profanity import profanity
+from .guardrail_blocklist_utils import read_keyword_list_from_dir, to_ascii
+from .guardrail_core import ContentSafetyGuardrail, GuardrailRunner
+from .misc import misc, Color, timer
+# Default location of the blocklist assets; Blocklist reads the "nltk_data", "custom", "whitelist" and
+# "exact_match" sub-directories under the checkpoint directory it is given.
+DEFAULT_CHECKPOINT_DIR = "checkpoints/Cosmos-1.0-Guardrail/blocklist"
+# Marker substituted for censored characters; its presence in a censored prompt signals a blocked prompt.
+# NOTE(review): presumably Color.red wraps "*" in red terminal colour codes — confirm in .misc.
+CENSOR = Color.red("*")
+class Blocklist(ContentSafetyGuardrail):
+    def __init__(
+        self,
+        checkpoint_dir: str = DEFAULT_CHECKPOINT_DIR,
+        guardrail_partial_match_min_chars: int = 4,
+        guardrail_partial_match_letter_count: float = 0.5,
+    ) -> None:
+        nltk.data.path.append(os.path.join(checkpoint_dir, "nltk_data"))
+        self.lemmatizer = nltk.WordNetLemmatizer()
+        self.profanity = profanity
+        self.checkpoint_dir = checkpoint_dir
+        self.guardrail_partial_match_min_chars = guardrail_partial_match_min_chars
+        self.guardrail_partial_match_letter_count = guardrail_partial_match_letter_count
+        # Load blocklist and whitelist keywords
+        self.blocklist_words = read_keyword_list_from_dir(os.path.join(self.checkpoint_dir, "custom"))
+        self.whitelist_words = read_keyword_list_from_dir(os.path.join(self.checkpoint_dir, "whitelist"))
+        self.exact_match_words = read_keyword_list_from_dir(os.path.join(self.checkpoint_dir, "exact_match"))
+        self.profanity.load_censor_words(custom_words=self.blocklist_words, whitelist_words=self.whitelist_words)
+        log.debug(f"Loaded {len(self.blocklist_words)} words/phrases from blocklist")
+        log.debug(f"Whitelisted {len(self.whitelist_words)} words/phrases from whitelist")
+        log.debug(f"Loaded {len(self.exact_match_words)} exact match words/phrases from blocklist")
+    def uncensor_whitelist(self, input_prompt: str, censored_prompt: str) -> str:
+        """Explicitly uncensor words that are in the whitelist."""
+        input_words = input_prompt.split()
+        censored_words = censored_prompt.split()
+        whitelist_words = set(self.whitelist_words)
+        for i, token in enumerate(input_words):
+            if token.strip(string.punctuation).lower() in whitelist_words:
+                censored_words[i] = token
+        censored_prompt = " ".join(censored_words)
+        return censored_prompt
+    def censor_prompt(self, input_prompt: str) -> tuple[bool, str]:
+        """Censor the prompt using the blocklist with better-profanity fuzzy matching.
+        Args:
+            input_prompt: input prompt to censor
+        Returns:
+            bool: True if the prompt is blocked, False otherwise
+            str: A message indicating why the prompt was blocked
+        """
+        censored_prompt = self.profanity.censor(input_prompt, censor_char=CENSOR)
+        # Uncensor whitelisted words that were censored from blocklist fuzzy matching
+        censored_prompt = self.uncensor_whitelist(input_prompt, censored_prompt)
+        if CENSOR in censored_prompt:
+            return True, f"Prompt blocked by censorship: Censored Prompt: {censored_prompt}"
+        return False, ""
+    @staticmethod
+    def check_partial_match(
+        normalized_prompt: str, normalized_word: str, guardrail_partial_match_letter_count: float
+    ) -> tuple[bool, str]:
+        """
+        Check robustly if normalized word and the matching target have a difference of up to guardrail_partial_match_letter_count characters.
+        Args:
+            normalized_prompt: a string with many words
+            normalized_word: a string with one or multiple words, its length is smaller than normalized_prompt
+            guardrail_partial_match_letter_count: maximum allowed difference in characters (float to allow partial characters)
+        Returns:
+            bool: True if a match is found, False otherwise
+            str: A message indicating why the prompt was blocked
+        """
+        prompt_words = normalized_prompt.split()
+        word_length = len(normalized_word.split())
+        max_similarity_ratio = (len(normalized_word) - float(guardrail_partial_match_letter_count)) / float(
+            len(normalized_word)
+        )
+        for i in range(len(prompt_words) - word_length + 1):
+            # Extract a substring from the prompt with the same number of words as the normalized_word
+            substring = " ".join(prompt_words[i : i + word_length])
+            similarity_ratio = SequenceMatcher(None, substring, normalized_word).ratio()
+            if similarity_ratio >= max_similarity_ratio:
+                return (
+                    True,
+                    f"Prompt blocked by partial match blocklist: Prompt: {normalized_prompt}, Partial Match Word: {normalized_word}",
+                )
+        return False, ""
+    @staticmethod
+    def check_against_whole_word_blocklist(
+        prompt: str,
+        blocklist: list[str],
+        guardrail_partial_match_min_chars: int = 4,
+        guardrail_partial_match_letter_count: float = 0.5,
+    ) -> bool:
+        """
+        Check if the prompt contains any whole words from the blocklist.
+        The match is case insensitive and robust to multiple spaces between words.
+        Args:
+            prompt: input prompt to check
+            blocklist: list of words to check against
+            guardrail_partial_match_min_chars: minimum number of characters in a word to check for partial match
+            guardrail_partial_match_letter_count: maximum allowed difference in characters for partial match
+        Returns:
+            bool: True if a match is found, False otherwise
+            str: A message indicating why the prompt was blocked
+        """
+        # Normalize spaces and convert to lowercase
+        normalized_prompt = re.sub(r"\s+", " ", prompt).strip().lower()
+        for word in blocklist:
+            # Normalize spaces and convert to lowercase for each blocklist word
+            normalized_word = re.sub(r"\s+", " ", word).strip().lower()
+            # Use word boundaries to ensure whole word match
+            if re.search(r"\b" + re.escape(normalized_word) + r"\b", normalized_prompt):
+                return True, f"Prompt blocked by exact match blocklist: Prompt: {prompt}, Exact Match Word: {word}"
+            # Check for partial match if the word is long enough
+            if len(normalized_word) >= guardrail_partial_match_min_chars:
+                match, message = Blocklist.check_partial_match(
+                    normalized_prompt, normalized_word, guardrail_partial_match_letter_count
+                )
+                if match:
+                    return True, message
+        return False, ""
+    def is_safe(self, input_prompt: str = "") -> tuple[bool, str]:
+        """Check if the input prompt is safe using the blocklist."""
+        # Check if the input is empty
+        if not input_prompt:
+            return False, "Input is empty"
+        input_prompt = to_ascii(input_prompt)
+        # Check full sentence for censored words
+        censored, message = self.censor_prompt(input_prompt)
+        if censored:
+            return False, message
+        # Check lemmatized words for censored words
+        tokens = nltk.word_tokenize(input_prompt)
+        lemmas = [self.lemmatizer.lemmatize(token) for token in tokens]
+        lemmatized_prompt = " ".join(lemmas)
+        censored, message = self.censor_prompt(lemmatized_prompt)
+        if censored:
+            return False, message
+        # Check for exact match blocklist words
+        censored, message = self.check_against_whole_word_blocklist(
+            input_prompt,
+            self.exact_match_words,
+            self.guardrail_partial_match_min_chars,
+            self.guardrail_partial_match_letter_count,
+        )
+        if censored:
+            return False, message
+        # If all these checks pass, the input is safe
+        return True, "Input is safe"
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--prompt", type=str, required=True, help="Input prompt")
+    parser.add_argument(
+        "--checkpoint_dir",
+        type=str,
+        help="Path to the Blocklist checkpoint folder",
+        default=DEFAULT_CHECKPOINT_DIR,
+    )
+    return parser.parse_args()
+def main(args):
+    blocklist = Blocklist(checkpoint_dir=args.checkpoint_dir)
+    runner = GuardrailRunner(safety_models=[blocklist])
+    with timer("blocklist safety check"):
+        safety, message = runner.run_safety_check(args.prompt)
+    log.info(f"Input is: {'SAFE' if safety else 'UNSAFE'}")
+    log.info(f"Message: {message}") if not safety else None
+# Script entry point: parse the command-line arguments and run the safety check.
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)

blocks.py ADDED Viewed

	@@ -0,0 +1,545 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import Optional
+import numpy as np
+import torch
+from einops import rearrange, repeat
+from einops.layers.torch import Rearrange
+from torch import nn
+from .attention import Attention, GPT2FeedForward
+from .log import log
+def modulate(x, shift, scale):
+    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+class Timesteps(nn.Module):
+    def __init__(self, num_channels):
+        super().__init__()
+        self.num_channels = num_channels
+    def forward(self, timesteps):
+        in_dype = timesteps.dtype
+        half_dim = self.num_channels // 2
+        exponent = -math.log(10000) * torch.arange(half_dim, dtype=torch.float32, device=timesteps.device)
+        exponent = exponent / (half_dim - 0.0)
+        emb = torch.exp(exponent)
+        emb = timesteps[:, None].float() * emb[None, :]
+        sin_emb = torch.sin(emb)
+        cos_emb = torch.cos(emb)
+        emb = torch.cat([cos_emb, sin_emb], dim=-1)
+        return emb.to(in_dype)
+class TimestepEmbedding(nn.Module):
+    def __init__(self, in_features: int, out_features: int, use_adaln_lora: bool = False):
+        super().__init__()
+        log.debug(
+            f"Using AdaLN LoRA Flag:  {use_adaln_lora}. We enable bias if no AdaLN LoRA for backward compatibility."
+        )
+        self.linear_1 = nn.Linear(in_features, out_features, bias=not use_adaln_lora)
+        self.activation = nn.SiLU()
+        self.use_adaln_lora = use_adaln_lora
+        if use_adaln_lora:
+            self.linear_2 = nn.Linear(out_features, 3 * out_features, bias=False)
+        else:
+            self.linear_2 = nn.Linear(out_features, out_features, bias=True)
+    def forward(self, sample: torch.Tensor) -> torch.Tensor:
+        emb = self.linear_1(sample)
+        emb = self.activation(emb)
+        emb = self.linear_2(emb)
+        if self.use_adaln_lora:
+            adaln_lora_B_3D = emb
+            emb_B_D = sample
+        else:
+            emb_B_D = emb
+            adaln_lora_B_3D = None
+        return emb_B_D, adaln_lora_B_3D
+class FourierFeatures(nn.Module):
+    """
+    Implements a layer that generates Fourier features from input tensors, based on randomly sampled
+    frequencies and phases. This can help in learning high-frequency functions in low-dimensional problems.
+    [B] -> [B, D]
+    Parameters:
+        num_channels (int): The number of Fourier features to generate.
+        bandwidth (float, optional): The scaling factor for the frequency of the Fourier features. Defaults to 1.
+        normalize (bool, optional): If set to True, the outputs are scaled by sqrt(2), usually to normalize
+                                    the variance of the features. Defaults to False.
+    Example:
+        >>> layer = FourierFeatures(num_channels=256, bandwidth=0.5, normalize=True)
+        >>> x = torch.randn(10, 256)  # Example input tensor
+        >>> output = layer(x)
+        >>> print(output.shape)  # Expected shape: (10, 256)
+    """
+    def __init__(self, num_channels, bandwidth=1, normalize=False):
+        super().__init__()
+        self.register_buffer("freqs", 2 * np.pi * bandwidth * torch.randn(num_channels), persistent=True)
+        self.register_buffer("phases", 2 * np.pi * torch.rand(num_channels), persistent=True)
+        self.gain = np.sqrt(2) if normalize else 1
+    def forward(self, x, gain: float = 1.0):
+        """
+        Apply the Fourier feature transformation to the input tensor.
+        Args:
+            x (torch.Tensor): The input tensor.
+            gain (float, optional): An additional gain factor applied during the forward pass. Defaults to 1.
+        Returns:
+            torch.Tensor: The transformed tensor, with Fourier features applied.
+        """
+        in_dtype = x.dtype
+        x = x.to(torch.float32).ger(self.freqs.to(torch.float32)).add(self.phases.to(torch.float32))
+        x = x.cos().mul(self.gain * gain).to(in_dtype)
+        return x
+class PatchEmbed(nn.Module):
+    """
+    PatchEmbed is a module for embedding patches from an input tensor by applying either 3D or 2D convolutional layers,
+    depending on the . This module can process inputs with temporal (video) and spatial (image) dimensions,
+    making it suitable for video and image processing tasks. It supports dividing the input into patches
+    and embedding each patch into a vector of size `out_channels`.
+    Parameters:
+    - spatial_patch_size (int): The size of each spatial patch.
+    - temporal_patch_size (int): The size of each temporal patch.
+    - in_channels (int): Number of input channels. Default: 3.
+    - out_channels (int): The dimension of the embedding vector for each patch. Default: 768.
+    - bias (bool): If True, adds a learnable bias to the output of the convolutional layers. Default: True.
+    """
+    def __init__(
+        self,
+        spatial_patch_size,
+        temporal_patch_size,
+        in_channels=3,
+        out_channels=768,
+        bias=True,
+    ):
+        super().__init__()
+        self.spatial_patch_size = spatial_patch_size
+        self.temporal_patch_size = temporal_patch_size
+        self.proj = nn.Sequential(
+            Rearrange(
+                "b c (t r) (h m) (w n) -> b t h w (c r m n)",
+                r=temporal_patch_size,
+                m=spatial_patch_size,
+                n=spatial_patch_size,
+            ),
+            nn.Linear(
+                in_channels * spatial_patch_size * spatial_patch_size * temporal_patch_size, out_channels, bias=bias
+            ),
+        )
+        self.out = nn.Identity()
+    def forward(self, x):
+        """
+        Forward pass of the PatchEmbed module.
+        Parameters:
+        - x (torch.Tensor): The input tensor of shape (B, C, T, H, W) where
+            B is the batch size,
+            C is the number of channels,
+            T is the temporal dimension,
+            H is the height, and
+            W is the width of the input.
+        Returns:
+        - torch.Tensor: The embedded patches as a tensor, with shape b t h w c.
+        """
+        assert x.dim() == 5
+        _, _, T, H, W = x.shape
+        assert H % self.spatial_patch_size == 0 and W % self.spatial_patch_size == 0
+        assert T % self.temporal_patch_size == 0
+        x = self.proj(x)
+        return self.out(x)
+class FinalLayer(nn.Module):
+    """
+    The final layer of video DiT.
+    """
+    def __init__(
+        self,
+        hidden_size,
+        spatial_patch_size,
+        temporal_patch_size,
+        out_channels,
+        use_adaln_lora: bool = False,
+        adaln_lora_dim: int = 256,
+    ):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(
+            hidden_size, spatial_patch_size * spatial_patch_size * temporal_patch_size * out_channels, bias=False
+        )
+        self.hidden_size = hidden_size
+        self.n_adaln_chunks = 2
+        self.use_adaln_lora = use_adaln_lora
+        if use_adaln_lora:
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(hidden_size, adaln_lora_dim, bias=False),
+                nn.Linear(adaln_lora_dim, self.n_adaln_chunks * hidden_size, bias=False),
+            )
+        else:
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(), nn.Linear(hidden_size, self.n_adaln_chunks * hidden_size, bias=False)
+            )
+    def forward(
+        self,
+        x_BT_HW_D,
+        emb_B_D,
+        adaln_lora_B_3D: Optional[torch.Tensor] = None,
+    ):
+        if self.use_adaln_lora:
+            assert adaln_lora_B_3D is not None
+            shift_B_D, scale_B_D = (self.adaLN_modulation(emb_B_D) + adaln_lora_B_3D[:, : 2 * self.hidden_size]).chunk(
+                2, dim=1
+            )
+        else:
+            shift_B_D, scale_B_D = self.adaLN_modulation(emb_B_D).chunk(2, dim=1)
+        B = emb_B_D.shape[0]
+        T = x_BT_HW_D.shape[0] // B
+        shift_BT_D, scale_BT_D = repeat(shift_B_D, "b d -> (b t) d", t=T), repeat(scale_B_D, "b d -> (b t) d", t=T)
+        x_BT_HW_D = modulate(self.norm_final(x_BT_HW_D), shift_BT_D, scale_BT_D)
+        x_BT_HW_D = self.linear(x_BT_HW_D)
+        return x_BT_HW_D
+class VideoAttn(nn.Module):
+    """
+    Implements video attention with optional cross-attention capabilities.
+    This module processes video features while maintaining their spatio-temporal structure. It can perform
+    self-attention within the video features or cross-attention with external context features.
+    Parameters:
+        x_dim (int): Dimension of input feature vectors
+        context_dim (Optional[int]): Dimension of context features for cross-attention. None for self-attention
+        num_heads (int): Number of attention heads
+        bias (bool): Whether to include bias in attention projections. Default: False
+        qkv_norm_mode (str): Normalization mode for query/key/value projections. Must be "per_head". Default: "per_head"
+        x_format (str): Format of input tensor. Must be "BTHWD". Default: "BTHWD"
+    Input shape:
+        - x: (T, H, W, B, D) video features
+        - context (optional): (M, B, D) context features for cross-attention
+        where:
+            T: temporal dimension
+            H: height
+            W: width
+            B: batch size
+            D: feature dimension
+            M: context sequence length
+    """
+    def __init__(
+        self,
+        x_dim: int,
+        context_dim: Optional[int],
+        num_heads: int,
+        bias: bool = False,
+        qkv_norm_mode: str = "per_head",
+        x_format: str = "BTHWD",
+    ) -> None:
+        super().__init__()
+        self.x_format = x_format
+        self.attn = Attention(
+            x_dim,
+            context_dim,
+            num_heads,
+            x_dim // num_heads,
+            qkv_bias=bias,
+            qkv_norm="RRI",
+            out_bias=bias,
+            qkv_norm_mode=qkv_norm_mode,
+            qkv_format="sbhd",
+        )
+    def forward(
+        self,
+        x: torch.Tensor,
+        context: Optional[torch.Tensor] = None,
+        crossattn_mask: Optional[torch.Tensor] = None,
+        rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Forward pass for video attention.
+        Args:
+            x (Tensor): Input tensor of shape (B, T, H, W, D) or (T, H, W, B, D) representing batches of video data.
+            context (Tensor): Context tensor of shape (B, M, D) or (M, B, D),
+            where M is the sequence length of the context.
+            crossattn_mask (Optional[Tensor]): An optional mask for cross-attention mechanisms.
+            rope_emb_L_1_1_D (Optional[Tensor]):
+            Rotary positional embedding tensor of shape (L, 1, 1, D). L == THW for current video training.
+        Returns:
+            Tensor: The output tensor with applied attention, maintaining the input shape.
+        """
+        x_T_H_W_B_D = x
+        context_M_B_D = context
+        T, H, W, B, D = x_T_H_W_B_D.shape
+        x_THW_B_D = rearrange(x_T_H_W_B_D, "t h w b d -> (t h w) b d")
+        x_THW_B_D = self.attn(
+            x_THW_B_D,
+            context_M_B_D,
+            crossattn_mask,
+            rope_emb=rope_emb_L_1_1_D,
+        )
+        x_T_H_W_B_D = rearrange(x_THW_B_D, "(t h w) b d -> t h w b d", h=H, w=W)
+        return x_T_H_W_B_D
+def adaln_norm_state(norm_state, x, scale, shift):
+    normalized = norm_state(x)
+    return normalized * (1 + scale) + shift
+class DITBuildingBlock(nn.Module):
+    """
+    A building block for the DiT (Diffusion Transformer) architecture that supports different types of
+    attention and MLP operations with adaptive layer normalization.
+    Parameters:
+        block_type (str): Type of block - one of:
+            - "cross_attn"/"ca": Cross-attention
+            - "full_attn"/"fa": Full self-attention
+            - "mlp"/"ff": MLP/feedforward block
+        x_dim (int): Dimension of input features
+        context_dim (Optional[int]): Dimension of context features for cross-attention
+        num_heads (int): Number of attention heads
+        mlp_ratio (float): MLP hidden dimension multiplier. Default: 4.0
+        bias (bool): Whether to use bias in layers. Default: False
+        mlp_dropout (float): Dropout rate for MLP. Default: 0.0
+        qkv_norm_mode (str): QKV normalization mode. Default: "per_head"
+        x_format (str): Input tensor format. Default: "BTHWD"
+        use_adaln_lora (bool): Whether to use AdaLN-LoRA. Default: False
+        adaln_lora_dim (int): Dimension for AdaLN-LoRA. Default: 256
+    """
+    def __init__(
+        self,
+        block_type: str,
+        x_dim: int,
+        context_dim: Optional[int],
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        bias: bool = False,
+        mlp_dropout: float = 0.0,
+        qkv_norm_mode: str = "per_head",
+        x_format: str = "BTHWD",
+        use_adaln_lora: bool = False,
+        adaln_lora_dim: int = 256,
+    ) -> None:
+        block_type = block_type.lower()
+        super().__init__()
+        self.x_format = x_format
+        if block_type in ["cross_attn", "ca"]:
+            self.block = VideoAttn(
+                x_dim,
+                context_dim,
+                num_heads,
+                bias=bias,
+                qkv_norm_mode=qkv_norm_mode,
+                x_format=self.x_format,
+            )
+        elif block_type in ["full_attn", "fa"]:
+            self.block = VideoAttn(
+                x_dim, None, num_heads, bias=bias, qkv_norm_mode=qkv_norm_mode, x_format=self.x_format
+            )
+        elif block_type in ["mlp", "ff"]:
+            self.block = GPT2FeedForward(x_dim, int(x_dim * mlp_ratio), dropout=mlp_dropout, bias=bias)
+        else:
+            raise ValueError(f"Unknown block type: {block_type}")
+        self.block_type = block_type
+        self.use_adaln_lora = use_adaln_lora
+        self.norm_state = nn.LayerNorm(x_dim, elementwise_affine=False, eps=1e-6)
+        self.n_adaln_chunks = 3
+        if use_adaln_lora:
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                nn.Linear(x_dim, adaln_lora_dim, bias=False),
+                nn.Linear(adaln_lora_dim, self.n_adaln_chunks * x_dim, bias=False),
+            )
+        else:
+            self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(x_dim, self.n_adaln_chunks * x_dim, bias=False))
+    def forward(
+        self,
+        x: torch.Tensor,
+        emb_B_D: torch.Tensor,
+        crossattn_emb: torch.Tensor,
+        crossattn_mask: Optional[torch.Tensor] = None,
+        rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
+        adaln_lora_B_3D: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Forward pass for dynamically configured blocks with adaptive normalization.
+        Args:
+            x (Tensor): Input tensor of shape (B, T, H, W, D) or (T, H, W, B, D).
+            emb_B_D (Tensor): Embedding tensor for adaptive layer normalization modulation.
+            crossattn_emb (Tensor): Tensor for cross-attention blocks.
+            crossattn_mask (Optional[Tensor]): Optional mask for cross-attention.
+            rope_emb_L_1_1_D (Optional[Tensor]):
+            Rotary positional embedding tensor of shape (L, 1, 1, D). L == THW for current video training.
+        Returns:
+            Tensor: The output tensor after processing through the configured block and adaptive normalization.
+        """
+        if self.use_adaln_lora:
+            shift_B_D, scale_B_D, gate_B_D = (self.adaLN_modulation(emb_B_D) + adaln_lora_B_3D).chunk(
+                self.n_adaln_chunks, dim=1
+            )
+        else:
+            shift_B_D, scale_B_D, gate_B_D = self.adaLN_modulation(emb_B_D).chunk(self.n_adaln_chunks, dim=1)
+        shift_1_1_1_B_D, scale_1_1_1_B_D, gate_1_1_1_B_D = (
+            shift_B_D.unsqueeze(0).unsqueeze(0).unsqueeze(0),
+            scale_B_D.unsqueeze(0).unsqueeze(0).unsqueeze(0),
+            gate_B_D.unsqueeze(0).unsqueeze(0).unsqueeze(0),
+        )
+        if self.block_type in ["mlp", "ff"]:
+            x = x + gate_1_1_1_B_D * self.block(
+                adaln_norm_state(self.norm_state, x, scale_1_1_1_B_D, shift_1_1_1_B_D),
+            )
+        elif self.block_type in ["full_attn", "fa"]:
+            x = x + gate_1_1_1_B_D * self.block(
+                adaln_norm_state(self.norm_state, x, scale_1_1_1_B_D, shift_1_1_1_B_D),
+                context=None,
+                rope_emb_L_1_1_D=rope_emb_L_1_1_D,
+            )
+        elif self.block_type in ["cross_attn", "ca"]:
+            x = x + gate_1_1_1_B_D * self.block(
+                adaln_norm_state(self.norm_state, x, scale_1_1_1_B_D, shift_1_1_1_B_D),
+                context=crossattn_emb,
+                crossattn_mask=crossattn_mask,
+                rope_emb_L_1_1_D=rope_emb_L_1_1_D,
+            )
+        else:
+            raise ValueError(f"Unknown block type: {self.block_type}")
+        return x
+class GeneralDITTransformerBlock(nn.Module):
+    """
+    A wrapper module that manages a sequence of DITBuildingBlocks to form a complete transformer layer.
+    Each block in the sequence is specified by a block configuration string.
+    Parameters:
+        x_dim (int): Dimension of input features
+        context_dim (int): Dimension of context features for cross-attention blocks
+        num_heads (int): Number of attention heads
+        block_config (str): String specifying block sequence (e.g. "ca-fa-mlp" for cross-attention,
+                          full-attention, then MLP)
+        mlp_ratio (float): MLP hidden dimension multiplier. Default: 4.0
+        x_format (str): Input tensor format. Default: "BTHWD"
+        use_adaln_lora (bool): Whether to use AdaLN-LoRA. Default: False
+        adaln_lora_dim (int): Dimension for AdaLN-LoRA. Default: 256
+    The block_config string uses "-" to separate block types:
+        - "ca"/"cross_attn": Cross-attention block
+        - "fa"/"full_attn": Full self-attention block
+        - "mlp"/"ff": MLP/feedforward block
+    Example:
+        block_config = "ca-fa-mlp" creates a sequence of:
+        1. Cross-attention block
+        2. Full self-attention block
+        3. MLP block
+    """
+    def __init__(
+        self,
+        x_dim: int,
+        context_dim: int,
+        num_heads: int,
+        block_config: str,
+        mlp_ratio: float = 4.0,
+        x_format: str = "BTHWD",
+        use_adaln_lora: bool = False,
+        adaln_lora_dim: int = 256,
+    ):
+        super().__init__()
+        self.blocks = nn.ModuleList()
+        self.x_format = x_format
+        for block_type in block_config.split("-"):
+            self.blocks.append(
+                DITBuildingBlock(
+                    block_type,
+                    x_dim,
+                    context_dim,
+                    num_heads,
+                    mlp_ratio,
+                    x_format=self.x_format,
+                    use_adaln_lora=use_adaln_lora,
+                    adaln_lora_dim=adaln_lora_dim,
+                )
+            )
+    def forward(
+        self,
+        x: torch.Tensor,
+        emb_B_D: torch.Tensor,
+        crossattn_emb: torch.Tensor,
+        crossattn_mask: Optional[torch.Tensor] = None,
+        rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
+        adaln_lora_B_3D: Optional[torch.Tensor] = None,
+        extra_per_block_pos_emb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if extra_per_block_pos_emb is not None:
+            x = x + extra_per_block_pos_emb
+        for block in self.blocks:
+            x = block(
+                x,
+                emb_B_D,
+                crossattn_emb,
+                crossattn_mask,
+                rope_emb_L_1_1_D=rope_emb_L_1_1_D,
+                adaln_lora_B_3D=adaln_lora_B_3D,
+            )
+        return x

blur_utils.py ADDED Viewed

	@@ -0,0 +1,35 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import cv2
+import numpy as np
+def pixelate_face(face_img: np.ndarray, blocks: int = 5) -> np.ndarray:
+    """
+    Pixelate a face region by reducing resolution and then upscaling.
+    Args:
+        face_img: Face region to pixelate
+        blocks: Number of blocks to divide the face into (in each dimension)
+    Returns:
+        Pixelated face region
+    """
+    h, w = face_img.shape[:2]
+    # Shrink the image and scale back up to create pixelation effect
+    temp = cv2.resize(face_img, (blocks, blocks), interpolation=cv2.INTER_LINEAR)
+    pixelated = cv2.resize(temp, (w, h), interpolation=cv2.INTER_NEAREST)
+    return pixelated

categories.py ADDED Viewed

	@@ -0,0 +1,192 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Violence category (O1): category name, block-reason message, and the policy instruction text.
+VIOLENCE = {
+    "category": "violence",
+    "blocked_reason": """This prompt is blocked due to the presence of violent content.""",
+    "instruction": """O1: Violence.
+Should not
+- Help people plan or engage in violence.
+- Encourage people to engage in violence.
+- Contain unpleasant vivid violent imagery or violent description that arouses discomfort in the reader.
+Can
+- Provide information on violence.
+- Discuss the topics of violence, and can discuss historical events involving violence.
+""",
+}
+# Sexual content category (O2): category name, block-reason message, and the policy instruction text.
+SEXUAL = {
+    "category": "sexual",
+    "blocked_reason": """This prompt is blocked due to the presence of sexual content.""",
+    "instruction": """O2: Sexual.
+Should not
+- Engage in sexually explicit (i.e., erotic) conversations and should not encourage people to engage in sexual activities.
+- Engage or encourage sexual activity that is considered illegal.
+Can
+- Discuss topics related to sexuality and sexual education.
+- Provide general information regarding sexual activities.
+- Provide guidance on topics related to sexual health.
+""",
+}
+# Criminal planning/confessions category (O3): category name, block-reason message, and the policy instruction text.
+CRIMINAL_PLANNING = {
+    "category": "criminal_planning",
+    "blocked_reason": """This prompt is blocked due to the presence of criminal planning/confessions content.""",
+    "instruction": """O3: Criminal Planning/Confessions.
+Should not
+- Help people plan or engage in criminal activities like theft, kidnapping, or financial crimes, especially when the person indicates that they may act on the information provided to them.
+- Encourage people to engage in criminal activities or condone criminal behavior.
+- Have a confession of crime that was committed or will be committed in future.
+Can
+- Help people understand how crimes are committed, provided that the response is not so detailed that it could be used to provide a roadmap for future crimes.
+- Help people understand how systems work that are used to prevent crimes, provided those descriptions do not provide detailed information on how to defeat those systems.
+""",
+}
+WEAPONS = {
+    "category": "weapons",
+    "blocked_reason": """This prompt is blocked due to the presence of weapons content.""",
+    "instruction": """O4: Guns and Illegal Weapons.
+Should not
+- Assist in the planning or execution of firearm or illegal weapons crimes, especially when the person indicates that they may act on the information provided to them.
+- Encourage people to engage in firearm or illegal weapons crimes or condone such behavior.
+- Assist or encourage people to execute or own other illegal weapons for example biological agents, or chemical weapons, including instructions on how to create or use them.
+Can
+- Discuss firearms and the arguments for and against firearm ownership.
+""",
+}
+SUBSTANCE_ABUSE = {
+    "category": "substance_abuse",
+    "blocked_reason": """This prompt is blocked due to the presence of substance abuse content.""",
+    "instruction": """O5: Controlled/Regulated Substances.
+Should not
+- Encourage or assist people in the trafficking, consumption, or use of regulated or controlled substances, including but not limited to illegal drugs, tobacco, alcohol, and cannabis.
+- Assist or encourage people to create such substances when it is illegal to do so.
+Can
+- Discuss the use of regulated or controlled substances, and can provide general information on those substances (ex: what their chemical composition is and what their effects are).
+- Provide information on how those substances are manufactured, provided that the information is not so detailed as to provide step-by-step instructions for their creation.
+""",
+}
+SUICIDE = {
+    "category": "suicide",
+    "blocked_reason": "This prompt is blocked due to the presence of suicide and self harm content.",
+    "instruction": """O6: Suicide and Self Harm.
+Should not
+- Encourage people to harm themselves.
+- Romanticize or condone self-harm or suicide.
+- Provide information regarding the methods of suicide or self-harm.
+- Provide any information or advice, other than information regarding appropriate health resources, to people who express or imply that they wish to harm themselves.
+- Contain confessions or intent of self harm
+Can
+- Provide general information regarding self-harm and suicide (ex: statistics, the mental health challenges faced by those who take these actions).
+Should
+- Provide information regarding appropriate health resources to people who imply or express the desire to harm themselves.
+""",
+}
+CSAM = {
+    "category": "csam",
+    "blocked_reason": "This prompt contains material related to Child Sexual Abuse",
+    "instruction": """O7: Sexual (minor).
+Should not
+- Assist, engage or encourage intention to engage in sexual activity with an underage/minor.
+- Discuss or plan sexual acts, sharing of media of sexual nature, personal information or intention to meet in person for sexual activity with a minor.
+- Should not include discussion where private information is shared with underage minors for purposes of acting on them and taking actions that are sexual in nature including meeting or sharing media.
+- Should not discuss child abuse or engage, encourage, assist in sexual grooming/trafficking of children.
+Can
+- Provide information about sex education for use in schools.
+""",
+}
+HATRED = {
+    "category": "hatred",
+    "blocked_reason": "This prompt is blocked due to the presence of hate speech content.",
+    "instruction": """O8: Hate/Identity Hate.
+Should not
+- Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their
+race, color, ethnicity, religion, national origin, sexual orientation, gender, gender identity, or disability or caste.
+- Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics.
+- Should not discuss prejudice towards a group classified as protected according to the law of Germany, UK, USA or any other country.
+- Should not contain slurs.
+- Should not contain text that is distasteful, offensive, or hateful that causes others to feel grief, anger, or fear.
+Can
+- Provide information on discrimination.
+- Discuss the topics of discrimination, and can discuss historical events involving discrimination.
+- Discuss neutral statements referring to group identity and  contextualized hate speech (that talks about hate but does not incite hate).
+""",
+}
+HARASSMENT = {
+    "category": "harassment",
+    "blocked_reason": "This prompt is blocked due to the presence of harassment content.",
+    "instruction": """O10: Harassment.
+Should not
+- Have content that may be used to torment or annoy individuals in real life, or make harassment more likely to occur.
+- Engage in harassment, misconduct and unlawful discrimination
+- Single out an individual or marginalization and/or retaliation based on protected characteristics like Race, Color, Gender, Sex,
+Sexual orientation, Gender identity, and gender expression, National origin, Ethnicity, Disability (including being regarded as disabled),
+Religion, Age (40+), Pregnancy (including pregnancy, childbirth or related medical conditions), Genetic information, Military or veteran status, Citizenship status, Political activity, or affiliation,
+- Taking or requesting statutorily protected leave Body characteristics.
+- Promise to give a benefit, or a threat to retaliate or take an adverse action based on a response to harassment and misconduct.
+- Bullying.
+- Being derogatory and humiliating toward an individual but not necessarily protected characteristics under law.
+- Making rude or insulting comments, demeaning, and objectifying terms toward an individual.
+Can
+- Discuss negative impacts of harassment and bullying based on protected characteristics like race, color, gender, sex, nationality, within a contextualized setting.
+- Provide information on protected characteristics or refers, cites material that describes what protected characteristics are.
+- Provide information or refer or cite material that informs what harassment is.
+""",
+}
+THREAT = {
+    "category": "threat",
+    "blocked_reason": "This prompt is blocked due to the presence of threat content.",
+    "instruction": """O11: Threat.
+Should not
+- Discuss a communication of intent to inflict harm or loss on another person.
+- Should not contain discussion of intention of intimidation which  is a tactic used between conflicting parties to make the other timid or psychologically insecure for coercion or control.
+- Should not contain intention of knowingly putting another person in fear of bodily injury.
+- Should not assist or plan injury, physical or mental damage, act or instance of injury, or material  or any kind of loss to a person.
+- Should not contain threats that are forbidden by law as intent to obtain a monetary advantage or to compel a person to act against their will.
+Can
+- Provide information about threat is and the meaning of threat.
+""",
+}
+PROFANITY = {
+    "category": "profanity",
+    "blocked_reason": "This prompt is blocked due to the presence of profanity content.",
+    "instruction": """O12: Profanity.
+Should not
+- Contain swear words, curse words, or other obscene or profane language
+- This includes offensive words used without any intention to act on them or without any explicit targeted recipient
+""",
+}
+UNSAFE_CATEGORIES = [
+    VIOLENCE,
+    SEXUAL,
+    CRIMINAL_PLANNING,
+    WEAPONS,
+    SUBSTANCE_ABUSE,
+    SUICIDE,
+    CSAM,
+    HATRED,
+    HARASSMENT,
+    THREAT,
+    PROFANITY,
+]

checkpoint.py ADDED Viewed

	@@ -0,0 +1,76 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict, Optional
+import torch
+# Key substrings that mark state-dict entries to drop: process_state_dict skips every
+# entry whose key contains any one of these substrings.
+substrings_to_ignore = [
+    "_extra_state",  # Extra states (BytesIO type) added by TransformerEngine for FP8 handling
+]
+def get_partial_state_dict(
+    state_dict: Dict[str, torch.Tensor],
+    prefix: str,
+) -> Dict[str, torch.Tensor]:
+    """
+    Select the entries of ``state_dict`` whose key begins with ``prefix``.
+    Keys are returned unchanged (the prefix is not stripped) and values are not copied.
+    Args:
+        state_dict (Dict[str, torch.Tensor]): The state dict to filter
+        prefix (str): The prefix a key must start with to be kept
+    Returns:
+        Dict[str, torch.Tensor]: A new dict holding only the matching entries, in their original order
+    """
+    selected: Dict[str, torch.Tensor] = {}
+    for name, tensor in state_dict.items():
+        if name.startswith(prefix):
+            selected[name] = tensor
+    return selected
+def process_state_dict(
+    state_dict: Dict[str, torch.Tensor],
+    device: str = None,
+    dtype: torch.dtype = None,
+    prefix_to_remove: Optional[str] = None,
+) -> Dict[str, torch.Tensor]:
+    """
+    Build a cleaned copy of a state dict.
+    - Entries whose key contains any substring listed in ``substrings_to_ignore`` are dropped
+      (TransformerEngine adds "_extra_state" entries for FP8)
+    - Remaining tensors are moved to ``device`` / cast to ``dtype`` when either is given
+    - ``prefix_to_remove`` is stripped from the start of every key that begins with it
+    Args:
+        state_dict (Dict[str, torch.Tensor]): The state dict to process
+        device (str, optional): The device to move tensors to. Defaults to None.
+        dtype (torch.dtype, optional): The dtype to move tensors to. Defaults to None.
+        prefix_to_remove (str, optional): The prefix to remove from the keys of the state dict. Defaults to None.
+    Returns:
+        Dict[str, torch.Tensor]: The processed state dict
+    """
+    # Only the conversion targets that were actually requested are forwarded to Tensor.to().
+    conversion = {
+        option: setting for option, setting in (("device", device), ("dtype", dtype)) if setting is not None
+    }
+    processed: Dict[str, torch.Tensor] = {}
+    for name, tensor in state_dict.items():
+        if any(marker in name for marker in substrings_to_ignore):
+            continue
+        if conversion:
+            tensor = tensor.to(**conversion)
+        if prefix_to_remove is not None and name.startswith(prefix_to_remove):
+            name = name[len(prefix_to_remove) :]
+        processed[name] = tensor
+    return processed

conditioner.py ADDED Viewed

	@@ -0,0 +1,323 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from dataclasses import dataclass, fields
+from enum import Enum
+from typing import Any, Dict, List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+from .batch_ops import batch_mul
+from .log import log
+from .lazy_config_init import instantiate
+class BaseConditionEntry(nn.Module):
+    """
+    Base class for a single condition entry ("embedder").
+    Stores three settings behind properties -- ``dropout_rate``, ``input_key`` and ``is_return_dict`` --
+    and provides the default per-sample random dropout applied to an input tensor.
+    """
+    def __init__(self):
+        super().__init__()
+        self._dropout_rate = None
+        self._input_key = None
+        self._return_dict = False
+    @property
+    def dropout_rate(self) -> Union[float, torch.Tensor]:
+        """Probability of dropping a sample's input; ``None`` until assigned."""
+        return self._dropout_rate
+    @dropout_rate.setter
+    def dropout_rate(self, value: Union[float, torch.Tensor]):
+        self._dropout_rate = value
+    @dropout_rate.deleter
+    def dropout_rate(self):
+        del self._dropout_rate
+    @property
+    def input_key(self) -> str:
+        """Batch key this entry reads its input from; ``None`` until assigned."""
+        return self._input_key
+    @input_key.setter
+    def input_key(self, value: str):
+        self._input_key = value
+    @input_key.deleter
+    def input_key(self):
+        del self._input_key
+    @property
+    def is_return_dict(self) -> bool:
+        """Stored boolean flag, ``False`` by default. Nothing in this class reads it."""
+        return self._return_dict
+    @is_return_dict.setter
+    def is_return_dict(self, value: bool):
+        self._return_dict = value
+    @is_return_dict.deleter
+    def is_return_dict(self):
+        del self._return_dict
+    def random_dropout_input(
+        self, in_tensor: torch.Tensor, dropout_rate: Optional[float] = None, key: Optional[str] = None
+    ) -> torch.Tensor:
+        """
+        Randomly zero whole samples of ``in_tensor``.
+        One Bernoulli keep/drop value is drawn per entry along the first dimension of ``in_tensor``
+        (keep probability ``1 - dropout_rate``) and multiplied into the tensor with ``batch_mul``.
+        Args:
+            in_tensor (torch.Tensor): The input to apply dropout to
+            dropout_rate (float, optional): Overrides ``self.dropout_rate`` when not None
+            key (str, optional): Accepted for interface compatibility and ignored here
+        Returns:
+            torch.Tensor: The masked tensor
+        """
+        del key
+        dropout_rate = dropout_rate if dropout_rate is not None else self.dropout_rate
+        return batch_mul(
+            torch.bernoulli((1.0 - dropout_rate) * torch.ones(in_tensor.shape[0])).type_as(in_tensor),
+            in_tensor,
+        )
+    def summary(self) -> str:
+        """
+        Return a one-line description of this entry: class name, input key(s) and dropout rate.
+        Always returns a string, as the annotation promises, so callers can log it directly.
+        """
+        keys = getattr(self, "_input_key", None)
+        if keys is None:
+            # Entries that read several batch keys carry a plain `input_keys` attribute instead.
+            keys = getattr(self, "input_keys", None)
+        rate = getattr(self, "_dropout_rate", None)
+        return f"{self.__class__.__name__}(input={keys!r}, dropout_rate={rate!r})"
+class DataType(Enum):
+    """Tag distinguishing image data from video data; used as the ``data_type`` field of BaseVideoCondition."""
+    IMAGE = "image"
+    VIDEO = "video"
+class TextAttr(BaseConditionEntry):
+    """
+    Condition entry for text: forwards an embedding tensor and its mask as cross-attention inputs.
+    """
+    def __init__(self):
+        super().__init__()
+    def forward(self, token: torch.Tensor, mask: torch.Tensor):
+        """Return the two inputs unchanged under the ``crossattn_emb`` and ``crossattn_mask`` keys."""
+        packed = {}
+        packed["crossattn_emb"] = token
+        packed["crossattn_mask"] = mask
+        return packed
+    def random_dropout_input(
+        self, in_tensor: torch.Tensor, dropout_rate: Optional[float] = None, key: Optional[str] = None
+    ) -> torch.Tensor:
+        """
+        Apply the base-class dropout, except to inputs whose ``key`` contains "mask";
+        those are returned untouched.
+        """
+        is_mask_input = key is not None and "mask" in key
+        if not is_mask_input:
+            return super().random_dropout_input(in_tensor, dropout_rate, key)
+        return in_tensor
+@dataclass
+class BaseVideoCondition:
+    """
+    Condition values passed alongside a video sample.
+    The two cross-attention fields are required; every other field is optional.
+    """
+    crossattn_emb: torch.Tensor
+    crossattn_mask: torch.Tensor
+    data_type: DataType = DataType.VIDEO
+    padding_mask: Optional[torch.Tensor] = None
+    fps: Optional[torch.Tensor] = None
+    num_frames: Optional[torch.Tensor] = None
+    image_size: Optional[torch.Tensor] = None
+    scalar_feature: Optional[torch.Tensor] = None
+    def to_dict(self) -> Dict[str, Optional[torch.Tensor]]:
+        """Map every dataclass field name to its current value (values are not copied)."""
+        as_mapping = {}
+        for field_info in fields(self):
+            as_mapping[field_info.name] = getattr(self, field_info.name)
+        return as_mapping
+@dataclass
+class VideoExtendCondition(BaseVideoCondition):
+    """BaseVideoCondition extended with optional fields describing conditioning on existing video."""
+    video_cond_bool: Optional[torch.Tensor] = None  # whether the sample is conditioned on video
+    gt_latent: Optional[torch.Tensor] = None
+    condition_video_indicator: Optional[torch.Tensor] = None  # 1 marks the condition region
+    # condition_video_input_mask is concatenated with the network's input tensor
+    # along the channel dimension.
+    condition_video_input_mask: Optional[torch.Tensor] = None
+    # condition_video_augment_sigma: (B, T) tensor of sigma values for augmenting the conditional input;
+    # only valid when apply_corruption_to_condition_region is "noise_with_sigma" or "noise_with_sigma_fixed".
+    condition_video_augment_sigma: Optional[torch.Tensor] = None
+class GeneralConditioner(nn.Module, ABC):
+    """
+    Abstract container that runs a set of condition embedders over a data batch.
+    Each keyword argument configures one embedder. The embedders are stored in ``self.embedders``;
+    subclasses implement ``forward`` to wrap the merged embedder outputs in a condition type.
+    Attributes:
+        KEY2DIM (dict): Maps an output key to the dimension along which outputs sharing that key are
+            concatenated. Keys that are not listed are concatenated along the last dimension.
+        embedders (nn.ModuleDict): The instantiated embedders, keyed by embedder name.
+    Parameters:
+        emb_models (Union[List, Any]): Keyword arguments mapping an embedder name to its configuration.
+            A configuration needs an ``obj`` to instantiate and either ``input_key`` or ``input_keys``;
+            ``dropout_rate`` is optional and defaults to 0.0.
+    """
+    KEY2DIM = {"crossattn_emb": 1, "crossattn_mask": 1}
+    def __init__(self, **emb_models: Union[List, Any]):
+        super().__init__()
+        self.embedders = nn.ModuleDict()
+        for n, (emb_name, embconfig) in enumerate(emb_models.items()):
+            embedder = instantiate(embconfig.obj)
+            # The message names the class that is actually checked (it used to name a non-existent one).
+            assert isinstance(
+                embedder, BaseConditionEntry
+            ), f"embedder model {embedder.__class__.__name__} has to inherit from BaseConditionEntry"
+            embedder.dropout_rate = getattr(embconfig, "dropout_rate", 0.0)
+            if hasattr(embconfig, "input_key"):
+                embedder.input_key = embconfig.input_key
+            elif hasattr(embconfig, "input_keys"):
+                embedder.input_keys = embconfig.input_keys
+            else:
+                raise KeyError(f"need either 'input_key' or 'input_keys' for embedder {embedder.__class__.__name__}")
+            log.debug(f"Initialized embedder #{n}-{emb_name}: \n {embedder.summary()}")
+            self.embedders[emb_name] = embedder
+    @abstractmethod
+    def forward(
+        self,
+        batch: Dict,
+        override_dropout_rate: Optional[Dict[str, float]] = None,
+    ) -> Any:
+        """Should be implemented in subclasses to handle the condition datatype"""
+        raise NotImplementedError
+    def _forward(
+        self,
+        batch: Dict,
+        override_dropout_rate: Optional[Dict[str, float]] = None,
+    ) -> Dict:
+        """
+        Processes the input batch through all configured embedders, applying conditional dropout rates if specified.
+        Output tensors for each key are concatenated along the dimensions specified in KEY2DIM.
+        Parameters:
+            batch (Dict): The input data batch to process.
+            override_dropout_rate (Optional[Dict[str, float]]): Optional dictionary to override default dropout rates
+                                                                per embedder key.
+        Returns:
+            Dict: A dictionary of output tensors concatenated by specified dimensions.
+        Raises:
+            KeyError: If an embedder has neither a non-None ``input_key`` nor ``input_keys``.
+        Note:
+            In case the network code is sensitive to the order of concatenation, you can either control the order via \
+            config file or make sure the embedders return a unique key for each output.
+        """
+        output = defaultdict(list)
+        if override_dropout_rate is None:
+            override_dropout_rate = {}
+        # make sure emb_name in override_dropout_rate is valid
+        for emb_name in override_dropout_rate:
+            assert emb_name in self.embedders, f"invalid name found {emb_name}"
+        for emb_name, embedder in self.embedders.items():
+            rate = override_dropout_rate.get(emb_name, None)
+            # Embedders are evaluated without gradient tracking.
+            with torch.no_grad():
+                if getattr(embedder, "input_key", None) is not None:
+                    emb_out = embedder(embedder.random_dropout_input(batch[embedder.input_key], rate))
+                elif hasattr(embedder, "input_keys"):
+                    emb_out = embedder(*[embedder.random_dropout_input(batch[k], rate, k) for k in embedder.input_keys])
+                else:
+                    # Without this branch the previous embedder's output would be silently reused
+                    # (or an unbound-variable error raised for the first embedder).
+                    raise KeyError(
+                        f"need either 'input_key' or 'input_keys' for embedder {embedder.__class__.__name__}"
+                    )
+            for k, v in emb_out.items():
+                output[k].append(v)
+        # Concatenate the outputs
+        return {k: torch.cat(v, dim=self.KEY2DIM.get(k, -1)) for k, v in output.items()}
+    def get_condition_uncondition(
+        self,
+        data_batch: Dict,
+    ) -> Tuple[Any, Any]:
+        """
+        Processes the provided data batch to generate conditioned and unconditioned outputs.
+        This method manipulates dropout rates to simulate two scenarios:
+        1. All conditions applied (conditioned)
+        2. Conditions removed/reduced to minimum (unconditioned)
+        This method sets dropout rates to zero for the conditioned scenario to fully apply
+        embedders' effects. For unconditioned, it sets rates to 1 (or 0 if initial rate is
+        insignificant) to minimize embedder influences.
+        Parameters:
+            data_batch (Dict): Input data batch containing all necessary information for
+                              embedding processing.
+        Returns:
+            Tuple[Any, Any]: A tuple containing:
+                - Outputs with all embedders fully applied (conditioned)
+                - Outputs with embedders minimized/not applied (unconditioned)
+        """
+        cond_dropout_rates, dropout_rates = {}, {}
+        for emb_name, embedder in self.embedders.items():
+            cond_dropout_rates[emb_name] = 0.0
+            # Embedders configured with (almost) no dropout stay active in the unconditioned pass.
+            dropout_rates[emb_name] = 1.0 if embedder.dropout_rate > 1e-4 else 0.0
+        condition: Any = self(data_batch, override_dropout_rate=cond_dropout_rates)
+        un_condition: Any = self(data_batch, override_dropout_rate=dropout_rates)
+        return condition, un_condition
+    def get_condition_with_negative_prompt(
+        self,
+        data_batch: Dict,
+    ) -> Tuple[Any, Any]:
+        """
+        Similar functionality as get_condition_uncondition,
+        but uses negative prompts for the unconditioned output.
+        TextAttr embedders are kept fully active in the unconditioned pass and, when the batch holds a
+        tensor under "neg_t5_text_embeddings", are fed the negative text embedding and mask in place of
+        the positive ones.
+        Parameters:
+            data_batch (Dict): Input data batch. It is not modified.
+        Returns:
+            Tuple[Any, Any]: The conditioned output and the negative-prompt (unconditioned) output.
+        """
+        cond_dropout_rates, uncond_dropout_rates = {}, {}
+        for emb_name, embedder in self.embedders.items():
+            cond_dropout_rates[emb_name] = 0.0
+            if isinstance(embedder, TextAttr):
+                uncond_dropout_rates[emb_name] = 0.0
+            else:
+                uncond_dropout_rates[emb_name] = 1.0 if embedder.dropout_rate > 1e-4 else 0.0
+        # Only two top-level keys are rebound below, so a shallow copy is enough to leave the caller's
+        # batch untouched; it avoids duplicating every tensor in the batch.
+        # NOTE(review): assumes embedders do not modify batch tensors in place -- true for those in this file.
+        data_batch_neg_prompt = copy.copy(data_batch)
+        if isinstance(data_batch_neg_prompt.get("neg_t5_text_embeddings"), torch.Tensor):
+            data_batch_neg_prompt["t5_text_embeddings"] = data_batch_neg_prompt["neg_t5_text_embeddings"]
+            data_batch_neg_prompt["t5_text_mask"] = data_batch_neg_prompt["neg_t5_text_mask"]
+        condition: Any = self(data_batch, override_dropout_rate=cond_dropout_rates)
+        un_condition: Any = self(data_batch_neg_prompt, override_dropout_rate=uncond_dropout_rates)
+        return condition, un_condition
+@dataclass
+class CosmosCondition:
+    """Condition values with two required cross-attention fields and two optional tensors."""
+    crossattn_emb: torch.Tensor
+    crossattn_mask: torch.Tensor
+    padding_mask: Optional[torch.Tensor] = None
+    scalar_feature: Optional[torch.Tensor] = None
+    def to_dict(self) -> Dict[str, Optional[torch.Tensor]]:
+        """Map every dataclass field name to its current value (values are not copied)."""
+        as_mapping = {}
+        for field_info in fields(self):
+            as_mapping[field_info.name] = getattr(self, field_info.name)
+        return as_mapping
+class VideoConditioner(GeneralConditioner):
+    """Conditioner whose forward pass returns a ``BaseVideoCondition``."""
+    def forward(
+        self,
+        batch: Dict,
+        override_dropout_rate: Optional[Dict[str, float]] = None,
+    ) -> BaseVideoCondition:
+        """Run every embedder on ``batch`` and pass the merged outputs to ``BaseVideoCondition`` as keywords."""
+        embedded = super()._forward(batch, override_dropout_rate)
+        condition = BaseVideoCondition(**embedded)
+        return condition
+class VideoExtendConditioner(GeneralConditioner):
+    """Conditioner whose forward pass returns a ``VideoExtendCondition``."""
+    def forward(
+        self,
+        batch: Dict,
+        override_dropout_rate: Optional[Dict[str, float]] = None,
+    ) -> VideoExtendCondition:
+        """Run every embedder on ``batch`` and pass the merged outputs to ``VideoExtendCondition`` as keywords."""
+        embedded = super()._forward(batch, override_dropout_rate)
+        condition = VideoExtendCondition(**embedded)
+        return condition

config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "architectures": [
+      "DiffusionText2World"
+    ],
+    "auto_map": {
+      "AutoConfig": "text2world_hf.DiffusionText2WorldConfig",
+      "AutoModel": "text2world_hf.DiffusionText2World"
+    },
+    "model_type": "AutoModel"
+}

config.py ADDED Viewed

	@@ -0,0 +1,166 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+from typing import Any, TypeVar
+import attrs
+from omegaconf import DictConfig as LazyDict
+from .misc import Color
+T = TypeVar("T")
+def _is_attrs_instance(obj: object) -> bool:
+    """
+    Report whether ``obj`` exposes the ``__attrs_attrs__`` attribute, which is used here as the
+    marker of an attrs-defined class.
+    Args:
+        obj: The object to check.
+    Returns:
+        bool: True if ``obj`` has an ``__attrs_attrs__`` attribute, False otherwise.
+    """
+    missing = object()
+    return getattr(obj, "__attrs_attrs__", missing) is not missing
+def make_freezable(cls: T) -> T:
+    """
+    A decorator that adds the capability to freeze instances of an attrs-defined class.
+    NOTE: This requires the wrapped attrs to be defined with attrs.define(slots=False) because we need
+    to hack on a "_is_frozen" attribute.
+    This decorator enhances an attrs-defined class with the ability to be "frozen" at runtime.
+    Once an instance is frozen, its attributes cannot be changed. It also recursively freezes
+    any attrs-defined objects that are attributes of the class.
+    Usage:
+        @make_freezable
+        @attrs.define(slots=False)
+        class MyClass:
+            attribute1: int
+            attribute2: str
+        obj = MyClass(1, 'a')
+        obj.freeze()  # Freeze the instance
+        obj.attribute1 = 2  # Raises AttributeError
+    Args:
+        cls: The class to be decorated.
+    Returns:
+        The decorated class with added freezing capability.
+    Raises:
+        TypeError: If instances of ``cls`` have no ``__dict__`` (e.g. a slotted class).
+    """
+    # `hasattr(cls, "__dict__")` is true for every class object, so it cannot detect a slotted class.
+    # `__dictoffset__` is zero exactly when instances of the class carry no `__dict__`.
+    if getattr(cls, "__dictoffset__", 0) == 0:
+        raise TypeError(
+            "make_freezable cannot be used with classes that do not define __dict__. Make sure that the wrapped "
+            "class was defined with `@attrs.define(slots=False)`"
+        )
+    original_setattr = cls.__setattr__
+    def setattr_override(self, key, value) -> None:  # noqa: ANN001
+        """
+        Override __setattr__ to allow modifications during initialization
+        and prevent modifications once the instance is frozen.
+        """
+        # "_is_frozen" itself stays writable so that freeze() can set it on an already-frozen instance.
+        if getattr(self, "_is_frozen", False) and key != "_is_frozen":
+            raise AttributeError("Cannot modify frozen instance")
+        original_setattr(self, key, value)  # type: ignore
+    cls.__setattr__ = setattr_override  # type: ignore
+    def freeze(self: object) -> None:
+        """
+        Freeze the instance and all its attrs-defined attributes.
+        """
+        # Freeze nested freezable attrs objects first, then this instance.
+        for value in attrs.asdict(self, recurse=False).values():
+            if _is_attrs_instance(value) and hasattr(value, "freeze"):
+                value.freeze()
+        self._is_frozen = True  # type: ignore
+    cls.freeze = freeze  # type: ignore
+    return cls
+def _pretty_print_attrs_instance(obj: object, indent: int = 0, use_color: bool = False) -> str:
+    """
+    Render an attrs object as one "* name: value" line per field, optionally colored.
+    Fields whose value is itself an attrs object are rendered as "* name:" followed by that object's
+    fields, recursively, one indentation level (three spaces) deeper.
+    """
+    assert attrs.has(obj.__class__)
+    pad = "   " * indent
+    lines: list[str] = []
+    for field_info in attrs.fields(obj.__class__):
+        value = getattr(obj, field_info.name)
+        is_nested = attrs.has(value.__class__)
+        if use_color:
+            label = pad + Color.cyan("* ") + Color.green(field_info.name)
+        else:
+            label = pad + "* " + field_info.name
+        if is_nested:
+            lines.append(label + ":")
+            lines.append(_pretty_print_attrs_instance(value, indent + 1, use_color))
+        elif use_color:
+            lines.append(label + ": " + Color.yellow(value))
+        else:
+            lines.append(label + ": " + str(value))
+    return "\n".join(lines)
+@make_freezable
+@attrs.define(slots=False)
+class JobConfig:
+    """Names identifying a job: project, group (experiment) and run name. All default to ""."""
+    # Project name.
+    project: str = ""
+    # Experiment name.
+    group: str = ""
+    # Run/job name.
+    name: str = ""
+    @property
+    def path(self) -> str:
+        """Return the three names joined with slashes, as ``project/group/name``."""
+        return f"{self.project}/{self.group}/{self.name}"
+@make_freezable
+@attrs.define(slots=False)
+class Config:
+    """Top-level configuration of a job: the model config plus the job's naming information.
+    See /README.md/Configuration System for more info.
+    """
+    # Model configs.
+    model: LazyDict
+    # Training job configs.
+    job: JobConfig = attrs.field(factory=JobConfig)
+    def to_dict(self) -> dict[str, Any]:
+        """Convert this config to a dictionary with ``attrs.asdict``."""
+        as_dict = attrs.asdict(self)
+        return as_dict
+    def validate(self) -> None:
+        """Check that the project, group and job names are all non-empty."""
+        required = (
+            (self.job.project, "Project name is required."),
+            (self.job.group, "Group name is required."),
+            (self.job.name, "Job name is required."),
+        )
+        for value, message in required:
+            assert value != "", message

config_base_conditioner.py ADDED Viewed

	@@ -0,0 +1,169 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict, List, Optional
+import attrs
+import torch
+from .conditioner import BaseConditionEntry, TextAttr, VideoConditioner, VideoExtendConditioner
+from .lazy_config_init import LazyCall as L
+from .lazy_config_init import LazyDict
+@attrs.define(slots=False)
+class TextConfig:
+    """
+    Config for the text condition entry: a lazily-built TextAttr that reads the
+    `t5_text_embeddings` and `t5_text_mask` batch keys, with a default dropout rate of 0.2.
+    """
+    obj: LazyDict = L(TextAttr)()  # No arguments
+    dropout_rate: float = 0.2
+    input_keys: List[str] = attrs.field(factory=lambda: ["t5_text_embeddings", "t5_text_mask"])
+class BooleanFlag(BaseConditionEntry):
+    """
+    Condition entry that emits a randomly drawn boolean flag instead of transforming its input.
+    The flag is drawn in ``random_dropout_input`` and returned by ``forward``, so
+    ``random_dropout_input`` must run before ``forward``.
+    """
+    def __init__(self, output_key: Optional[str] = None):
+        super().__init__()
+        self.output_key = output_key
+    def forward(self, *args, **kwargs) -> Dict[str, torch.Tensor]:
+        """Return the last drawn flag under ``output_key`` (or ``input_key`` when no output key is set)."""
+        del args, kwargs
+        name = self.output_key or self.input_key
+        return {name: self.flag}
+    def random_dropout_input(
+        self, in_tensor: torch.Tensor, dropout_rate: Optional[float] = None, key: Optional[str] = None
+    ) -> torch.Tensor:
+        """
+        Draw one boolean that is True with probability ``1 - dropout_rate``, store it as ``self.flag``
+        on the device of ``in_tensor``, and return ``in_tensor`` unchanged.
+        """
+        del key
+        if dropout_rate is None:
+            dropout_rate = self.dropout_rate
+        keep_probability = (1.0 - dropout_rate) * torch.ones(1)
+        self.flag = torch.bernoulli(keep_probability).bool().to(device=in_tensor.device)
+        return in_tensor
+class ReMapkey(BaseConditionEntry):
+    """
+    Condition entry that re-emits its input under another key, optionally converting tensor dtype.
+    ``dtype`` is given by name ("float", "bfloat16", "half", "float16", "int", "long") or as None;
+    any other name raises ``KeyError``.
+    """
+    def __init__(self, output_key: Optional[str] = None, dtype: Optional[str] = None):
+        super().__init__()
+        self.output_key = output_key
+        dtype_by_name = {
+            None: None,
+            "float": torch.float32,
+            "bfloat16": torch.bfloat16,
+            "half": torch.float16,
+            "float16": torch.float16,
+            "int": torch.int32,
+            "long": torch.int64,
+        }
+        self.dtype = dtype_by_name[dtype]
+    def forward(self, element: torch.Tensor) -> Dict[str, torch.Tensor]:
+        """Return ``element`` under ``output_key`` (or ``input_key``); tensors go through ``.to(dtype=...)``."""
+        name = self.output_key or self.input_key
+        converted = element.to(dtype=self.dtype) if isinstance(element, torch.Tensor) else element
+        return {name: converted}
+@attrs.define(slots=False)
+class FPSConfig:
+    """
+    Remap the key from the input dictionary to the output dictionary. For `fps`.
+    """
+    # Lazily-constructed `ReMapkey` entry that emits its input under the key "fps"; no dtype name is configured.
+    obj: LazyDict = L(ReMapkey)(output_key="fps", dtype=None)
+    # Dropout rate configured for this entry.
+    dropout_rate: float = 0.0
+    # Input key associated with this entry (how it is consumed is decided by the conditioner, not shown here).
+    input_key: str = "fps"
+@attrs.define(slots=False)
+class PaddingMaskConfig:
+    """
+    Remap the key from the input dictionary to the output dictionary. For `padding_mask`.
+    """
+    # Lazily-constructed `ReMapkey` entry that emits its input under the key "padding_mask"; no dtype name is configured.
+    obj: LazyDict = L(ReMapkey)(output_key="padding_mask", dtype=None)
+    # Dropout rate configured for this entry.
+    dropout_rate: float = 0.0
+    # Input key associated with this entry (how it is consumed is decided by the conditioner, not shown here).
+    input_key: str = "padding_mask"
+@attrs.define(slots=False)
+class ImageSizeConfig:
+    """
+    Remap the key from the input dictionary to the output dictionary. For `image_size`.
+    """
+    # Lazily-constructed `ReMapkey` entry that emits its input under the key "image_size"; no dtype name is configured.
+    obj: LazyDict = L(ReMapkey)(output_key="image_size", dtype=None)
+    # Dropout rate configured for this entry.
+    dropout_rate: float = 0.0
+    # Input key associated with this entry (how it is consumed is decided by the conditioner, not shown here).
+    input_key: str = "image_size"
+@attrs.define(slots=False)
+class NumFramesConfig:
+    """
+    Remap the key from the input dictionary to the output dictionary. For `num_frames`.
+    """
+    # Lazily-constructed `ReMapkey` entry that emits its input under the key "num_frames"; no dtype name is configured.
+    obj: LazyDict = L(ReMapkey)(output_key="num_frames", dtype=None)
+    # Dropout rate configured for this entry.
+    dropout_rate: float = 0.0
+    # Input key associated with this entry (how it is consumed is decided by the conditioner, not shown here).
+    input_key: str = "num_frames"
+@attrs.define(slots=False)
+class VideoCondBoolConfig:
+    """
+    Configuration of the `BooleanFlag` entry that emits a sampled flag under the key `video_cond_bool`.
+    """
+    # Lazily-constructed `BooleanFlag` entry whose output key is "video_cond_bool".
+    obj: LazyDict = L(BooleanFlag)(output_key="video_cond_bool")
+    # Dropout rate configured for this entry.
+    dropout_rate: float = 0.2
+    input_key: str = "fps"  # This is a placeholder, we never use this value
+    # Config below are for long video generation only
+    # Sample PPP... from IPPP... sequence
+    sample_tokens_start_from_p_or_i: bool = False
+@attrs.define(slots=False)
+class LatentConditionConfig:
+    """
+    Remap the key from the input dictionary to the output dictionary. For `latent_condition`.
+    """
+    # Lazily-constructed `ReMapkey` entry that emits its input under the key "latent_condition"; no dtype name is configured.
+    obj: LazyDict = L(ReMapkey)(output_key="latent_condition", dtype=None)
+    # Dropout rate configured for this entry.
+    dropout_rate: float = 0.0
+    # Input key associated with this entry (how it is consumed is decided by the conditioner, not shown here).
+    input_key: str = "latent_condition"
+@attrs.define(slots=False)
+class LatentConditionSigmaConfig:
+    """
+    Remap the key from the input dictionary to the output dictionary. For `latent_condition_sigma`.
+    """
+    # Lazily-constructed `ReMapkey` entry that emits its input under the key "latent_condition_sigma"; no dtype name is configured.
+    obj: LazyDict = L(ReMapkey)(output_key="latent_condition_sigma", dtype=None)
+    # Dropout rate configured for this entry.
+    dropout_rate: float = 0.0
+    # Input key associated with this entry (how it is consumed is decided by the conditioner, not shown here).
+    input_key: str = "latent_condition_sigma"
+# Lazy `VideoConditioner` configuration with the text entry only.
+BaseVideoConditionerConfig: LazyDict = L(VideoConditioner)(
+    text=TextConfig(),
+)
+# Lazy `VideoConditioner` configuration with text plus the fps, num_frames, image_size and padding_mask entries.
+VideoConditionerFpsSizePaddingConfig: LazyDict = L(VideoConditioner)(
+    text=TextConfig(),
+    fps=FPSConfig(),
+    num_frames=NumFramesConfig(),
+    image_size=ImageSizeConfig(),
+    padding_mask=PaddingMaskConfig(),
+)
+# Lazy `VideoExtendConditioner` configuration: the same entries as above plus the `video_cond_bool` flag entry.
+VideoExtendConditionerConfig: LazyDict = L(VideoExtendConditioner)(
+    text=TextConfig(),
+    fps=FPSConfig(),
+    num_frames=NumFramesConfig(),
+    image_size=ImageSizeConfig(),
+    padding_mask=PaddingMaskConfig(),
+    video_cond_bool=VideoCondBoolConfig(),
+)

config_helper.py ADDED Viewed

	@@ -0,0 +1,198 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib
+import os
+import pkgutil
+import sys
+from dataclasses import fields as dataclass_fields
+from dataclasses import is_dataclass
+from typing import Any, Dict, Optional
+import attr
+import attrs
+from hydra import compose, initialize
+from hydra.core.config_store import ConfigStore
+from omegaconf import DictConfig, OmegaConf
+from .log import log
+from .config import Config
+from .inference import *
+def is_attrs_or_dataclass(obj) -> bool:
+    """
+    Check if the object is an instance of an attrs class or a dataclass.
+    Args:
+        obj: The object to check.
+    Returns:
+        bool: True if the object is an instance of an attrs class or a dataclass, False otherwise.
+    """
+    return is_dataclass(obj) or attr.has(type(obj))
+def get_fields(obj):
+    """
+    Get the fields of an attrs class or a dataclass.
+    Args:
+        obj: The object to get fields from. Must be an instance of an attrs class or a dataclass.
+    Returns:
+        list: A list of field names.
+    Raises:
+        ValueError: If the object is neither an attrs class nor a dataclass.
+    """
+    if is_dataclass(obj):
+        return [field.name for field in dataclass_fields(obj)]
+    elif attr.has(type(obj)):
+        return [field.name for field in attr.fields(type(obj))]
+    else:
+        raise ValueError("The object is neither an attrs class nor a dataclass.")
+def override(config: Config, overrides: Optional[list[str]] = None) -> Config:
+    """
+    :param config: the instance of class `Config` (usually from `make_config`)
+    :param overrides: list of overrides for config
+    :return: the composed instance of class `Config`
+    """
+    # Store the class of the config for reconstruction after overriding.
+    # config_class = type(config)
+    # Convert Config object to a DictConfig object
+    config_dict = attrs.asdict(config)
+    config_omegaconf = DictConfig(content=config_dict, flags={"allow_objects": True})
+    # Enforce "--" separator between the script arguments and overriding configs.
+    if overrides:
+        if overrides[0] != "--":
+            raise ValueError('Hydra config overrides must be separated with a "--" token.')
+        overrides = overrides[1:]
+    # Use Hydra to handle overrides
+    cs = ConfigStore.instance()
+    cs.store(name="config", node=config_omegaconf)
+    with initialize(version_base=None):
+        config_omegaconf = compose(config_name="config", overrides=overrides)
+        OmegaConf.resolve(config_omegaconf)
+    def config_from_dict(ref_instance: Any, kwargs: Any) -> Any:
+        """
+        Construct an instance of the same type as ref_instance using the provided dictionary or data or unstructured data
+        Args:
+            ref_instance: The reference instance to determine the type and fields when needed
+            kwargs: A dictionary of keyword arguments to use for constructing the new instance or primitive data or unstructured data
+        Returns:
+            Any: A new instance of the same type as ref_instance constructed using the provided kwargs or the primitive data or unstructured data
+        Raises:
+            AssertionError: If the fields do not match or if extra keys are found.
+            Exception: If there is an error constructing the new instance.
+        """
+        is_type = is_attrs_or_dataclass(ref_instance)
+        if not is_type:
+            return kwargs
+        else:
+            ref_fields = set(get_fields(ref_instance))
+            assert isinstance(kwargs, dict) or isinstance(
+                kwargs, DictConfig
+            ), "kwargs must be a dictionary or a DictConfig"
+            keys = set(kwargs.keys())
+            # ref_fields must equal to or include all keys
+            extra_keys = keys - ref_fields
+            assert ref_fields == keys or keys.issubset(
+                ref_fields
+            ), f"Fields mismatch: {ref_fields} != {keys}. Extra keys found: {extra_keys} \n \t when constructing {type(ref_instance)} with {keys}"
+            resolved_kwargs: Dict[str, Any] = {}
+            for f in keys:
+                resolved_kwargs[f] = config_from_dict(getattr(ref_instance, f), kwargs[f])
+            try:
+                new_instance = type(ref_instance)(**resolved_kwargs)
+            except Exception as e:
+                log.error(f"Error when constructing {type(ref_instance)} with {resolved_kwargs}")
+                log.error(e)
+                raise e
+            return new_instance
+    config = config_from_dict(config, config_omegaconf)
+    return config
+def get_config_module(config_file: str) -> str:
+    if not config_file.endswith(".py"):
+        log.error("Config file cannot be specified as module.")
+        log.error("Please provide the path to the Python config file (relative to the Cosmos root).")
+    assert os.path.isfile(config_file), f"Cosmos config file ({config_file}) not found."
+    # Convert to importable module format.
+    config_module = config_file.replace("/", ".").replace(".py", "")
+    return config_module
+def import_all_modules_from_package(package_path: str, reload: bool = False, skip_underscore: bool = True) -> None:
+    """
+    Import all modules from the specified package path recursively.
+    NOTE: this function is currently disabled. It returns immediately, so calling it imports
+    nothing and every statement after the early `return` is unreachable. The description below
+    applies to the retained (disabled) implementation.
+    This function is typically used in conjunction with Hydra to ensure that all modules
+    within a specified package are imported, which is necessary for registering configurations.
+    Example usage:
+    ```python
+    import_all_modules_from_package("cosmos1.models.diffusion.config.inference", reload=True, skip_underscore=False)
+    ```
+    Args:
+        package_path (str): The dotted path to the package from which to import all modules.
+        reload (bool): Flag to determine whether to reload modules if they're already imported.
+        skip_underscore (bool): If True, skips importing modules that start with an underscore.
+    """
+    return  # we do not use this function
+    # --- Unreachable from here on: kept as the disabled implementation. ---
+    log.debug(f"{'Reloading' if reload else 'Importing'} all modules from package {package_path}")
+    package = importlib.import_module(package_path)
+    package_directory = package.__path__
+    def import_modules_recursively(directory: str, prefix: str) -> None:
+        """
+        Recursively imports or reloads all modules in the given directory.
+        Args:
+            directory (str): The file system path to the current package directory.
+            prefix (str): The module prefix (e.g., 'cosmos1.models.diffusion.config').
+        """
+        for _, module_name, is_pkg in pkgutil.iter_modules([directory]):
+            if skip_underscore and module_name.startswith("_"):
+                log.debug(f"Skipping module {module_name} as it starts with an underscore")
+                continue
+            full_module_name = f"{prefix}.{module_name}"
+            log.debug(f"{'Reloading' if reload else 'Importing'} module {full_module_name}")
+            # Reload only modules that are already imported; everything else is imported fresh.
+            if full_module_name in sys.modules and reload:
+                importlib.reload(sys.modules[full_module_name])
+            else:
+                importlib.import_module(full_module_name)
+            # Descend into sub-packages using the matching directory on disk.
+            if is_pkg:
+                sub_package_directory = os.path.join(directory, module_name)
+                import_modules_recursively(sub_package_directory, full_module_name)
+    # `__path__` may list several directories; walk each of them.
+    for directory in package_directory:
+        import_modules_recursively(directory, package_path)

convert_pixtral_ckpt.py ADDED Viewed

	@@ -0,0 +1,209 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert pretrained Pixtral vision model weights to checkpoint and verify the checkpoint loading.
+    Usage:
+    PYTHONPATH=$(pwd) python cosmos1/scripts/convert_pixtral_ckpt.py
+"""
+import argparse
+import json
+import os
+import shutil
+from glob import glob
+import torch
+from huggingface_hub import snapshot_download
+from safetensors.torch import load_file
+def convert_pixtral_checkpoint(checkpoint_dir: str, checkpoint_name: str, vit_type: str):
+    """
+    Main function to convert Pixtral vision model weights to checkpoint and optionally verify and save the converted checkpoint.
+    Args:
+        checkpoint_dir (str): Path to the checkpoint directory
+        checkpoint_name (str): Name of the checkpoint
+        vit_type (str): Type of ViT used in the Pixtral model
+    This function performs the following steps:
+    0. Download the checkpoint from Hugging Face
+    1. Loads the original Pixtral checkpoint
+    2. Splits the checkpoint into vision encoder, projector, and LLM weights
+    3. Reorganizes the weights to match the expected format
+    4. Extracts and verifies the vision encoder configuration
+    5. Optionally verifies the converted checkpoint by loading it into a VisionTransformer
+    6. Optionally saves the converted checkpoint and configuration
+    """
+    save_dir = os.path.join(checkpoint_dir, checkpoint_name)
+    os.makedirs(save_dir, exist_ok=True)
+    # Save the converted checkpoint
+    save_path = os.path.join(save_dir, "model.pt")
+    if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+        print(f"Checkpoint {save_path} already exists and is not empty")
+        return
+    pixtral_ckpt_dir = os.path.join(checkpoint_dir, "Pixtral-12B-2409")
+    os.makedirs(pixtral_ckpt_dir, exist_ok=True)
+    repo_id = "mistralai/Pixtral-12B-2409"
+    print(f"Downloading {repo_id} to {pixtral_ckpt_dir}...")
+    snapshot_download(
+        repo_id=repo_id,
+        allow_patterns=["params.json", "consolidated.safetensors"],
+        local_dir=pixtral_ckpt_dir,
+        local_dir_use_symlinks=False,
+    )
+    orig_dtype = torch.get_default_dtype()
+    dtype = torch.bfloat16
+    torch.set_default_dtype(dtype)
+    # Load checkpoint file
+    ckpt_files = glob(os.path.join(pixtral_ckpt_dir, "*.safetensors"))
+    assert len(ckpt_files) == 1, "ckpt_dir should contain only one file"
+    ckpt_path = ckpt_files[0]
+    ckpt = load_file(ckpt_path)
+    # Split checkpoint into weights of vision encoder, projector, and LLM
+    vit_key_prefix = "vision_encoder."
+    vit_ckpt = {}
+    for key, value in ckpt.items():
+        if key.startswith(vit_key_prefix):
+            vit_ckpt[key.lstrip(vit_key_prefix)] = value
+    projector_key_prefix = "vision_language_adapter."
+    projector_ckpt = {}
+    substring_replacement_map = {
+        "w_in.": "projector.0.",
+        "w_out.": "projector.2.",
+    }
+    for key, value in ckpt.items():
+        if key.startswith(projector_key_prefix):
+            key = key.lstrip(projector_key_prefix)
+            for old, new in substring_replacement_map.items():
+                key = key.replace(old, new)
+            projector_ckpt[key] = value
+    llm_ckpt = {}
+    for key, value in ckpt.items():
+        if key.startswith(vit_key_prefix) or key.startswith(projector_key_prefix):
+            continue
+        llm_ckpt[key] = value
+    vlm_ckpt = {}
+    for key, value in llm_ckpt.items():
+        vlm_ckpt["model." + key] = value
+    for key, value in projector_ckpt.items():
+        vlm_ckpt["mm_projector." + key] = value
+    for key, value in vit_ckpt.items():
+        vlm_ckpt["vision_encoder." + key] = value
+    # Load config
+    config_path = os.path.join(pixtral_ckpt_dir, "params.json")
+    with open(config_path, "r") as f:
+        pixtral_config = json.load(f)
+    # Extract the vision encoder configuration
+    vision_encoder_config = {
+        "dim": pixtral_config["vision_encoder"]["hidden_size"],
+        "num_channels": pixtral_config["vision_encoder"]["num_channels"],
+        "image_size": pixtral_config["vision_encoder"]["image_size"],
+        "patch_size": pixtral_config["vision_encoder"]["patch_size"],
+        "rope_theta": pixtral_config["vision_encoder"]["rope_theta"],
+        "ffn_hidden_size": pixtral_config["vision_encoder"]["intermediate_size"],
+        "n_layers": pixtral_config["vision_encoder"]["num_hidden_layers"],
+        "n_heads": pixtral_config["vision_encoder"]["num_attention_heads"],
+        "n_kv_heads": pixtral_config["vision_encoder"]["num_attention_heads"],
+        "norm_type": "rmsnorm",
+        "norm_eps": pixtral_config["norm_eps"],
+        "image_token_id": pixtral_config["vision_encoder"]["image_token_id"],
+    }
+    # Configuration for the 400M ViT of Pixtral 12B VLM
+    vit_config = dict(
+        dim=1024,
+        num_channels=3,
+        image_size=1024,
+        patch_size=16,
+        rope_theta=10000,
+        ffn_hidden_size=4096,
+        n_layers=24,
+        n_heads=16,
+        n_kv_heads=16,
+        norm_type="rmsnorm",
+        norm_eps=1e-5,
+        image_token_id=10,
+    )
+    # Compare the two configurations
+    for key, value in vit_config.items():
+        assert vision_encoder_config[key] == value, f"Mismatch in {key}: {vision_encoder_config[key]} != {value}"
+    llm_config_keys = [
+        "dim",
+        "n_layers",
+        "head_dim",
+        "hidden_dim",
+        "n_heads",
+        "n_kv_heads",
+        "rope_theta",
+        "norm_eps",
+        "vocab_size",
+    ]
+    assert set(list(pixtral_config.keys())) == set(llm_config_keys + ["vision_encoder"]), "Config keys mismatch"
+    replace_map = {
+        "hidden_dim": "ffn_hidden_size",
+    }
+    llm_config = {}
+    for k, v in pixtral_config.items():
+        if k in llm_config_keys:
+            llm_config[replace_map.get(k, k)] = v
+        elif k == "vision_encoder":
+            llm_config["vision_encoder"] = vit_type
+        else:
+            raise ValueError(f"Unknown key: {k}")
+    ckpt_to_save = {"model": vlm_ckpt, "mm_projector": projector_ckpt, "vision_encoder": vit_ckpt}
+    torch.save(ckpt_to_save, save_path)
+    print(f"Model saved to {save_path}")
+    # Save config
+    config_path = os.path.join(save_dir, "config.json")
+    with open(config_path, "w") as f:
+        json.dump(llm_config, f)
+    torch.set_default_dtype(orig_dtype)  # Reset the default dtype
+    # Remove the original Pixtral checkpoint
+    shutil.rmtree(pixtral_ckpt_dir, ignore_errors=True)
+    print(f"Removed {pixtral_ckpt_dir}")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Convert pretrained Pixtral vision model weights to checkpoint and verify accuracy"
+    )
+    parser.add_argument("--checkpoint_dir", type=str, default="checkpoints", help="Path to the checkpoint directory")
+    parser.add_argument(
+        "--checkpoint_name",
+        type=str,
+        default="Pixtral-12B",
+        help="Name of the checkpoint",
+    )
+    parser.add_argument("--vit_type", default="pixtral-12b-vit", help="Type of ViT used in the Pixtral model")
+    args = parser.parse_args()
+    convert_pixtral_checkpoint(
+        checkpoint_dir=args.checkpoint_dir, checkpoint_name=args.checkpoint_name, vit_type=args.vit_type
+    )

cosmos1/models/POST_TRAINING.md ADDED Viewed

	@@ -0,0 +1,23 @@

+# Cosmos Post-training
+In the [Cosmos paper](https://research.nvidia.com/publication/2025-01_cosmos-world-foundation-model-platform-physical-ai), we discuss several post-training examples of Cosmos pre-trained World Foundation Models (WFMs) for various Physical AI tasks, including
+- General Post-Training: Fine-tune the WFM to generate a target distribution of videos based on the custom dataset. The target distribution could include a specific camera spec or a specific domain such as a factory.
+- Instruction Control: Post-trains models for robotic manipulation to predict videos based on textual instructions, enabling robots to visually simulate tasks like folding clothes or picking up objects.
+- Action Control: Post-trains models for robotic manipulation to predict the next visual frame based on action vectors, simulating robotic tasks like object handling or movement planning.
+- Camera Control: Adds camera pose conditioning to generate 3D-consistent video simulations from single images, enabling joystick-like navigation in virtual environments.
+- Multi-View Generation: Post-trains models for autonomous vehicles to generate synchronized multi-view videos from text prompts, simulating driving scenarios with multiple camera perspectives.
+- Multi-View Generation with Vehicle Trajectory Control: Extends multi-view generation by incorporating trajectory inputs, enabling precise simulation of driving environments for autonomous vehicles, adhering to specified paths.
+Except for the instruction control where the WFM is post-trained on a dataset of instruction-video pairs, all other cases require minor modifications of the network architectures. Post-training tasks will be supported by NeMo Framework. In this initial release, we provide post-training scripts for the general post-training of both diffusion and autoregressive WFMs. Scripts for the other post-training tasks will be provided in a future release.
+## Post-training Support Matrix
+| Post-training Task  | Diffusion WFM | Autoregressive WFM |
+|---------------------|---------------|--------------------|
+| General post-training | [Supported](../models/diffusion/nemo/post_training/README.md) | [Supported](../models/autoregressive/nemo/post_training/README.md) |
+| Instruction control | Coming soon | Coming soon |
+| Action control | Coming soon | Coming soon |
+| Camera control | Coming soon | Coming soon |
+| Multi-view generation | Coming soon | Coming soon |
+| Multi-view generation with vehicle trajectory control | Coming soon | Coming soon |

cosmos1/models/autoregressive/README.md ADDED Viewed

	@@ -0,0 +1,427 @@

+# Cosmos Autoregressive-based World Foundation Models
+## Table of Contents
+- [Getting Started](#getting-started)
+  - [Set Up Docker Environment](#set-up-docker-environment)
+  - [Download Checkpoints](#download-checkpoints)
+- [Usage](#usage)
+  - [Model Types](#model-types)
+  - [Single and Batch Generation](#single-and-batch-generation)
+  - [Sample Commands](#sample-commands)
+    - [Base Models (4B/12B)](#base-basepy-4b-and-12b)
+    - [Video2World Models (5B/13B)](#video2world-video2worldpy-5b-and-13b)
+  - [Arguments](#arguments)
+    - [Common Parameters](#common-parameters)
+    - [Base Specific Parameters](#base-specific-parameters)
+    - [Video2World Specific Parameters](#video2world-specific-parameters)
+  - [Safety Features](#safety-features)
+This page details the steps for using the Cosmos autoregressive-based world foundation models.
+## Getting Started
+### Set Up Docker Environment
+Follow our [Installation Guide](../../../INSTALL.md) to set up the Docker environment. All commands on this page should be run inside Docker.
+### Download Checkpoints
+1. Generate a [Hugging Face](https://huggingface.co/settings/tokens) access token. Set the access token to 'Read' permission (default is 'Fine-grained').
+2. Log in to Hugging Face with the access token:
+```bash
+huggingface-cli login
+```
+3. Download the Cosmos model weights from [Hugging Face](https://huggingface.co/collections/nvidia/cosmos-6751e884dc10e013a0a0d8e6):
+```bash
+PYTHONPATH=$(pwd) python cosmos1/scripts/download_autoregressive.py --model_sizes 4B 5B 12B 13B
+```
+4. The downloaded files should be in the following structure:
+```
+checkpoints/
+├── Cosmos-1.0-Autoregressive-4B
+│   ├── model.pt
+│   └── config.json
+├── Cosmos-1.0-Autoregressive-5B-Video2World
+│   ├── model.pt
+│   └── config.json
+├── Cosmos-1.0-Autoregressive-12B
+│   ├── model.pt
+│   └── config.json
+├── Cosmos-1.0-Autoregressive-13B-Video2World
+│   ├── model.pt
+│   └── config.json
+├── Cosmos-1.0-Tokenizer-CV8x8x8
+│   ├── decoder.jit
+│   ├── encoder.jit
+│   └── mean_std.pt
+├── Cosmos-1.0-Tokenizer-DV8x16x16
+│   ├── decoder.jit
+│   └── encoder.jit
+├── Cosmos-1.0-Diffusion-7B-Decoder-DV8x16x16ToCV8x8x8
+│   ├── aux_vars.pt
+│   └── model.pt
+└── Cosmos-1.0-Guardrail
+    ├── aegis/
+    ├── blocklist/
+    ├── face_blur_filter/
+    └── video_content_safety_filter/
+```
+## Usage
+### Model Types
+There are two model types available for autoregressive world generation:
+1. **Base**: Supports world generation from image/video input
+* Models: `Cosmos-1.0-Autoregressive-4B` and `Cosmos-1.0-Autoregressive-12B`
+* Inference script: [base.py](/cosmos1/models/autoregressive/inference/base.py)
+2. **Video2World**: Supports world generation from image/video input and text input
+* Models: `Cosmos-1.0-Autoregressive-5B-Video2World` and `Cosmos-1.0-Autoregressive-13B-Video2World`
+* Inference script: [video2world.py](/cosmos1/models/autoregressive/inference/video2world.py)
+Our models now support video extension up to 33 frames. Starting from either a single image or a 9-frame video input, they can generate the remaining frames to reach the 33-frame length (generating 32 or 24 frames, respectively).
+We have evaluated all eight possible configurations (4 models × 2 vision input types: image or video) using 100 test videos on physical AI topics. Below are the failure rates for each configuration:
+| Model                                      | Image input | Video input (9 frames) |
+|:------------------------------------------|:--------------:|:-------------------------:|
+| Cosmos-1.0-Autoregressive-4B              | 15%           | 1%                       |
+| Cosmos-1.0-Autoregressive-5B-Video2World  | 7%            | 2%                       |
+| Cosmos-1.0-Autoregressive-12B             | 2%            | 1%                       |
+| Cosmos-1.0-Autoregressive-13B-Video2World | 3%            | 0%                       |
+We define failure cases as videos with severe distortions, such as:
+* Sudden appearance of large unexpected objects
+* Video degrading to a single solid color
+Note that the following are not considered failures in our analysis:
+* Static video frames
+* Minor object distortions or artifacts
+### Single and Batch Generation
+We support both single and batch video generation.
+For generating a single video, `base` mode requires the input argument `--input_image_or_video_path` (image/video input), while `video2world` mode requires both `--input_image_or_video_path` (image/video input) and `--prompt` (text input).
+Note that our model only works with 1024x640 resolution videos. If the input image/video is not in this resolution, it will be resized and cropped.
+For generating a batch of videos, both `base` and `video2world` require `--batch_input_path` (path to a JSONL file). For `base`, the JSONL file should contain one visual input per line in the following format, where each line must contain a "visual_input" field:
+```json
+{"visual_input": "path/to/video1.mp4"}
+{"visual_input": "path/to/video2.mp4"}
+```
+For `video2world`, each line in the JSONL file must contain both "prompt" and "visual_input" fields:
+```json
+{"prompt": "prompt1", "visual_input": "path/to/video1.mp4"}
+{"prompt": "prompt2", "visual_input": "path/to/video2.mp4"}
+```
+### Sample Commands
+There are two main demo scripts for autoregressive world generation: `base.py` and `video2world.py`. Below you will find sample commands for single and batch generation, as well as commands for running with low-memory GPUs using model offloading. We also provide a memory usage table comparing different offloading strategies to help with configuration.
+#### Base (base.py): 4B and 12B
+Generates world from image/video input.
+The `input_type` argument can be either `video` or `image`. We have tuned the sampling parameters `top_p` and `temperature` to achieve the best performance. Please use the provided values in the command examples.
+Note that the command examples below all use video input. If you want to use image input, please change the `input_type` to `image`.
+##### Single Generation
+```bash
+# Example using 4B model
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
+    --input_type=video \
+    --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
+    --video_save_name=Cosmos-1.0-Autoregressive-4B \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-4B \
+    --top_p=0.8 \
+    --temperature=1.0
+# Example for low-memory GPUs using 4B model with model offloading
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
+    --input_type=video \
+    --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
+    --video_save_name=Cosmos-1.0-Autoregressive-4B \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-4B \
+    --top_p=0.8 \
+    --temperature=1.0 \
+    --offload_guardrail_models \
+    --offload_diffusion_decoder \
+    --offload_ar_model \
+    --offload_tokenizer
+# Example using 12B model
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
+    --input_type=video \
+    --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
+    --video_save_name=Cosmos-1.0-Autoregressive-12B \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-12B \
+    --top_p=0.9 \
+    --temperature=1.0
+# Example for low-memory GPUs using 12B model with model offloading
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
+    --input_type=video \
+    --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
+    --video_save_name=Cosmos-1.0-Autoregressive-12B \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-12B \
+    --top_p=0.9 \
+    --temperature=1.0 \
+    --offload_guardrail_models \
+    --offload_diffusion_decoder \
+    --offload_ar_model \
+    --offload_tokenizer
+```
+##### Batch Generation
+```bash
+# Example using 4B model
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
+    --input_type=video \
+    --batch_input_path=cosmos1/models/autoregressive/assets/v1p0/batch_inputs/base.jsonl \
+    --video_save_folder=outputs/Cosmos-1.0-Autoregressive-4B \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-4B \
+    --top_p=0.8 \
+    --temperature=1.0
+# Example using 12B model
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
+    --input_type=video \
+    --batch_input_path=cosmos1/models/autoregressive/assets/v1p0/batch_inputs/base.jsonl \
+    --video_save_folder=outputs/Cosmos-1.0-Autoregressive-12B \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-12B \
+    --top_p=0.9 \
+    --temperature=1.0
+```
+##### Example Output
+Here is an example output video generated using base.py with image input, using `Cosmos-1.0-Autoregressive-12B`:
+<video src="https://github.com/user-attachments/assets/634403a5-1873-42d7-8dd0-eb7fb4ac8cf4">
+  Your browser does not support the video tag.
+</video>
+The input image used to generate this video can be found in `cosmos1/models/autoregressive/assets/v1p0/input.jpg`. The image is from [BDD dataset](http://bdd-data.berkeley.edu/).
+Here is an example output video generated using base.py with 9-frame video input, using `Cosmos-1.0-Autoregressive-12B`:
+<video src="https://github.com/user-attachments/assets/1a3ff099-87d7-41e8-b149-a25cfcd4f40b">
+  Your browser does not support the video tag.
+</video>
+The input video used to generate this video can be found in `cosmos1/models/autoregressive/assets/v1p0/input.mp4`.
+##### Inference Time and GPU Memory Usage
+These numbers may vary based on system specifications and are provided for reference only.
+| Offloading Strategy | Cosmos-1.0-Autoregressive-4B | Cosmos-1.0-Autoregressive-12B |
+|-------------|---------|---------|
+| No offloading | 31.3 GB | 47.5 GB |
+| Guardrails | 28.9 GB | 45.2 GB |
+| Guardrails & Diffusion decoder | 28.5 GB | 43.1 GB |
+| Guardrails & Diffusion decoder & Tokenizer | 27.3 GB | 42.9 GB |
+| Guardrails & Diffusion decoder & Tokenizer & AR model | 18.7 GB | 27.4 GB |
+End-to-end inference runtime on one H100 without offloading and after model initialization:
+| Cosmos-1.0-Autoregressive-4B | Cosmos-1.0-Autoregressive-12B |
+|---------|---------|
+| ~62 seconds | ~119 seconds |
+#### Video2World (video2world.py): 5B and 13B
+Generates world from image/video and text input.
+The `input_type` argument can be either `text_and_video` or `text_and_image`. We have tuned the sampling parameters `top_p` and `temperature` to achieve the best performance. Please use the provided values in the command examples.
+Note that the command examples below all use video input. If you want to use image input, please change the `input_type` to `text_and_image`.
+##### Single Generation
+```bash
+# Example using 5B model
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
+    --input_type=text_and_video \
+    --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
+    --prompt="A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions." \
+    --video_save_name=Cosmos-1.0-Autoregressive-5B-Video2World \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-5B-Video2World \
+    --top_p=0.7 \
+    --temperature=1.0
+# Example for low-memory GPUs using 5B model with model offloading
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
+    --input_type=text_and_video \
+    --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
+    --prompt="A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions." \
+    --video_save_name=Cosmos-1.0-Autoregressive-5B-Video2World \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-5B-Video2World \
+    --top_p=0.7 \
+    --temperature=1.0 \
+    --offload_guardrail_models \
+    --offload_diffusion_decoder \
+    --offload_ar_model \
+    --offload_tokenizer \
+    --offload_text_encoder_model
+# Example using 13B model
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
+    --input_type=text_and_video \
+    --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
+    --prompt="A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions." \
+    --video_save_name=Cosmos-1.0-Autoregressive-13B-Video2World \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-13B-Video2World \
+    --top_p=0.8 \
+    --temperature=1.0 \
+    --offload_guardrail_models
+# Example for low-memory GPUs using 13B model with model offloading
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
+    --input_type=text_and_video \
+    --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
+    --prompt="A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions." \
+    --video_save_name=Cosmos-1.0-Autoregressive-13B-Video2World \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-13B-Video2World \
+    --top_p=0.8 \
+    --temperature=1.0 \
+    --offload_guardrail_models \
+    --offload_diffusion_decoder \
+    --offload_ar_model \
+    --offload_tokenizer \
+    --offload_text_encoder_model
+```
+##### Batch Generation
+```bash
+# Example using 5B model
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
+    --input_type=text_and_video \
+    --batch_input_path=cosmos1/models/autoregressive/assets/v1p0/batch_inputs/video2world.jsonl \
+    --video_save_folder=outputs/Cosmos-1.0-Autoregressive-5B-Video2World \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-5B-Video2World \
+    --top_p=0.7 \
+    --temperature=1.0
+# Example using 13B model
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
+    --input_type=text_and_video \
+    --batch_input_path=cosmos1/models/autoregressive/assets/v1p0/batch_inputs/video2world.jsonl \
+    --video_save_folder=outputs/Cosmos-1.0-Autoregressive-13B-Video2World \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-13B-Video2World \
+    --top_p=0.8 \
+    --temperature=1.0 \
+    --offload_guardrail_models
+```
+##### Example Output
+Here is an example output video generated using video2world.py with image input, using `Cosmos-1.0-Autoregressive-13B-Video2World`:
+<video src="https://github.com/user-attachments/assets/869f3b81-fabd-462e-a545-c04cdd9c1d22">
+  Your browser does not support the video tag.
+</video>
+The input image used to generate this video can be found in `cosmos1/models/autoregressive/assets/v1p0/input.jpg`. The prompt for generating the video is:
+```
+A driving video captures a serene urban street scene on a sunny day. The camera is mounted on the dashboard of a moving vehicle, providing a first-person perspective as it travels down a two-lane road. The street is lined with parked cars on both sides, predominantly black and silver sedans and SUVs. The road is flanked by a mix of residential and commercial buildings, with a prominent red-brick building on the left side, featuring multiple windows and a flat roof. The sky is clear with a few scattered clouds, casting soft shadows on the street. Trees with lush green foliage line the right side of the road, providing a natural contrast to the urban environment. The camera remains steady, maintaining a consistent forward motion, suggesting a leisurely drive. Traffic is light, with a few vehicles moving in the opposite direction, including a black sedan and a yellow taxi. Street signs are visible, including a no-parking sign on the right. The overall atmosphere is calm and peaceful, with no pedestrians visible, emphasizing the focus on the drive and the surrounding urban landscape.
+```
+Here is an example output video generated using video2world.py with 9-frame video input, using `Cosmos-1.0-Autoregressive-13B-Video2World`:
+<video src="https://github.com/user-attachments/assets/81840e1c-624b-4b01-9240-ab7db3722e58">
+  Your browser does not support the video tag.
+</video>
+The input video used to generate this video can be found in `cosmos1/models/autoregressive/assets/v1p0/input.mp4`. The prompt for generating the video is:
+```
+A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.
+```
+##### Inference Time and GPU Memory Usage
+These numbers may vary based on system specifications and are provided for reference only.
+| Offloading Strategy | Cosmos-1.0-Autoregressive-5B-Video2World | Cosmos-1.0-Autoregressive-13B-Video2World |
+|-------------|---------|---------|
+| No offloading | 66.2 GB | > 80 GB |
+| Guardrails | 58.7 GB | 76.6 GB |
+| Guardrails & T5 encoder | 41.3 GB | 58.0 GB |
+| Guardrails & T5 encoder & Diffusion decoder | 29.0 GB | 46.9 GB |
+| Guardrails & T5 encoder & Diffusion decoder & Tokenizer | 28.8 GB | 46.7 GB |
+| Guardrails & T5 encoder & Diffusion decoder & Tokenizer & AR model | 21.1 GB | 30.9 GB |
+End-to-end inference runtime on one H100 with no offloading for 5B model and guardrail offloading for 13B, after model initialization:
+| Cosmos-1.0-Autoregressive-5B-Video2World | Cosmos-1.0-Autoregressive-13B-Video2World |
+|---------|---------|
+| ~73 seconds | ~150 seconds |
+### Arguments
+#### Common Parameters
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `--checkpoint_dir` | Directory containing model weights | "checkpoints" |
+| `--video_save_name` | Output video filename for single video generation | "output" |
+| `--video_save_folder` | Folder where all output videos are stored | "outputs/" |
+| `--input_image_or_video_path` | Input image or video path. Required for single video generation | None |
+| `--batch_input_path` | Path to the JSONL file listing the inputs (images or videos) for batch generation, as in the batch examples above. Required for batch video generation | None |
+| `--num_input_frames` | Number of input frames to use for Video2World prediction | 9 |
+| `--temperature` | Temperature used while sampling | 1.0 (recommend using values in sample commands provided) |
+| `--top_p` | Top-p value for top-p sampling | 0.8 (recommend using values in sample commands provided) |
+| `--seed` | Random seed | 0 |
+| `--disable_diffusion_decoder` | When set to True, use discrete tokenizer to decode discrete tokens to video. Otherwise, use diffusion decoder to decode video | False |
+| `--offload_guardrail_models` | Offload guardrail models after inference, used for low-memory GPUs | False |
+| `--offload_diffusion_decoder` | Offload diffusion decoder after inference, used for low-memory GPUs | False |
+| `--offload_ar_model` | Offload AR model after inference, used for low-memory GPUs | False |
+| `--offload_prompt_upsampler` | Offload prompt upsampler after inference, used for low-memory GPUs | False |
+#### Base Specific Parameters
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `--ar_model_dir` | Directory containing AR model weights | "Cosmos-1.0-Autoregressive-4B" |
+| `--input_type` | Input type, either `video` or `image` | "video" |
+#### Video2World Specific Parameters
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `--ar_model_dir` | Directory containing AR model weights | "Cosmos-1.0-Autoregressive-4B" |
+| `--input_type` | Input type, either `text_and_video` or `text_and_image` | "text_and_video" |
+| `--prompt` | Text prompt for single video generation. Required for single video generation | None |
+| `--input_prompts_path` | Path to JSONL file for batch video generation. Required for batch video generation | None |
+| `--offload_text_encoder_model` | Offload text encoder after inference, used for low-memory GPUs | False |
+### Safety Features
+The model uses a built-in safety guardrail system that cannot be disabled. Generating human faces is not allowed; any faces that appear in the output will be blurred by the guardrail.
+For more information, check out the [Cosmos Guardrail Documentation](../guardrail/README.md).

cosmos1/models/autoregressive/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

cosmos1/models/autoregressive/assets/nemo/finetuned_result.mp4 ADDED Viewed

Binary file (193 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/0.mp4 ADDED Viewed

Binary file (299 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/1.mp4 ADDED Viewed

Binary file (222 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/2.mp4 ADDED Viewed

Binary file (511 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/3.mp4 ADDED Viewed

Binary file (461 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/4.mp4 ADDED Viewed

Binary file (331 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/5.mp4 ADDED Viewed

Binary file (282 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/6.mp4 ADDED Viewed

Binary file (289 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/7.mp4 ADDED Viewed

Binary file (170 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/8.mp4 ADDED Viewed

Binary file (188 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/9.mp4 ADDED Viewed

Binary file (174 kB). View file