Spaces: Running on Zero

MohamedRashad committed
Commit 6dd488f • Parent(s): 42373a7

Upload code
- .gitignore +168 -0
- LICENSE +201 -0
- diffusers_helper/cat_cond.py +24 -0
- diffusers_helper/code_cond.py +34 -0
- diffusers_helper/k_diffusion.py +145 -0
- diffusers_helper/utils.py +136 -0
- diffusers_vdm/attention.py +385 -0
- diffusers_vdm/basics.py +148 -0
- diffusers_vdm/dynamic_tsnr_sampler.py +177 -0
- diffusers_vdm/improved_clip_vision.py +58 -0
- diffusers_vdm/pipeline.py +188 -0
- diffusers_vdm/projection.py +160 -0
- diffusers_vdm/unet.py +650 -0
- diffusers_vdm/utils.py +43 -0
- diffusers_vdm/vae.py +826 -0
- gradio_app.py +324 -0
- imgs/1.jpg +0 -0
- imgs/2.jpg +0 -0
- imgs/3.jpg +0 -0
- memory_management.py +67 -0
- requirements.txt +19 -0
- wd14tagger.py +105 -0
.gitignore ADDED
@@ -0,0 +1,168 @@
hf_token.txt
hf_download/
results/
*.csv
*.onnx

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
#   .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
LICENSE ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
diffusers_helper/cat_cond.py ADDED
@@ -0,0 +1,24 @@
import torch


def unet_add_concat_conds(unet, new_channels=4):
    with torch.no_grad():
        new_conv_in = torch.nn.Conv2d(4 + new_channels, unet.conv_in.out_channels, unet.conv_in.kernel_size, unet.conv_in.stride, unet.conv_in.padding)
        new_conv_in.weight.zero_()
        new_conv_in.weight[:, :4, :, :].copy_(unet.conv_in.weight)
        new_conv_in.bias = unet.conv_in.bias
        unet.conv_in = new_conv_in

    unet_original_forward = unet.forward

    def hooked_unet_forward(sample, timestep, encoder_hidden_states, **kwargs):
        cross_attention_kwargs = {k: v for k, v in kwargs['cross_attention_kwargs'].items()}
        c_concat = cross_attention_kwargs.pop('concat_conds')
        kwargs['cross_attention_kwargs'] = cross_attention_kwargs

        c_concat = torch.cat([c_concat] * (sample.shape[0] // c_concat.shape[0]), dim=0).to(sample)
        new_sample = torch.cat([sample, c_concat], dim=1)
        return unet_original_forward(new_sample, timestep, encoder_hidden_states, **kwargs)

    unet.forward = hooked_unet_forward
    return
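Note (not part of the commit): a minimal, torch-only sketch of the tensor manipulation the hooked forward above performs before calling the original UNet — broadcast the conditioning latent over the batch and concatenate it along the channel axis, which is why conv_in is widened to 4 + new_channels input channels. The shapes are illustrative assumptions.

    import torch

    sample = torch.randn(2, 4, 64, 64)        # latent batch (e.g. positive + negative CFG branch)
    concat_conds = torch.randn(1, 4, 64, 64)  # single conditioning latent, broadcast over the batch

    c = torch.cat([concat_conds] * (sample.shape[0] // concat_conds.shape[0]), dim=0).to(sample)
    new_sample = torch.cat([sample, c], dim=1)
    print(new_sample.shape)  # torch.Size([2, 8, 64, 64]) -> matches the widened conv_in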
diffusers_helper/code_cond.py ADDED
@@ -0,0 +1,34 @@
import torch

from diffusers.models.embeddings import TimestepEmbedding, Timesteps


def unet_add_coded_conds(unet, added_number_count=1):
    unet.add_time_proj = Timesteps(256, True, 0)
    unet.add_embedding = TimestepEmbedding(256 * added_number_count, 1280)

    def get_aug_embed(emb, encoder_hidden_states, added_cond_kwargs):
        coded_conds = added_cond_kwargs.get("coded_conds")
        batch_size = coded_conds.shape[0]
        time_embeds = unet.add_time_proj(coded_conds.flatten())
        time_embeds = time_embeds.reshape((batch_size, -1))
        time_embeds = time_embeds.to(emb)
        aug_emb = unet.add_embedding(time_embeds)
        return aug_emb

    unet.get_aug_embed = get_aug_embed

    unet_original_forward = unet.forward

    def hooked_unet_forward(sample, timestep, encoder_hidden_states, **kwargs):
        cross_attention_kwargs = {k: v for k, v in kwargs['cross_attention_kwargs'].items()}
        coded_conds = cross_attention_kwargs.pop('coded_conds')
        kwargs['cross_attention_kwargs'] = cross_attention_kwargs

        coded_conds = torch.cat([coded_conds] * (sample.shape[0] // coded_conds.shape[0]), dim=0).to(sample.device)
        kwargs['added_cond_kwargs'] = dict(coded_conds=coded_conds)
        return unet_original_forward(sample, timestep, encoder_hidden_states, **kwargs)

    unet.forward = hooked_unet_forward

    return
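Note (not part of the commit): a shape sketch of how get_aug_embed above turns the extra condition codes into a 1280-d embedding that is added to the UNet time embedding. The example code values are assumptions; only the diffusers embedding classes already imported by the file are used.

    import torch
    from diffusers.models.embeddings import TimestepEmbedding, Timesteps

    added_number_count = 1
    add_time_proj = Timesteps(256, True, 0)
    add_embedding = TimestepEmbedding(256 * added_number_count, 1280)

    coded_conds = torch.tensor([[3], [7]])                         # (batch, added_number_count)
    time_embeds = add_time_proj(coded_conds.flatten())             # (batch * count, 256) sinusoidal features
    time_embeds = time_embeds.reshape((coded_conds.shape[0], -1))  # (batch, 256 * count)
    aug_emb = add_embedding(time_embeds)
    print(aug_emb.shape)  # torch.Size([2, 1280])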
diffusers_helper/k_diffusion.py ADDED
@@ -0,0 +1,145 @@
import torch
import numpy as np

from tqdm import tqdm


@torch.no_grad()
def sample_dpmpp_2m(model, x, sigmas, extra_args=None, callback=None, progress_tqdm=None):
    """DPM-Solver++(2M)."""
    extra_args = {} if extra_args is None else extra_args
    s_in = x.new_ones([x.shape[0]])
    sigma_fn = lambda t: t.neg().exp()
    t_fn = lambda sigma: sigma.log().neg()
    old_denoised = None

    bar = tqdm if progress_tqdm is None else progress_tqdm

    for i in bar(range(len(sigmas) - 1)):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
        t, t_next = t_fn(sigmas[i]), t_fn(sigmas[i + 1])
        h = t_next - t
        if old_denoised is None or sigmas[i + 1] == 0:
            x = (sigma_fn(t_next) / sigma_fn(t)) * x - (-h).expm1() * denoised
        else:
            h_last = t - t_fn(sigmas[i - 1])
            r = h_last / h
            denoised_d = (1 + 1 / (2 * r)) * denoised - (1 / (2 * r)) * old_denoised
            x = (sigma_fn(t_next) / sigma_fn(t)) * x - (-h).expm1() * denoised_d
        old_denoised = denoised
    return x


class KModel:
    def __init__(self, unet, timesteps=1000, linear_start=0.00085, linear_end=0.012, linear=False):
        if linear:
            betas = torch.linspace(linear_start, linear_end, timesteps, dtype=torch.float64)
        else:
            betas = torch.linspace(linear_start ** 0.5, linear_end ** 0.5, timesteps, dtype=torch.float64) ** 2

        alphas = 1. - betas
        alphas_cumprod = torch.tensor(np.cumprod(alphas, axis=0), dtype=torch.float32)

        self.sigmas = ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5
        self.log_sigmas = self.sigmas.log()
        self.sigma_data = 1.0
        self.unet = unet
        return

    @property
    def sigma_min(self):
        return self.sigmas[0]

    @property
    def sigma_max(self):
        return self.sigmas[-1]

    def timestep(self, sigma):
        log_sigma = sigma.log()
        dists = log_sigma.to(self.log_sigmas.device) - self.log_sigmas[:, None]
        return dists.abs().argmin(dim=0).view(sigma.shape).to(sigma.device)

    def get_sigmas_karras(self, n, rho=7.):
        ramp = torch.linspace(0, 1, n)
        min_inv_rho = self.sigma_min ** (1 / rho)
        max_inv_rho = self.sigma_max ** (1 / rho)
        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
        return torch.cat([sigmas, sigmas.new_zeros([1])])

    def __call__(self, x, sigma, **extra_args):
        x_ddim_space = x / (sigma[:, None, None, None] ** 2 + self.sigma_data ** 2) ** 0.5
        x_ddim_space = x_ddim_space.to(dtype=self.unet.dtype)
        t = self.timestep(sigma)
        cfg_scale = extra_args['cfg_scale']
        eps_positive = self.unet(x_ddim_space, t, return_dict=False, **extra_args['positive'])[0]
        eps_negative = self.unet(x_ddim_space, t, return_dict=False, **extra_args['negative'])[0]
        noise_pred = eps_negative + cfg_scale * (eps_positive - eps_negative)
        return x - noise_pred * sigma[:, None, None, None]


class KDiffusionSampler:
    def __init__(self, unet, **kwargs):
        self.unet = unet
        self.k_model = KModel(unet=unet, **kwargs)

    @torch.inference_mode()
    def __call__(
            self,
            initial_latent = None,
            strength = 1.0,
            num_inference_steps = 25,
            guidance_scale = 5.0,
            batch_size = 1,
            generator = None,
            prompt_embeds = None,
            negative_prompt_embeds = None,
            cross_attention_kwargs = None,
            same_noise_in_batch = False,
            progress_tqdm = None,
    ):

        device = self.unet.device

        # Sigmas

        sigmas = self.k_model.get_sigmas_karras(int(num_inference_steps/strength))
        sigmas = sigmas[-(num_inference_steps + 1):].to(device)

        # Initial latents

        if same_noise_in_batch:
            noise = torch.randn(initial_latent.shape, generator=generator, device=device, dtype=self.unet.dtype).repeat(batch_size, 1, 1, 1)
            initial_latent = initial_latent.repeat(batch_size, 1, 1, 1).to(device=device, dtype=self.unet.dtype)
        else:
            initial_latent = initial_latent.repeat(batch_size, 1, 1, 1).to(device=device, dtype=self.unet.dtype)
            noise = torch.randn(initial_latent.shape, generator=generator, device=device, dtype=self.unet.dtype)

        latents = initial_latent + noise * sigmas[0].to(initial_latent)

        # Batch

        latents = latents.to(device)
        prompt_embeds = prompt_embeds.repeat(batch_size, 1, 1).to(device)
        negative_prompt_embeds = negative_prompt_embeds.repeat(batch_size, 1, 1).to(device)

        # Feeds

        sampler_kwargs = dict(
            cfg_scale=guidance_scale,
            positive=dict(
                encoder_hidden_states=prompt_embeds,
                cross_attention_kwargs=cross_attention_kwargs
            ),
            negative=dict(
                encoder_hidden_states=negative_prompt_embeds,
                cross_attention_kwargs=cross_attention_kwargs,
            )
        )

        # Sample

        results = sample_dpmpp_2m(self.k_model, latents, sigmas, extra_args=sampler_kwargs, progress_tqdm=progress_tqdm)

        return results
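Note (not part of the commit): a small sketch of inspecting the Karras sigma schedule that KDiffusionSampler feeds to sample_dpmpp_2m. Passing unet=None is an assumption that works only because the UNet is untouched until denoising; it also assumes the repository root is on PYTHONPATH.

    from diffusers_helper.k_diffusion import KModel

    k = KModel(unet=None)              # schedule only; no denoising performed here
    sigmas = k.get_sigmas_karras(25)   # 25 Karras-spaced sigmas plus a trailing zero
    print(sigmas.shape, float(sigmas.max()), float(sigmas.min()))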
diffusers_helper/utils.py ADDED
@@ -0,0 +1,136 @@
import os
import json
import random
import glob
import torch
import einops
import torchvision

import safetensors.torch as sf


def write_to_json(data, file_path):
    temp_file_path = file_path + ".tmp"
    with open(temp_file_path, 'wt', encoding='utf-8') as temp_file:
        json.dump(data, temp_file, indent=4)
    os.replace(temp_file_path, file_path)
    return


def read_from_json(file_path):
    with open(file_path, 'rt', encoding='utf-8') as file:
        data = json.load(file)
    return data


def get_active_parameters(m):
    return {k:v for k, v in m.named_parameters() if v.requires_grad}


def cast_training_params(m, dtype=torch.float32):
    for param in m.parameters():
        if param.requires_grad:
            param.data = param.to(dtype)
    return


def set_attr_recursive(obj, attr, value):
    attrs = attr.split(".")
    for name in attrs[:-1]:
        obj = getattr(obj, name)
    setattr(obj, attrs[-1], value)
    return


@torch.no_grad()
def batch_mixture(a, b, probability_a=0.5, mask_a=None):
    assert a.shape == b.shape, "Tensors must have the same shape"
    batch_size = a.size(0)

    if mask_a is None:
        mask_a = torch.rand(batch_size) < probability_a

    mask_a = mask_a.to(a.device)
    mask_a = mask_a.reshape((batch_size,) + (1,) * (a.dim() - 1))
    result = torch.where(mask_a, a, b)
    return result


@torch.no_grad()
def zero_module(module):
    for p in module.parameters():
        p.detach().zero_()
    return module


def load_last_state(model, folder='accelerator_output'):
    file_pattern = os.path.join(folder, '**', 'model.safetensors')
    files = glob.glob(file_pattern, recursive=True)

    if not files:
        print("No model.safetensors files found in the specified folder.")
        return

    newest_file = max(files, key=os.path.getmtime)
    state_dict = sf.load_file(newest_file)
    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)

    if missing_keys:
        print("Missing keys:", missing_keys)
    if unexpected_keys:
        print("Unexpected keys:", unexpected_keys)

    print("Loaded model state from:", newest_file)
    return


def generate_random_prompt_from_tags(tags_str, min_length=3, max_length=32):
    tags = tags_str.split(', ')
    tags = random.sample(tags, k=min(random.randint(min_length, max_length), len(tags)))
    prompt = ', '.join(tags)
    return prompt


def save_bcthw_as_mp4(x, output_filename, fps=10):
    b, c, t, h, w = x.shape

    per_row = b
    for p in [6, 5, 4, 3, 2]:
        if b % p == 0:
            per_row = p
            break

    os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
    x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5
    x = x.detach().cpu().to(torch.uint8)
    x = einops.rearrange(x, '(m n) c t h w -> t (m h) (n w) c', n=per_row)
    torchvision.io.write_video(output_filename, x, fps=fps, video_codec='h264', options={'crf': '0'})
    return x


def save_bcthw_as_png(x, output_filename):
    os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
    x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5
    x = x.detach().cpu().to(torch.uint8)
    x = einops.rearrange(x, 'b c t h w -> c (b h) (t w)')
    torchvision.io.write_png(x, output_filename)
    return output_filename


def add_tensors_with_padding(tensor1, tensor2):
    if tensor1.shape == tensor2.shape:
        return tensor1 + tensor2

    shape1 = tensor1.shape
    shape2 = tensor2.shape

    new_shape = tuple(max(s1, s2) for s1, s2 in zip(shape1, shape2))

    padded_tensor1 = torch.zeros(new_shape)
    padded_tensor2 = torch.zeros(new_shape)

    padded_tensor1[tuple(slice(0, s) for s in shape1)] = tensor1
    padded_tensor2[tuple(slice(0, s) for s in shape2)] = tensor2

    result = padded_tensor1 + padded_tensor2
    return result
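Note (not part of the commit): a tiny usage sketch for generate_random_prompt_from_tags, which just samples a random subset of a comma-separated tag string (presumably the wd14tagger output, though that pairing is an assumption). It assumes the repository root is on PYTHONPATH; the tag string below is illustrative.

    from diffusers_helper.utils import generate_random_prompt_from_tags

    tags = "1girl, solo, long hair, looking at viewer, smile, outdoors"
    print(generate_random_prompt_from_tags(tags, min_length=2, max_length=4))
    # e.g. "smile, outdoors, solo" -- a random 2-4 tag subset, order shuffled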
diffusers_vdm/attention.py ADDED
@@ -0,0 +1,385 @@
import torch
import xformers.ops
import torch.nn.functional as F

from torch import nn
from einops import rearrange, repeat
from functools import partial
from diffusers_vdm.basics import zero_module, checkpoint, default, make_temporal_window


def sdp(q, k, v, heads):
    b, _, C = q.shape
    dim_head = C // heads

    q, k, v = map(
        lambda t: t.unsqueeze(3)
        .reshape(b, t.shape[1], heads, dim_head)
        .permute(0, 2, 1, 3)
        .reshape(b * heads, t.shape[1], dim_head)
        .contiguous(),
        (q, k, v),
    )

    out = xformers.ops.memory_efficient_attention(q, k, v)

    out = (
        out.unsqueeze(0)
        .reshape(b, heads, out.shape[1], dim_head)
        .permute(0, 2, 1, 3)
        .reshape(b, out.shape[1], heads * dim_head)
    )

    return out


class RelativePosition(nn.Module):
    """ https://github.com/evelinehong/Transformer_Relative_Position_PyTorch/blob/master/relative_position.py """

    def __init__(self, num_units, max_relative_position):
        super().__init__()
        self.num_units = num_units
        self.max_relative_position = max_relative_position
        self.embeddings_table = nn.Parameter(torch.Tensor(max_relative_position * 2 + 1, num_units))
        nn.init.xavier_uniform_(self.embeddings_table)

    def forward(self, length_q, length_k):
        device = self.embeddings_table.device
        range_vec_q = torch.arange(length_q, device=device)
        range_vec_k = torch.arange(length_k, device=device)
        distance_mat = range_vec_k[None, :] - range_vec_q[:, None]
        distance_mat_clipped = torch.clamp(distance_mat, -self.max_relative_position, self.max_relative_position)
        final_mat = distance_mat_clipped + self.max_relative_position
        final_mat = final_mat.long()
        embeddings = self.embeddings_table[final_mat]
        return embeddings


class CrossAttention(nn.Module):

    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.,
                 relative_position=False, temporal_length=None, video_length=None, image_cross_attention=False,
                 image_cross_attention_scale=1.0, image_cross_attention_scale_learnable=False,
                 text_context_len=77, temporal_window_for_spatial_self_attention=False):
        super().__init__()
        inner_dim = dim_head * heads
        context_dim = default(context_dim, query_dim)

        self.scale = dim_head**-0.5
        self.heads = heads
        self.dim_head = dim_head
        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)

        self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))

        self.is_temporal_attention = temporal_length is not None

        self.relative_position = relative_position
        if self.relative_position:
            assert self.is_temporal_attention
            self.relative_position_k = RelativePosition(num_units=dim_head, max_relative_position=temporal_length)
            self.relative_position_v = RelativePosition(num_units=dim_head, max_relative_position=temporal_length)

        self.video_length = video_length
        self.temporal_window_for_spatial_self_attention = temporal_window_for_spatial_self_attention
        self.temporal_window_type = 'prv'

        self.image_cross_attention = image_cross_attention
        self.image_cross_attention_scale = image_cross_attention_scale
        self.text_context_len = text_context_len
        self.image_cross_attention_scale_learnable = image_cross_attention_scale_learnable
        if self.image_cross_attention:
            self.to_k_ip = nn.Linear(context_dim, inner_dim, bias=False)
            self.to_v_ip = nn.Linear(context_dim, inner_dim, bias=False)
            if image_cross_attention_scale_learnable:
                self.register_parameter('alpha', nn.Parameter(torch.tensor(0.)))

    def forward(self, x, context=None, mask=None):
        if self.is_temporal_attention:
            return self.temporal_forward(x, context=context, mask=mask)
        else:
            return self.spatial_forward(x, context=context, mask=mask)

    def temporal_forward(self, x, context=None, mask=None):
        assert mask is None, 'Attention mask not implemented!'
        assert context is None, 'Temporal attention only supports self attention!'

        q = self.to_q(x)
        k = self.to_k(x)
        v = self.to_v(x)

        out = sdp(q, k, v, self.heads)

        return self.to_out(out)

    def spatial_forward(self, x, context=None, mask=None):
        assert mask is None, 'Attention mask not implemented!'

        spatial_self_attn = (context is None)
        k_ip, v_ip, out_ip = None, None, None

        q = self.to_q(x)
        context = default(context, x)

        if spatial_self_attn:
            k = self.to_k(context)
            v = self.to_v(context)

            if self.temporal_window_for_spatial_self_attention:
                k = make_temporal_window(k, t=self.video_length, method=self.temporal_window_type)
                v = make_temporal_window(v, t=self.video_length, method=self.temporal_window_type)
        elif self.image_cross_attention:
            context, context_image = context
            k = self.to_k(context)
            v = self.to_v(context)
            k_ip = self.to_k_ip(context_image)
            v_ip = self.to_v_ip(context_image)
        else:
            raise NotImplementedError('Traditional prompt-only attention without IP-Adapter is illegal now.')

        out = sdp(q, k, v, self.heads)

        if k_ip is not None:
            out_ip = sdp(q, k_ip, v_ip, self.heads)

            if self.image_cross_attention_scale_learnable:
                out = out + self.image_cross_attention_scale * out_ip * (torch.tanh(self.alpha) + 1)
            else:
                out = out + self.image_cross_attention_scale * out_ip

        return self.to_out(out)


class BasicTransformerBlock(nn.Module):

    def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True,
                 disable_self_attn=False, attention_cls=None, video_length=None, image_cross_attention=False, image_cross_attention_scale=1.0, image_cross_attention_scale_learnable=False, text_context_len=77):
        super().__init__()
        attn_cls = CrossAttention if attention_cls is None else attention_cls
        self.disable_self_attn = disable_self_attn
        self.attn1 = attn_cls(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout,
                              context_dim=context_dim if self.disable_self_attn else None, video_length=video_length)
        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
        self.attn2 = attn_cls(query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head, dropout=dropout, video_length=video_length, image_cross_attention=image_cross_attention, image_cross_attention_scale=image_cross_attention_scale, image_cross_attention_scale_learnable=image_cross_attention_scale_learnable, text_context_len=text_context_len)
        self.image_cross_attention = image_cross_attention

        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)
        self.checkpoint = checkpoint


    def forward(self, x, context=None, mask=None, **kwargs):
        ## implementation tricks: because checkpointing doesn't support non-tensor (e.g. None or scalar) arguments
        input_tuple = (x,)  ## should not be (x), otherwise *input_tuple will decouple x into multiple arguments
        if context is not None:
            input_tuple = (x, context)
        if mask is not None:
            forward_mask = partial(self._forward, mask=mask)
            return checkpoint(forward_mask, (x,), self.parameters(), self.checkpoint)
        return checkpoint(self._forward, input_tuple, self.parameters(), self.checkpoint)


    def _forward(self, x, context=None, mask=None):
        x = self.attn1(self.norm1(x), context=context if self.disable_self_attn else None, mask=mask) + x
        x = self.attn2(self.norm2(x), context=context, mask=mask) + x
        x = self.ff(self.norm3(x)) + x
        return x


class SpatialTransformer(nn.Module):
    """
    Transformer block for image-like data in spatial axis.
    First, project the input (aka embedding)
    and reshape to b, t, d.
    Then apply standard transformer action.
    Finally, reshape to image
    NEW: use_linear for more efficiency instead of the 1x1 convs
    """

    def __init__(self, in_channels, n_heads, d_head, depth=1, dropout=0., context_dim=None,
                 use_checkpoint=True, disable_self_attn=False, use_linear=False, video_length=None,
                 image_cross_attention=False, image_cross_attention_scale_learnable=False):
        super().__init__()
        self.in_channels = in_channels
        inner_dim = n_heads * d_head
        self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
        if not use_linear:
            self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
        else:
            self.proj_in = nn.Linear(in_channels, inner_dim)

        attention_cls = None
        self.transformer_blocks = nn.ModuleList([
            BasicTransformerBlock(
                inner_dim,
                n_heads,
                d_head,
                dropout=dropout,
                context_dim=context_dim,
                disable_self_attn=disable_self_attn,
                checkpoint=use_checkpoint,
                attention_cls=attention_cls,
                video_length=video_length,
                image_cross_attention=image_cross_attention,
                image_cross_attention_scale_learnable=image_cross_attention_scale_learnable,
            ) for d in range(depth)
        ])
        if not use_linear:
            self.proj_out = zero_module(nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0))
        else:
            self.proj_out = zero_module(nn.Linear(inner_dim, in_channels))
        self.use_linear = use_linear


    def forward(self, x, context=None, **kwargs):
        b, c, h, w = x.shape
        x_in = x
        x = self.norm(x)
        if not self.use_linear:
            x = self.proj_in(x)
        x = rearrange(x, 'b c h w -> b (h w) c').contiguous()
        if self.use_linear:
            x = self.proj_in(x)
        for i, block in enumerate(self.transformer_blocks):
            x = block(x, context=context, **kwargs)
        if self.use_linear:
            x = self.proj_out(x)
        x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
        if not self.use_linear:
            x = self.proj_out(x)
        return x + x_in


class TemporalTransformer(nn.Module):
    """
    Transformer block for image-like data in temporal axis.
    First, reshape to b, t, d.
    Then apply standard transformer action.
    Finally, reshape to image
    """
    def __init__(self, in_channels, n_heads, d_head, depth=1, dropout=0., context_dim=None,
                 use_checkpoint=True, use_linear=False, only_self_att=True, causal_attention=False, causal_block_size=1,
                 relative_position=False, temporal_length=None):
        super().__init__()
        self.only_self_att = only_self_att
        self.relative_position = relative_position
        self.causal_attention = causal_attention
        self.causal_block_size = causal_block_size

        self.in_channels = in_channels
        inner_dim = n_heads * d_head
        self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
        self.proj_in = nn.Conv1d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
        if not use_linear:
            self.proj_in = nn.Conv1d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
        else:
            self.proj_in = nn.Linear(in_channels, inner_dim)

        if relative_position:
            assert(temporal_length is not None)
            attention_cls = partial(CrossAttention, relative_position=True, temporal_length=temporal_length)
        else:
            attention_cls = partial(CrossAttention, temporal_length=temporal_length)
        if self.causal_attention:
            assert(temporal_length is not None)
            self.mask = torch.tril(torch.ones([1, temporal_length, temporal_length]))

        if self.only_self_att:
            context_dim = None
        self.transformer_blocks = nn.ModuleList([
            BasicTransformerBlock(
                inner_dim,
                n_heads,
                d_head,
                dropout=dropout,
                context_dim=context_dim,
                attention_cls=attention_cls,
                checkpoint=use_checkpoint) for d in range(depth)
        ])
        if not use_linear:
            self.proj_out = zero_module(nn.Conv1d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0))
        else:
            self.proj_out = zero_module(nn.Linear(inner_dim, in_channels))
        self.use_linear = use_linear

    def forward(self, x, context=None):
        b, c, t, h, w = x.shape
        x_in = x
        x = self.norm(x)
        x = rearrange(x, 'b c t h w -> (b h w) c t').contiguous()
        if not self.use_linear:
            x = self.proj_in(x)
        x = rearrange(x, 'bhw c t -> bhw t c').contiguous()
        if self.use_linear:
            x = self.proj_in(x)

        temp_mask = None
        if self.causal_attention:
            # slice the from mask map
            temp_mask = self.mask[:, :t, :t].to(x.device)

        if temp_mask is not None:
            mask = temp_mask.to(x.device)
            mask = repeat(mask, 'l i j -> (l bhw) i j', bhw=b*h*w)
        else:
            mask = None

        if self.only_self_att:
            ## note: if no context is given, cross-attention defaults to self-attention
            for i, block in enumerate(self.transformer_blocks):
                x = block(x, mask=mask)
            x = rearrange(x, '(b hw) t c -> b hw t c', b=b).contiguous()
        else:
            x = rearrange(x, '(b hw) t c -> b hw t c', b=b).contiguous()
            context = rearrange(context, '(b t) l con -> b t l con', t=t).contiguous()
            for i, block in enumerate(self.transformer_blocks):
                # calculate each batch one by one (since number in shape could not greater then 65,535 for some package)
                for j in range(b):
                    context_j = repeat(
                        context[j],
                        't l con -> (t r) l con', r=(h * w) // t, t=t).contiguous()
                    ## note: causal mask will not applied in cross-attention case
                    x[j] = block(x[j], context=context_j)

        if self.use_linear:
            x = self.proj_out(x)
            x = rearrange(x, 'b (h w) t c -> b c t h w', h=h, w=w).contiguous()
        if not self.use_linear:
            x = rearrange(x, 'b hw t c -> (b hw) c t').contiguous()
            x = self.proj_out(x)
            x = rearrange(x, '(b h w) c t -> b c t h w', b=b, h=h, w=w).contiguous()

        return x + x_in


class GEGLU(nn.Module):
    def __init__(self, dim_in, dim_out):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out * 2)

    def forward(self, x):
        x, gate = self.proj(x).chunk(2, dim=-1)
        return x * F.gelu(gate)


class FeedForward(nn.Module):
    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
        super().__init__()
        inner_dim = int(dim * mult)
        dim_out = default(dim_out, dim)
        project_in = nn.Sequential(
            nn.Linear(dim, inner_dim),
            nn.GELU()
        ) if not glu else GEGLU(dim, inner_dim)

        self.net = nn.Sequential(
            project_in,
            nn.Dropout(dropout),
            nn.Linear(inner_dim, dim_out)
        )

    def forward(self, x):
        return self.net(x)
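Note (not part of the commit): a torch-only sketch of the head split/merge that sdp() performs, with PyTorch's built-in scaled_dot_product_attention standing in for xformers.ops.memory_efficient_attention (an equivalence assumed for illustration; the real file requires xformers). Shapes are arbitrary.

    import torch
    import torch.nn.functional as F

    b, n, heads, dim_head = 2, 77, 8, 64
    q = k = v = torch.randn(b, n, heads * dim_head)

    def split_heads(t):
        # (b, n, heads*dim_head) -> (b, heads, n, dim_head)
        return t.reshape(b, t.shape[1], heads, dim_head).permute(0, 2, 1, 3)

    out = F.scaled_dot_product_attention(split_heads(q), split_heads(k), split_heads(v))
    out = out.permute(0, 2, 1, 3).reshape(b, n, heads * dim_head)
    print(out.shape)  # torch.Size([2, 77, 512]) -- same layout sdp() returns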
diffusers_vdm/basics.py
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# adopted from
|
2 |
+
# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
|
3 |
+
# and
|
4 |
+
# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
# and
# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
#
# thanks!


import torch
import torch.nn as nn
import einops

from inspect import isfunction


def zero_module(module):
    """
    Zero out the parameters of a module and return it.
    """
    for p in module.parameters():
        p.detach().zero_()
    return module

def scale_module(module, scale):
    """
    Scale the parameters of a module and return it.
    """
    for p in module.parameters():
        p.detach().mul_(scale)
    return module


def conv_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D convolution module.
    """
    if dims == 1:
        return nn.Conv1d(*args, **kwargs)
    elif dims == 2:
        return nn.Conv2d(*args, **kwargs)
    elif dims == 3:
        return nn.Conv3d(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")


def linear(*args, **kwargs):
    """
    Create a linear module.
    """
    return nn.Linear(*args, **kwargs)


def avg_pool_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D average pooling module.
    """
    if dims == 1:
        return nn.AvgPool1d(*args, **kwargs)
    elif dims == 2:
        return nn.AvgPool2d(*args, **kwargs)
    elif dims == 3:
        return nn.AvgPool3d(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")


def nonlinearity(type='silu'):
    if type == 'silu':
        return nn.SiLU()
    elif type == 'leaky_relu':
        return nn.LeakyReLU()


def normalization(channels, num_groups=32):
    """
    Make a standard normalization layer.
    :param channels: number of input channels.
    :return: an nn.Module for normalization.
    """
    return nn.GroupNorm(num_groups, channels)


def default(val, d):
    if exists(val):
        return val
    return d() if isfunction(d) else d


def exists(val):
    return val is not None


def extract_into_tensor(a, t, x_shape):
    b, *_ = t.shape
    out = a.gather(-1, t)
    return out.reshape(b, *((1,) * (len(x_shape) - 1)))


def make_temporal_window(x, t, method):
    assert method in ['roll', 'prv', 'first']

    if method == 'roll':
        m = einops.rearrange(x, '(b t) d c -> b t d c', t=t)
        l = torch.roll(m, shifts=1, dims=1)
        r = torch.roll(m, shifts=-1, dims=1)

        recon = torch.cat([l, m, r], dim=2)
        del l, m, r

        recon = einops.rearrange(recon, 'b t d c -> (b t) d c')
        return recon

    if method == 'prv':
        x = einops.rearrange(x, '(b t) d c -> b t d c', t=t)
        prv = torch.cat([x[:, :1], x[:, :-1]], dim=1)

        recon = torch.cat([x, prv], dim=2)
        del x, prv

        recon = einops.rearrange(recon, 'b t d c -> (b t) d c')
        return recon

    if method == 'first':
        x = einops.rearrange(x, '(b t) d c -> b t d c', t=t)
        prv = x[:, [0], :, :].repeat(1, t, 1, 1)

        recon = torch.cat([x, prv], dim=2)
        del x, prv

        recon = einops.rearrange(recon, 'b t d c -> (b t) d c')
        return recon


def checkpoint(func, inputs, params, flag):
    """
    Evaluate a function without caching intermediate activations, allowing for
    reduced memory at the expense of extra compute in the backward pass.
    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing.
    """
    if flag:
        return torch.utils.checkpoint.checkpoint(func, *inputs, use_reentrant=False)
    else:
        return func(*inputs)
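A quick shape check for make_temporal_window, added here as an illustration only (the toy tensor sizes are assumptions, not values from this repo); it shows how the 'roll' mode triples the token dimension with the previous/current/next frames:

import torch
from diffusers_vdm.basics import make_temporal_window

x = torch.randn(2 * 4, 77, 320)               # (batch * time, tokens, channels) with t=4
out = make_temporal_window(x, t=4, method='roll')
print(out.shape)                              # torch.Size([8, 231, 320]) -> tokens tripled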
diffusers_vdm/dynamic_tsnr_sampler.py
ADDED
@@ -0,0 +1,177 @@
# everything that can improve v-prediction model
# dynamic scaling + tsnr + beta modifier + dynamic cfg rescale + ...
# written by lvmin at stanford 2024

import torch
import numpy as np

from tqdm import tqdm
from functools import partial
from diffusers_vdm.basics import extract_into_tensor


to_torch = partial(torch.tensor, dtype=torch.float32)


def rescale_zero_terminal_snr(betas):
    # Convert betas to alphas_bar_sqrt
    alphas = 1.0 - betas
    alphas_cumprod = np.cumprod(alphas, axis=0)
    alphas_bar_sqrt = np.sqrt(alphas_cumprod)

    # Store old values.
    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].copy()
    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].copy()

    # Shift so the last timestep is zero.
    alphas_bar_sqrt -= alphas_bar_sqrt_T

    # Scale so the first timestep is back to the old value.
    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)

    # Convert alphas_bar_sqrt to betas
    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
    alphas = np.concatenate([alphas_bar[0:1], alphas])
    betas = 1 - alphas

    return betas


def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)

    # rescale the results from guidance (fixes overexposure)
    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)

    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg

    return noise_cfg


class SamplerDynamicTSNR(torch.nn.Module):
    @torch.no_grad()
    def __init__(self, unet, terminal_scale=0.7):
        super().__init__()
        self.unet = unet

        self.is_v = True
        self.n_timestep = 1000
        self.guidance_rescale = 0.7

        linear_start = 0.00085
        linear_end = 0.012

        betas = np.linspace(linear_start ** 0.5, linear_end ** 0.5, self.n_timestep, dtype=np.float64) ** 2
        betas = rescale_zero_terminal_snr(betas)
        alphas = 1. - betas

        alphas_cumprod = np.cumprod(alphas, axis=0)

        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod).to(unet.device))
        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)).to(unet.device))
        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)).to(unet.device))

        # Dynamic TSNR
        turning_step = 400
        scale_arr = np.concatenate([
            np.linspace(1.0, terminal_scale, turning_step),
            np.full(self.n_timestep - turning_step, terminal_scale)
        ])
        self.register_buffer('scale_arr', to_torch(scale_arr).to(unet.device))

    def predict_eps_from_z_and_v(self, x_t, t, v):
        return self.sqrt_alphas_cumprod[t] * v + self.sqrt_one_minus_alphas_cumprod[t] * x_t

    def predict_start_from_z_and_v(self, x_t, t, v):
        return self.sqrt_alphas_cumprod[t] * x_t - self.sqrt_one_minus_alphas_cumprod[t] * v

    def q_sample(self, x0, t, noise):
        return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x0.shape) * x0 +
                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise)

    def get_v(self, x0, t, noise):
        return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x0.shape) * noise -
                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x0.shape) * x0)

    def dynamic_x0_rescale(self, x0, t):
        return x0 * extract_into_tensor(self.scale_arr, t, x0.shape)

    @torch.no_grad()
    def get_ground_truth(self, x0, noise, t):
        x0 = self.dynamic_x0_rescale(x0, t)
        xt = self.q_sample(x0, t, noise)
        target = self.get_v(x0, t, noise) if self.is_v else noise
        return xt, target

    def get_uniform_trailing_steps(self, steps):
        c = self.n_timestep / steps
        ddim_timesteps = np.flip(np.round(np.arange(self.n_timestep, 0, -c))).astype(np.int64)
        steps_out = ddim_timesteps - 1
        return torch.tensor(steps_out, device=self.unet.device, dtype=torch.long)

    @torch.no_grad()
    def forward(self, latent_shape, steps, extra_args, progress_tqdm=None):
        bar = tqdm if progress_tqdm is None else progress_tqdm

        eta = 1.0

        timesteps = self.get_uniform_trailing_steps(steps)
        timesteps_prev = torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))

        x = torch.randn(latent_shape, device=self.unet.device, dtype=self.unet.dtype)

        alphas = self.alphas_cumprod[timesteps]
        alphas_prev = self.alphas_cumprod[timesteps_prev]
        scale_arr = self.scale_arr[timesteps]
        scale_arr_prev = self.scale_arr[timesteps_prev]

        sqrt_one_minus_alphas = torch.sqrt(1 - alphas)
        sigmas = eta * np.sqrt((1 - alphas_prev.cpu().numpy()) / (1 - alphas.cpu()) * (1 - alphas.cpu() / alphas_prev.cpu().numpy()))

        s_in = x.new_ones((x.shape[0]))
        s_x = x.new_ones((x.shape[0], ) + (1, ) * (x.ndim - 1))
        for i in bar(range(len(timesteps))):
            index = len(timesteps) - 1 - i
            t = timesteps[index].item()

            model_output = self.model_apply(x, t * s_in, **extra_args)

            if self.is_v:
                e_t = self.predict_eps_from_z_and_v(x, t, model_output)
            else:
                e_t = model_output

            a_prev = alphas_prev[index].item() * s_x
            sigma_t = sigmas[index].item() * s_x

            if self.is_v:
                pred_x0 = self.predict_start_from_z_and_v(x, t, model_output)
            else:
                a_t = alphas[index].item() * s_x
                sqrt_one_minus_at = sqrt_one_minus_alphas[index].item() * s_x
                pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()

            # dynamic rescale
            scale_t = scale_arr[index].item() * s_x
            prev_scale_t = scale_arr_prev[index].item() * s_x
            rescale = (prev_scale_t / scale_t)
            pred_x0 = pred_x0 * rescale

            dir_xt = (1. - a_prev - sigma_t ** 2).sqrt() * e_t
            noise = sigma_t * torch.randn_like(x)
            x = a_prev.sqrt() * pred_x0 + dir_xt + noise

        return x

    @torch.no_grad()
    def model_apply(self, x, t, **extra_args):
        x = x.to(device=self.unet.device, dtype=self.unet.dtype)
        cfg_scale = extra_args['cfg_scale']
        p = self.unet(x, t, **extra_args['positive'])
        n = self.unet(x, t, **extra_args['negative'])
        o = n + cfg_scale * (p - n)
        o_better = rescale_noise_cfg(o, p, guidance_rescale=self.guidance_rescale)
        return o_better
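A minimal numerical check (not part of the repo) of what rescale_zero_terminal_snr achieves: after rescaling, the cumulative alpha at the final timestep is numerically zero, i.e. the terminal SNR is zero, which is what the dynamic-TSNR sampler above relies on:

import numpy as np
from diffusers_vdm.dynamic_tsnr_sampler import rescale_zero_terminal_snr

betas = np.linspace(0.00085 ** 0.5, 0.012 ** 0.5, 1000, dtype=np.float64) ** 2
alphas_cumprod = np.cumprod(1.0 - rescale_zero_terminal_snr(betas), axis=0)
print(alphas_cumprod[0], alphas_cumprod[-1])  # first value unchanged (~0.99915), last ~0.0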
diffusers_vdm/improved_clip_vision.py
ADDED
@@ -0,0 +1,58 @@
# A CLIP Vision supporting arbitrary aspect ratios, by lllyasviel
# The input range is changed to [-1, 1] rather than [0, 1] !!!! (same as VAE's range)

import torch
import types
import einops

from abc import ABCMeta
from transformers import CLIPVisionModelWithProjection


def preprocess(image):
    mean = torch.tensor([0.48145466, 0.4578275, 0.40821073], device=image.device, dtype=image.dtype)[None, :, None, None]
    std = torch.tensor([0.26862954, 0.26130258, 0.27577711], device=image.device, dtype=image.dtype)[None, :, None, None]

    scale = 16 / min(image.shape[2], image.shape[3])
    image = torch.nn.functional.interpolate(
        image,
        size=(14 * round(scale * image.shape[2]), 14 * round(scale * image.shape[3])),
        mode="bicubic",
        antialias=True
    )

    return (image - mean) / std


def arbitrary_positional_encoding(p, H, W):
    weight = p.weight
    cls = weight[:1]
    pos = weight[1:]
    pos = einops.rearrange(pos, '(H W) C -> 1 C H W', H=16, W=16)
    pos = torch.nn.functional.interpolate(pos, size=(H, W), mode="nearest")
    pos = einops.rearrange(pos, '1 C H W -> (H W) C')
    weight = torch.cat([cls, pos])[None]
    return weight


def improved_clipvision_embedding_forward(self, pixel_values):
    pixel_values = pixel_values * 0.5 + 0.5
    pixel_values = preprocess(pixel_values)
    batch_size = pixel_values.shape[0]
    target_dtype = self.patch_embedding.weight.dtype
    patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
    B, C, H, W = patch_embeds.shape
    patch_embeds = einops.rearrange(patch_embeds, 'B C H W -> B (H W) C')
    class_embeds = self.class_embedding.expand(batch_size, 1, -1)
    embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
    embeddings = embeddings + arbitrary_positional_encoding(self.position_embedding, H, W)
    return embeddings


class ImprovedCLIPVisionModelWithProjection(CLIPVisionModelWithProjection, metaclass=ABCMeta):
    def __init__(self, config):
        super().__init__(config)
        self.vision_model.embeddings.forward = types.MethodType(
            improved_clipvision_embedding_forward,
            self.vision_model.embeddings
        )
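A small sketch (toy sizes, assumed rather than taken from this file) of the resizing arithmetic inside preprocess(): the short side is mapped to roughly 16 patches of 14 px and the long side scales with the original aspect ratio, instead of being squashed to the usual square CLIP input. Note that preprocess() itself expects [0, 1] input; the patched embedding forward above converts from the [-1, 1] VAE range first.

import torch
from diffusers_vdm.improved_clip_vision import preprocess

image = torch.rand(1, 3, 480, 768)   # [0, 1] input, H=480, W=768
out = preprocess(image)
print(out.shape)                     # torch.Size([1, 3, 224, 364]) -> a 16 x 26 patch grid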
diffusers_vdm/pipeline.py
ADDED
@@ -0,0 +1,188 @@
import os
import torch
import einops

from diffusers import DiffusionPipeline
from transformers import CLIPTextModel, CLIPTokenizer
from huggingface_hub import snapshot_download
from diffusers_vdm.vae import VideoAutoencoderKL
from diffusers_vdm.projection import Resampler
from diffusers_vdm.unet import UNet3DModel
from diffusers_vdm.improved_clip_vision import ImprovedCLIPVisionModelWithProjection
from diffusers_vdm.dynamic_tsnr_sampler import SamplerDynamicTSNR


class LatentVideoDiffusionPipeline(DiffusionPipeline):
    def __init__(self, tokenizer, text_encoder, image_encoder, vae, image_projection, unet, fp16=True, eval=True):
        super().__init__()

        self.loading_components = dict(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            image_encoder=image_encoder,
            image_projection=image_projection
        )

        for k, v in self.loading_components.items():
            setattr(self, k, v)

        if fp16:
            self.vae.half()
            self.text_encoder.half()
            self.unet.half()
            self.image_encoder.half()
            self.image_projection.half()

        self.vae.requires_grad_(False)
        self.text_encoder.requires_grad_(False)
        self.image_encoder.requires_grad_(False)

        self.vae.eval()
        self.text_encoder.eval()
        self.image_encoder.eval()

        if eval:
            self.unet.eval()
            self.image_projection.eval()
        else:
            self.unet.train()
            self.image_projection.train()

    def to(self, *args, **kwargs):
        for k, v in self.loading_components.items():
            if hasattr(v, 'to'):
                v.to(*args, **kwargs)
        return self

    def save_pretrained(self, save_directory, **kwargs):
        for k, v in self.loading_components.items():
            folder = os.path.join(save_directory, k)
            os.makedirs(folder, exist_ok=True)
            v.save_pretrained(folder)
        return

    @classmethod
    def from_pretrained(cls, repo_id, fp16=True, eval=True, token=None):
        local_folder = snapshot_download(repo_id=repo_id, token=token)
        return cls(
            tokenizer=CLIPTokenizer.from_pretrained(os.path.join(local_folder, "tokenizer")),
            text_encoder=CLIPTextModel.from_pretrained(os.path.join(local_folder, "text_encoder")),
            image_encoder=ImprovedCLIPVisionModelWithProjection.from_pretrained(os.path.join(local_folder, "image_encoder")),
            vae=VideoAutoencoderKL.from_pretrained(os.path.join(local_folder, "vae")),
            image_projection=Resampler.from_pretrained(os.path.join(local_folder, "image_projection")),
            unet=UNet3DModel.from_pretrained(os.path.join(local_folder, "unet")),
            fp16=fp16,
            eval=eval
        )

    @torch.inference_mode()
    def encode_cropped_prompt_77tokens(self, prompt: str):
        cond_ids = self.tokenizer(prompt,
                                  padding="max_length",
                                  max_length=self.tokenizer.model_max_length,
                                  truncation=True,
                                  return_tensors="pt").input_ids.to(self.text_encoder.device)
        cond = self.text_encoder(cond_ids, attention_mask=None).last_hidden_state
        return cond

    @torch.inference_mode()
    def encode_clip_vision(self, frames):
        b, c, t, h, w = frames.shape
        frames = einops.rearrange(frames, 'b c t h w -> (b t) c h w')
        clipvision_embed = self.image_encoder(frames).last_hidden_state
        clipvision_embed = einops.rearrange(clipvision_embed, '(b t) d c -> b t d c', t=t)
        return clipvision_embed

    @torch.inference_mode()
    def encode_latents(self, videos, return_hidden_states=True):
        b, c, t, h, w = videos.shape
        x = einops.rearrange(videos, 'b c t h w -> (b t) c h w')
        encoder_posterior, hidden_states = self.vae.encode(x, return_hidden_states=return_hidden_states)
        z = encoder_posterior.mode() * self.vae.scale_factor
        z = einops.rearrange(z, '(b t) c h w -> b c t h w', b=b, t=t)

        if not return_hidden_states:
            return z

        hidden_states = [einops.rearrange(h, '(b t) c h w -> b c t h w', b=b) for h in hidden_states]
        hidden_states = [h[:, :, [0, -1], :, :] for h in hidden_states]  # only need first and last

        return z, hidden_states

    @torch.inference_mode()
    def decode_latents(self, latents, hidden_states):
        B, C, T, H, W = latents.shape
        latents = einops.rearrange(latents, 'b c t h w -> (b t) c h w')
        latents = latents.to(device=self.vae.device, dtype=self.vae.dtype) / self.vae.scale_factor
        pixels = self.vae.decode(latents, ref_context=hidden_states, timesteps=T)
        pixels = einops.rearrange(pixels, '(b t) c h w -> b c t h w', b=B, t=T)
        return pixels

    @torch.inference_mode()
    def __call__(
            self,
            batch_size: int = 1,
            steps: int = 50,
            guidance_scale: float = 5.0,
            positive_text_cond = None,
            negative_text_cond = None,
            positive_image_cond = None,
            negative_image_cond = None,
            concat_cond = None,
            fs = 3,
            progress_tqdm = None,
    ):
        unet_is_training = self.unet.training

        if unet_is_training:
            self.unet.eval()

        device = self.unet.device
        dtype = self.unet.dtype
        dynamic_tsnr_model = SamplerDynamicTSNR(self.unet)

        # Batch

        concat_cond = concat_cond.repeat(batch_size, 1, 1, 1, 1).to(device=device, dtype=dtype)  # b, c, t, h, w
        positive_text_cond = positive_text_cond.repeat(batch_size, 1, 1).to(concat_cond)  # b, f, c
        negative_text_cond = negative_text_cond.repeat(batch_size, 1, 1).to(concat_cond)  # b, f, c
        positive_image_cond = positive_image_cond.repeat(batch_size, 1, 1, 1).to(concat_cond)  # b, t, l, c
        negative_image_cond = negative_image_cond.repeat(batch_size, 1, 1, 1).to(concat_cond)

        if isinstance(fs, torch.Tensor):
            fs = fs.repeat(batch_size, ).to(dtype=torch.long, device=device)  # b
        else:
            fs = torch.tensor([fs] * batch_size, dtype=torch.long, device=device)  # b

        # Initial latents

        latent_shape = concat_cond.shape

        # Feeds

        sampler_kwargs = dict(
            cfg_scale=guidance_scale,
            positive=dict(
                context_text=positive_text_cond,
                context_img=positive_image_cond,
                fs=fs,
                concat_cond=concat_cond
            ),
            negative=dict(
                context_text=negative_text_cond,
                context_img=negative_image_cond,
                fs=fs,
                concat_cond=concat_cond
            )
        )

        # Sample

        results = dynamic_tsnr_model(latent_shape, steps, extra_args=sampler_kwargs, progress_tqdm=progress_tqdm)

        if unet_is_training:
            self.unet.train()

        return results
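A hypothetical loading sketch (the repo id below is a placeholder, not something defined in this file; the sub-folder layout is only what from_pretrained above assumes); it illustrates the from_pretrained -> to(device) -> prompt-encoding flow and nothing model-specific:

import torch
from diffusers_vdm.pipeline import LatentVideoDiffusionPipeline

pipe = LatentVideoDiffusionPipeline.from_pretrained('your-org/your-video-model', fp16=True)  # placeholder repo id
pipe = pipe.to(torch.device('cuda'))
positive = pipe.encode_cropped_prompt_77tokens('1girl, masterpiece, best quality')
negative = pipe.encode_cropped_prompt_77tokens('lowres, bad anatomy, worst quality')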
diffusers_vdm/projection.py
ADDED
@@ -0,0 +1,160 @@
# modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py
# and https://github.com/lucidrains/imagen-pytorch/blob/main/imagen_pytorch/imagen_pytorch.py
# and https://github.com/tencent-ailab/IP-Adapter/blob/main/ip_adapter/resampler.py


import math
import torch
import einops
import torch.nn as nn

from huggingface_hub import PyTorchModelHubMixin


class ImageProjModel(nn.Module):
    """Projection Model"""
    def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4):
        super().__init__()
        self.cross_attention_dim = cross_attention_dim
        self.clip_extra_context_tokens = clip_extra_context_tokens
        self.proj = nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim)
        self.norm = nn.LayerNorm(cross_attention_dim)

    def forward(self, image_embeds):
        #embeds = image_embeds
        embeds = image_embeds.type(list(self.proj.parameters())[0].dtype)
        clip_extra_context_tokens = self.proj(embeds).reshape(-1, self.clip_extra_context_tokens, self.cross_attention_dim)
        clip_extra_context_tokens = self.norm(clip_extra_context_tokens)
        return clip_extra_context_tokens


# FFN
def FeedForward(dim, mult=4):
    inner_dim = int(dim * mult)
    return nn.Sequential(
        nn.LayerNorm(dim),
        nn.Linear(dim, inner_dim, bias=False),
        nn.GELU(),
        nn.Linear(inner_dim, dim, bias=False),
    )


def reshape_tensor(x, heads):
    bs, length, width = x.shape
    #(bs, length, width) --> (bs, length, n_heads, dim_per_head)
    x = x.view(bs, length, heads, -1)
    # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
    x = x.transpose(1, 2)
    # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
    x = x.reshape(bs, heads, length, -1)
    return x


class PerceiverAttention(nn.Module):
    def __init__(self, *, dim, dim_head=64, heads=8):
        super().__init__()
        self.scale = dim_head**-0.5
        self.dim_head = dim_head
        self.heads = heads
        inner_dim = dim_head * heads

        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

        self.to_q = nn.Linear(dim, inner_dim, bias=False)
        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
        self.to_out = nn.Linear(inner_dim, dim, bias=False)


    def forward(self, x, latents):
        """
        Args:
            x (torch.Tensor): image features
                shape (b, n1, D)
            latent (torch.Tensor): latent features
                shape (b, n2, D)
        """
        x = self.norm1(x)
        latents = self.norm2(latents)

        b, l, _ = latents.shape

        q = self.to_q(latents)
        kv_input = torch.cat((x, latents), dim=-2)
        k, v = self.to_kv(kv_input).chunk(2, dim=-1)

        q = reshape_tensor(q, self.heads)
        k = reshape_tensor(k, self.heads)
        v = reshape_tensor(v, self.heads)

        # attention
        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
        weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
        out = weight @ v

        out = out.permute(0, 2, 1, 3).reshape(b, l, -1)

        return self.to_out(out)


class Resampler(nn.Module, PyTorchModelHubMixin):
    def __init__(
            self,
            dim=1024,
            depth=8,
            dim_head=64,
            heads=16,
            num_queries=8,
            embedding_dim=768,
            output_dim=1024,
            ff_mult=4,
            video_length=16,
            input_frames_length=2,
    ):
        super().__init__()
        self.num_queries = num_queries
        self.video_length = video_length

        self.latents = nn.Parameter(torch.randn(1, num_queries * video_length, dim) / dim**0.5)
        self.input_pos = nn.Parameter(torch.zeros(1, input_frames_length, 1, embedding_dim))

        self.proj_in = nn.Linear(embedding_dim, dim)
        self.proj_out = nn.Linear(dim, output_dim)
        self.norm_out = nn.LayerNorm(output_dim)

        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(
                nn.ModuleList(
                    [
                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
                        FeedForward(dim=dim, mult=ff_mult),
                    ]
                )
            )

    def forward(self, x):
        latents = self.latents.repeat(x.size(0), 1, 1)

        x = x + self.input_pos
        x = einops.rearrange(x, 'b ti d c -> b (ti d) c')
        x = self.proj_in(x)

        for attn, ff in self.layers:
            latents = attn(x, latents) + latents
            latents = ff(latents) + latents

        latents = self.proj_out(latents)
        latents = self.norm_out(latents)

        latents = einops.rearrange(latents, 'b (to l) c -> b to l c', to=self.video_length)
        return latents

    @property
    def device(self):
        return next(self.parameters()).device

    @property
    def dtype(self):
        return next(self.parameters()).dtype
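A shape sketch for the Resampler (the hyper-parameters here are toy assumptions, not the shipped configuration): CLIP vision tokens from two input frames are resampled into num_queries tokens for each of video_length output frames:

import torch
from diffusers_vdm.projection import Resampler

resampler = Resampler(num_queries=16, embedding_dim=1280, output_dim=1024,
                      video_length=16, input_frames_length=2)
clip_tokens = torch.randn(1, 2, 257, 1280)   # b, input frames, CLIP tokens, CLIP width
out = resampler(clip_tokens)
print(out.shape)                             # torch.Size([1, 16, 16, 1024]) -> b, frames, queries, dim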
diffusers_vdm/unet.py
ADDED
@@ -0,0 +1,650 @@
# https://github.com/AILab-CVC/VideoCrafter
# https://github.com/Doubiiu/DynamiCrafter
# https://github.com/ToonCrafter/ToonCrafter
# Then edited by lllyasviel

from functools import partial
from abc import abstractmethod
import torch
import math
import torch.nn as nn
from einops import rearrange, repeat
import torch.nn.functional as F
from diffusers_vdm.basics import checkpoint
from diffusers_vdm.basics import (
    zero_module,
    conv_nd,
    linear,
    avg_pool_nd,
    normalization
)
from diffusers_vdm.attention import SpatialTransformer, TemporalTransformer
from huggingface_hub import PyTorchModelHubMixin


def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
    """
    Create sinusoidal timestep embeddings.
    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x dim] Tensor of positional embeddings.
    """
    if not repeat_only:
        half = dim // 2
        freqs = torch.exp(
            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
        ).to(device=timesteps.device)
        args = timesteps[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
    else:
        embedding = repeat(timesteps, 'b -> b d', d=dim)
    return embedding


class TimestepBlock(nn.Module):
    """
    Any module where forward() takes timestep embeddings as a second argument.
    """

    @abstractmethod
    def forward(self, x, emb):
        """
        Apply the module to `x` given `emb` timestep embeddings.
        """


class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
    """
    A sequential module that passes timestep embeddings to the children that
    support it as an extra input.
    """

    def forward(self, x, emb, context=None, batch_size=None):
        for layer in self:
            if isinstance(layer, TimestepBlock):
                x = layer(x, emb, batch_size=batch_size)
            elif isinstance(layer, SpatialTransformer):
                x = layer(x, context)
            elif isinstance(layer, TemporalTransformer):
                x = rearrange(x, '(b f) c h w -> b c f h w', b=batch_size)
                x = layer(x, context)
                x = rearrange(x, 'b c f h w -> (b f) c h w')
            else:
                x = layer(x)
        return x


class Downsample(nn.Module):
    """
    A downsampling layer with an optional convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 downsampling occurs in the inner-two dimensions.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        stride = 2 if dims != 3 else (1, 2, 2)
        if use_conv:
            self.op = conv_nd(
                dims, self.channels, self.out_channels, 3, stride=stride, padding=padding
            )
        else:
            assert self.channels == self.out_channels
            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)

    def forward(self, x):
        assert x.shape[1] == self.channels
        return self.op(x)


class Upsample(nn.Module):
    """
    An upsampling layer with an optional convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 upsampling occurs in the inner-two dimensions.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        if use_conv:
            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=padding)

    def forward(self, x):
        assert x.shape[1] == self.channels
        if self.dims == 3:
            x = F.interpolate(x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode='nearest')
        else:
            x = F.interpolate(x, scale_factor=2, mode='nearest')
        if self.use_conv:
            x = self.conv(x)
        return x


class ResBlock(TimestepBlock):
    """
    A residual block that can optionally change the number of channels.
    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, use a spatial
        convolution instead of a smaller 1x1 convolution to change the
        channels in the skip connection.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    :param use_temporal_conv: if True, use the temporal convolution.
    :param use_image_dataset: if True, the temporal parameters will not be optimized.
    """

    def __init__(
            self,
            channels,
            emb_channels,
            dropout,
            out_channels=None,
            use_scale_shift_norm=False,
            dims=2,
            use_checkpoint=False,
            use_conv=False,
            up=False,
            down=False,
            use_temporal_conv=False,
            tempspatial_aware=False
    ):
        super().__init__()
        self.channels = channels
        self.emb_channels = emb_channels
        self.dropout = dropout
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.use_checkpoint = use_checkpoint
        self.use_scale_shift_norm = use_scale_shift_norm
        self.use_temporal_conv = use_temporal_conv

        self.in_layers = nn.Sequential(
            normalization(channels),
            nn.SiLU(),
            conv_nd(dims, channels, self.out_channels, 3, padding=1),
        )

        self.updown = up or down

        if up:
            self.h_upd = Upsample(channels, False, dims)
            self.x_upd = Upsample(channels, False, dims)
        elif down:
            self.h_upd = Downsample(channels, False, dims)
            self.x_upd = Downsample(channels, False, dims)
        else:
            self.h_upd = self.x_upd = nn.Identity()

        self.emb_layers = nn.Sequential(
            nn.SiLU(),
            nn.Linear(
                emb_channels,
                2 * self.out_channels if use_scale_shift_norm else self.out_channels,
            ),
        )
        self.out_layers = nn.Sequential(
            normalization(self.out_channels),
            nn.SiLU(),
            nn.Dropout(p=dropout),
            zero_module(nn.Conv2d(self.out_channels, self.out_channels, 3, padding=1)),
        )

        if self.out_channels == channels:
            self.skip_connection = nn.Identity()
        elif use_conv:
            self.skip_connection = conv_nd(dims, channels, self.out_channels, 3, padding=1)
        else:
            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)

        if self.use_temporal_conv:
            self.temopral_conv = TemporalConvBlock(
                self.out_channels,
                self.out_channels,
                dropout=0.1,
                spatial_aware=tempspatial_aware
            )

    def forward(self, x, emb, batch_size=None):
        """
        Apply the block to a Tensor, conditioned on a timestep embedding.
        :param x: an [N x C x ...] Tensor of features.
        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
        :return: an [N x C x ...] Tensor of outputs.
        """
        input_tuple = (x, emb)
        if batch_size:
            forward_batchsize = partial(self._forward, batch_size=batch_size)
            return checkpoint(forward_batchsize, input_tuple, self.parameters(), self.use_checkpoint)
        return checkpoint(self._forward, input_tuple, self.parameters(), self.use_checkpoint)

    def _forward(self, x, emb, batch_size=None):
        if self.updown:
            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
            h = in_rest(x)
            h = self.h_upd(h)
            x = self.x_upd(x)
            h = in_conv(h)
        else:
            h = self.in_layers(x)
        emb_out = self.emb_layers(emb).type(h.dtype)
        while len(emb_out.shape) < len(h.shape):
            emb_out = emb_out[..., None]
        if self.use_scale_shift_norm:
            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
            scale, shift = torch.chunk(emb_out, 2, dim=1)
            h = out_norm(h) * (1 + scale) + shift
            h = out_rest(h)
        else:
            h = h + emb_out
            h = self.out_layers(h)
        h = self.skip_connection(x) + h

        if self.use_temporal_conv and batch_size:
            h = rearrange(h, '(b t) c h w -> b c t h w', b=batch_size)
            h = self.temopral_conv(h)
            h = rearrange(h, 'b c t h w -> (b t) c h w')
        return h


class TemporalConvBlock(nn.Module):
    """
    Adapted from modelscope: https://github.com/modelscope/modelscope/blob/master/modelscope/models/multi_modal/video_synthesis/unet_sd.py
    """

    def __init__(self, in_channels, out_channels=None, dropout=0.0, spatial_aware=False):
        super(TemporalConvBlock, self).__init__()
        if out_channels is None:
            out_channels = in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels
        th_kernel_shape = (3, 1, 1) if not spatial_aware else (3, 3, 1)
        th_padding_shape = (1, 0, 0) if not spatial_aware else (1, 1, 0)
        tw_kernel_shape = (3, 1, 1) if not spatial_aware else (3, 1, 3)
        tw_padding_shape = (1, 0, 0) if not spatial_aware else (1, 0, 1)

        # conv layers
        self.conv1 = nn.Sequential(
            nn.GroupNorm(32, in_channels), nn.SiLU(),
            nn.Conv3d(in_channels, out_channels, th_kernel_shape, padding=th_padding_shape))
        self.conv2 = nn.Sequential(
            nn.GroupNorm(32, out_channels), nn.SiLU(), nn.Dropout(dropout),
            nn.Conv3d(out_channels, in_channels, tw_kernel_shape, padding=tw_padding_shape))
        self.conv3 = nn.Sequential(
            nn.GroupNorm(32, out_channels), nn.SiLU(), nn.Dropout(dropout),
            nn.Conv3d(out_channels, in_channels, th_kernel_shape, padding=th_padding_shape))
        self.conv4 = nn.Sequential(
            nn.GroupNorm(32, out_channels), nn.SiLU(), nn.Dropout(dropout),
            nn.Conv3d(out_channels, in_channels, tw_kernel_shape, padding=tw_padding_shape))

        # zero out the last layer params,so the conv block is identity
        nn.init.zeros_(self.conv4[-1].weight)
        nn.init.zeros_(self.conv4[-1].bias)

    def forward(self, x):
        identity = x
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)

        return identity + x


class UNet3DModel(nn.Module, PyTorchModelHubMixin):
    """
    The full UNet model with attention and timestep embedding.
    :param in_channels: in_channels in the input Tensor.
    :param model_channels: base channel count for the model.
    :param out_channels: channels in the output Tensor.
    :param num_res_blocks: number of residual blocks per downsample.
    :param attention_resolutions: a collection of downsample rates at which
        attention will take place. May be a set, list, or tuple.
        For example, if this contains 4, then at 4x downsampling, attention
        will be used.
    :param dropout: the dropout probability.
    :param channel_mult: channel multiplier for each level of the UNet.
    :param conv_resample: if True, use learned convolutions for upsampling and
        downsampling.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param num_classes: if specified (as an int), then this model will be
        class-conditional with `num_classes` classes.
    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
    :param num_heads: the number of attention heads in each attention layer.
    :param num_heads_channels: if specified, ignore num_heads and instead use
                               a fixed channel width per attention head.
    :param num_heads_upsample: works with num_heads to set a different number
                               of heads for upsampling. Deprecated.
    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
    :param resblock_updown: use residual blocks for up/downsampling.
    :param use_new_attention_order: use a different attention pattern for potentially
                                    increased efficiency.
    """

    def __init__(self,
                 in_channels,
                 model_channels,
                 out_channels,
                 num_res_blocks,
                 attention_resolutions,
                 dropout=0.0,
                 channel_mult=(1, 2, 4, 8),
                 conv_resample=True,
                 dims=2,
                 context_dim=None,
                 use_scale_shift_norm=False,
                 resblock_updown=False,
                 num_heads=-1,
                 num_head_channels=-1,
                 transformer_depth=1,
                 use_linear=False,
                 temporal_conv=False,
                 tempspatial_aware=False,
                 temporal_attention=True,
                 use_relative_position=True,
                 use_causal_attention=False,
                 temporal_length=None,
                 addition_attention=False,
                 temporal_selfatt_only=True,
                 image_cross_attention=False,
                 image_cross_attention_scale_learnable=False,
                 default_fs=4,
                 fs_condition=False,
                 ):
        super(UNet3DModel, self).__init__()
        if num_heads == -1:
            assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
        if num_head_channels == -1:
            assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'

        self.in_channels = in_channels
        self.model_channels = model_channels
        self.out_channels = out_channels
        self.num_res_blocks = num_res_blocks
        self.attention_resolutions = attention_resolutions
        self.dropout = dropout
        self.channel_mult = channel_mult
        self.conv_resample = conv_resample
        self.temporal_attention = temporal_attention
        time_embed_dim = model_channels * 4
        self.use_checkpoint = use_checkpoint = False  # moved to self.enable_gradient_checkpointing()
        temporal_self_att_only = True
        self.addition_attention = addition_attention
        self.temporal_length = temporal_length
        self.image_cross_attention = image_cross_attention
        self.image_cross_attention_scale_learnable = image_cross_attention_scale_learnable
        self.default_fs = default_fs
        self.fs_condition = fs_condition

        ## Time embedding blocks
        self.time_embed = nn.Sequential(
            linear(model_channels, time_embed_dim),
            nn.SiLU(),
            linear(time_embed_dim, time_embed_dim),
        )
        if fs_condition:
            self.fps_embedding = nn.Sequential(
                linear(model_channels, time_embed_dim),
                nn.SiLU(),
                linear(time_embed_dim, time_embed_dim),
            )
            nn.init.zeros_(self.fps_embedding[-1].weight)
            nn.init.zeros_(self.fps_embedding[-1].bias)
        ## Input Block
        self.input_blocks = nn.ModuleList(
            [
                TimestepEmbedSequential(conv_nd(dims, in_channels, model_channels, 3, padding=1))
            ]
        )
        if self.addition_attention:
            self.init_attn = TimestepEmbedSequential(
                TemporalTransformer(
                    model_channels,
                    n_heads=8,
                    d_head=num_head_channels,
                    depth=transformer_depth,
                    context_dim=context_dim,
                    use_checkpoint=use_checkpoint, only_self_att=temporal_selfatt_only,
                    causal_attention=False, relative_position=use_relative_position,
                    temporal_length=temporal_length))

        input_block_chans = [model_channels]
        ch = model_channels
        ds = 1
        for level, mult in enumerate(channel_mult):
            for _ in range(num_res_blocks):
                layers = [
                    ResBlock(ch, time_embed_dim, dropout,
                             out_channels=mult * model_channels, dims=dims, use_checkpoint=use_checkpoint,
                             use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware,
                             use_temporal_conv=temporal_conv
                             )
                ]
                ch = mult * model_channels
                if ds in attention_resolutions:
                    if num_head_channels == -1:
                        dim_head = ch // num_heads
                    else:
                        num_heads = ch // num_head_channels
                        dim_head = num_head_channels
                    layers.append(
                        SpatialTransformer(ch, num_heads, dim_head,
                                           depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
                                           use_checkpoint=use_checkpoint, disable_self_attn=False,
                                           video_length=temporal_length,
                                           image_cross_attention=self.image_cross_attention,
                                           image_cross_attention_scale_learnable=self.image_cross_attention_scale_learnable,
                                           )
                    )
                    if self.temporal_attention:
                        layers.append(
                            TemporalTransformer(ch, num_heads, dim_head,
                                                depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
                                                use_checkpoint=use_checkpoint, only_self_att=temporal_self_att_only,
                                                causal_attention=use_causal_attention,
                                                relative_position=use_relative_position,
                                                temporal_length=temporal_length
                                                )
                        )
                self.input_blocks.append(TimestepEmbedSequential(*layers))
                input_block_chans.append(ch)
            if level != len(channel_mult) - 1:
                out_ch = ch
                self.input_blocks.append(
                    TimestepEmbedSequential(
                        ResBlock(ch, time_embed_dim, dropout,
                                 out_channels=out_ch, dims=dims, use_checkpoint=use_checkpoint,
                                 use_scale_shift_norm=use_scale_shift_norm,
                                 down=True
                                 )
                        if resblock_updown
                        else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch)
                    )
                )
                ch = out_ch
                input_block_chans.append(ch)
                ds *= 2

        if num_head_channels == -1:
            dim_head = ch // num_heads
        else:
            num_heads = ch // num_head_channels
            dim_head = num_head_channels
        layers = [
            ResBlock(ch, time_embed_dim, dropout,
                     dims=dims, use_checkpoint=use_checkpoint,
                     use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware,
                     use_temporal_conv=temporal_conv
                     ),
            SpatialTransformer(ch, num_heads, dim_head,
                               depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
                               use_checkpoint=use_checkpoint, disable_self_attn=False, video_length=temporal_length,
                               image_cross_attention=self.image_cross_attention,
                               image_cross_attention_scale_learnable=self.image_cross_attention_scale_learnable
                               )
        ]
        if self.temporal_attention:
            layers.append(
                TemporalTransformer(ch, num_heads, dim_head,
                                    depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
                                    use_checkpoint=use_checkpoint, only_self_att=temporal_self_att_only,
                                    causal_attention=use_causal_attention, relative_position=use_relative_position,
                                    temporal_length=temporal_length
                                    )
            )
        layers.append(
            ResBlock(ch, time_embed_dim, dropout,
                     dims=dims, use_checkpoint=use_checkpoint,
                     use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware,
                     use_temporal_conv=temporal_conv
                     )
        )

        ## Middle Block
        self.middle_block = TimestepEmbedSequential(*layers)

        ## Output Block
        self.output_blocks = nn.ModuleList([])
        for level, mult in list(enumerate(channel_mult))[::-1]:
            for i in range(num_res_blocks + 1):
                ich = input_block_chans.pop()
                layers = [
                    ResBlock(ch + ich, time_embed_dim, dropout,
                             out_channels=mult * model_channels, dims=dims, use_checkpoint=use_checkpoint,
                             use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware,
                             use_temporal_conv=temporal_conv
                             )
                ]
                ch = model_channels * mult
                if ds in attention_resolutions:
                    if num_head_channels == -1:
                        dim_head = ch // num_heads
                    else:
                        num_heads = ch // num_head_channels
                        dim_head = num_head_channels
                    layers.append(
                        SpatialTransformer(ch, num_heads, dim_head,
                                           depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
                                           use_checkpoint=use_checkpoint, disable_self_attn=False,
                                           video_length=temporal_length,
                                           image_cross_attention=self.image_cross_attention,
                                           image_cross_attention_scale_learnable=self.image_cross_attention_scale_learnable
                                           )
                    )
                    if self.temporal_attention:
                        layers.append(
                            TemporalTransformer(ch, num_heads, dim_head,
                                                depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
                                                use_checkpoint=use_checkpoint, only_self_att=temporal_self_att_only,
                                                causal_attention=use_causal_attention,
                                                relative_position=use_relative_position,
                                                temporal_length=temporal_length
                                                )
                        )
                if level and i == num_res_blocks:
                    out_ch = ch
                    layers.append(
                        ResBlock(ch, time_embed_dim, dropout,
                                 out_channels=out_ch, dims=dims, use_checkpoint=use_checkpoint,
                                 use_scale_shift_norm=use_scale_shift_norm,
                                 up=True
                                 )
                        if resblock_updown
                        else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
                    )
                    ds //= 2
                self.output_blocks.append(TimestepEmbedSequential(*layers))

        self.out = nn.Sequential(
            normalization(ch),
            nn.SiLU(),
            zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
        )

    @property
    def device(self):
        return next(self.parameters()).device

    @property
    def dtype(self):
        return next(self.parameters()).dtype

    def forward(self, x, timesteps, context_text=None, context_img=None, concat_cond=None, fs=None, **kwargs):
        b, _, t, _, _ = x.shape

        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).type(x.dtype)
        emb = self.time_embed(t_emb)

        context_text = context_text.repeat_interleave(repeats=t, dim=0)
        context_img = rearrange(context_img, 'b t l c -> (b t) l c')

        context = (context_text, context_img)

        emb = emb.repeat_interleave(repeats=t, dim=0)

        if concat_cond is not None:
            x = torch.cat([x, concat_cond], dim=1)

        ## always in shape (b t) c h w, except for temporal layer
        x = rearrange(x, 'b c t h w -> (b t) c h w')

        ## combine emb
        if self.fs_condition:
            if fs is None:
                fs = torch.tensor(
                    [self.default_fs] * b, dtype=torch.long, device=x.device)
            fs_emb = timestep_embedding(fs, self.model_channels, repeat_only=False).type(x.dtype)

            fs_embed = self.fps_embedding(fs_emb)
            fs_embed = fs_embed.repeat_interleave(repeats=t, dim=0)
            emb = emb + fs_embed

        h = x
        hs = []
        for id, module in enumerate(self.input_blocks):
            h = module(h, emb, context=context, batch_size=b)
            if id == 0 and self.addition_attention:
                h = self.init_attn(h, emb, context=context, batch_size=b)
            hs.append(h)

        h = self.middle_block(h, emb, context=context, batch_size=b)

        for module in self.output_blocks:
            h = torch.cat([h, hs.pop()], dim=1)
            h = module(h, emb, context=context, batch_size=b)
        h = h.type(x.dtype)
        y = self.out(h)

        y = rearrange(y, '(b t) c h w -> b c t h w', b=b)
        return y

    def enable_gradient_checkpointing(self, enable=True, verbose=False):
        for k, v in self.named_modules():
            if hasattr(v, 'checkpoint'):
                v.checkpoint = enable
                if verbose:
                    print(f'{k}.checkpoint = {enable}')
            if hasattr(v, 'use_checkpoint'):
                v.use_checkpoint = enable
                if verbose:
                    print(f'{k}.use_checkpoint = {enable}')
        return
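A quick sketch of the sinusoidal timestep embedding used throughout UNet3DModel (toy values; dim=320 matches a typical base channel count but is an assumption here, not a value read from this file):

import torch
from diffusers_vdm.unet import timestep_embedding

t = torch.tensor([0, 250, 999])
emb = timestep_embedding(t, dim=320)
print(emb.shape)                   # torch.Size([3, 320]); first half cosines, second half sines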
diffusers_vdm/utils.py
ADDED
@@ -0,0 +1,43 @@
import os
import cv2
import torch
import einops
import torchvision


def resize_and_center_crop(image, target_width, target_height, interpolation=cv2.INTER_AREA):
    original_height, original_width = image.shape[:2]
    k = max(target_height / original_height, target_width / original_width)
    new_width = int(round(original_width * k))
    new_height = int(round(original_height * k))
    resized_image = cv2.resize(image, (new_width, new_height), interpolation=interpolation)
    x_start = (new_width - target_width) // 2
    y_start = (new_height - target_height) // 2
    cropped_image = resized_image[y_start:y_start + target_height, x_start:x_start + target_width]
    return cropped_image


def save_bcthw_as_mp4(x, output_filename, fps=10):
    b, c, t, h, w = x.shape

    per_row = b
    for p in [6, 5, 4, 3, 2]:
        if b % p == 0:
            per_row = p
            break

    os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
    x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5
    x = x.detach().cpu().to(torch.uint8)
    x = einops.rearrange(x, '(m n) c t h w -> t (m h) (n w) c', n=per_row)
    torchvision.io.write_video(output_filename, x, fps=fps, video_codec='h264', options={'crf': '1'})
    return x


def save_bcthw_as_png(x, output_filename):
    os.makedirs(os.path.dirname(os.path.abspath(os.path.realpath(output_filename))), exist_ok=True)
    x = torch.clamp(x.float(), -1., 1.) * 127.5 + 127.5
    x = x.detach().cpu().to(torch.uint8)
    x = einops.rearrange(x, 'b c t h w -> c (b h) (t w)')
    torchvision.io.write_png(x, output_filename)
    return output_filename
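A small usage sketch for the helpers above, on random data with arbitrarily chosen sizes: video tensors are expected in b c t h w layout with values in [-1, 1], and the batch is tiled into a grid of clips before writing.

# --- usage sketch (not part of the commit) ---
import numpy as np
import torch
from diffusers_vdm.utils import resize_and_center_crop, save_bcthw_as_mp4

image = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)
cropped = resize_and_center_crop(image, target_width=512, target_height=320)
print(cropped.shape)  # (320, 512, 3)

video = torch.rand(1, 3, 16, 320, 512) * 2.0 - 1.0  # b c t h w in [-1, 1]
save_bcthw_as_mp4(video, './results/demo.mp4', fps=4)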
diffusers_vdm/vae.py
ADDED
@@ -0,0 +1,826 @@
# video VAE with many components from lots of repos
# collected by lvmin


import torch
import xformers.ops
import torch.nn as nn

from einops import rearrange, repeat
from diffusers_vdm.basics import default, exists, zero_module, conv_nd, linear, normalization
from diffusers_vdm.unet import Upsample, Downsample
from huggingface_hub import PyTorchModelHubMixin


def chunked_attention(q, k, v, batch_chunk=0):
    # if batch_chunk > 0 and not torch.is_grad_enabled():
    #     batch_size = q.size(0)
    #     chunks = [slice(i, i + batch_chunk) for i in range(0, batch_size, batch_chunk)]
    #
    #     out_chunks = []
    #     for chunk in chunks:
    #         q_chunk = q[chunk]
    #         k_chunk = k[chunk]
    #         v_chunk = v[chunk]
    #
    #         out_chunk = torch.nn.functional.scaled_dot_product_attention(
    #             q_chunk, k_chunk, v_chunk, attn_mask=None
    #         )
    #         out_chunks.append(out_chunk)
    #
    #     out = torch.cat(out_chunks, dim=0)
    # else:
    #     out = torch.nn.functional.scaled_dot_product_attention(
    #         q, k, v, attn_mask=None
    #     )
    out = xformers.ops.memory_efficient_attention(q, k, v)
    return out


def nonlinearity(x):
    return x * torch.sigmoid(x)


def GroupNorm(in_channels, num_groups=32):
    return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)


class DiagonalGaussianDistribution:
    def __init__(self, parameters, deterministic=False):
        self.parameters = parameters
        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)

    def sample(self, noise=None):
        if noise is None:
            noise = torch.randn(self.mean.shape)

        x = self.mean + self.std * noise.to(device=self.parameters.device)
        return x

    def mode(self):
        return self.mean


class EncoderDownSampleBlock(nn.Module):
    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        self.in_channels = in_channels
        if self.with_conv:
            self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)

    def forward(self, x):
        if self.with_conv:
            pad = (0, 1, 0, 1)
            x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
            x = self.conv(x)
        else:
            x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
        return x


class ResnetBlock(nn.Module):
    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
                 dropout, temb_channels=512):
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut

        self.norm1 = GroupNorm(in_channels)
        self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        if temb_channels > 0:
            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
        self.norm2 = GroupNorm(out_channels)
        self.dropout = torch.nn.Dropout(dropout)
        self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
            else:
                self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x, temb):
        h = x
        h = self.norm1(h)
        h = nonlinearity(h)
        h = self.conv1(h)

        if temb is not None:
            h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]

        h = self.norm2(h)
        h = nonlinearity(h)
        h = self.dropout(h)
        h = self.conv2(h)

        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                x = self.conv_shortcut(x)
            else:
                x = self.nin_shortcut(x)

        return x + h


class Encoder(nn.Module):
    def __init__(self, *, ch, out_ch, ch_mult=(1, 2, 4, 8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, z_channels, double_z=True, **kwargs):
        super().__init__()
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels

        # downsampling
        self.conv_in = torch.nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)

        curr_res = resolution
        in_ch_mult = (1,) + tuple(ch_mult)
        self.in_ch_mult = in_ch_mult
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_in = ch * in_ch_mult[i_level]
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks):
                block.append(ResnetBlock(in_channels=block_in,
                                         out_channels=block_out,
                                         temb_channels=self.temb_ch,
                                         dropout=dropout))
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn.append(Attention(block_in))
            down = nn.Module()
            down.block = block
            down.attn = attn
            if i_level != self.num_resolutions - 1:
                down.downsample = EncoderDownSampleBlock(block_in, resamp_with_conv)
                curr_res = curr_res // 2
            self.down.append(down)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)
        self.mid.attn_1 = Attention(block_in)
        self.mid.block_2 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
                                       dropout=dropout)

        # end
        self.norm_out = GroupNorm(block_in)
        self.conv_out = torch.nn.Conv2d(block_in,
                                        2 * z_channels if double_z else z_channels,
                                        kernel_size=3,
                                        stride=1,
                                        padding=1)

    def forward(self, x, return_hidden_states=False):
        # timestep embedding
        temb = None

        # print(f'encoder-input={x.shape}')
        # downsampling
        hs = [self.conv_in(x)]

        ## if we return hidden states for decoder usage, we will store them in a list
        if return_hidden_states:
            hidden_states = []
        # print(f'encoder-conv in feat={hs[0].shape}')
        for i_level in range(self.num_resolutions):
            for i_block in range(self.num_res_blocks):
                h = self.down[i_level].block[i_block](hs[-1], temb)
                # print(f'encoder-down feat={h.shape}')
                if len(self.down[i_level].attn) > 0:
                    h = self.down[i_level].attn[i_block](h)
                hs.append(h)
                if return_hidden_states:
                    hidden_states.append(h)
            if i_level != self.num_resolutions - 1:
                # print(f'encoder-downsample (input)={hs[-1].shape}')
                hs.append(self.down[i_level].downsample(hs[-1]))
                # print(f'encoder-downsample (output)={hs[-1].shape}')
        if return_hidden_states:
            hidden_states.append(hs[0])
        # middle
        h = hs[-1]
        h = self.mid.block_1(h, temb)
        # print(f'encoder-mid1 feat={h.shape}')
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)
        # print(f'encoder-mid2 feat={h.shape}')

        # end
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        # print(f'end feat={h.shape}')
        if return_hidden_states:
            return h, hidden_states
        else:
            return h


class ConvCombiner(nn.Module):
    def __init__(self, ch):
        super().__init__()
        self.conv = nn.Conv2d(ch, ch, 1, padding=0)

        nn.init.zeros_(self.conv.weight)
        nn.init.zeros_(self.conv.bias)

    def forward(self, x, context):
        ## x: b c h w, context: b c 2 h w
        b, c, l, h, w = context.shape
        bt, c, h, w = x.shape
        context = rearrange(context, "b c l h w -> (b l) c h w")
        context = self.conv(context)
        context = rearrange(context, "(b l) c h w -> b c l h w", l=l)
        x = rearrange(x, "(b t) c h w -> b c t h w", t=bt // b)
        x[:, :, 0] = x[:, :, 0] + context[:, :, 0]
        x[:, :, -1] = x[:, :, -1] + context[:, :, -1]
        x = rearrange(x, "b c t h w -> (b t) c h w")
        return x


class AttentionCombiner(nn.Module):
    def __init__(
        self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0, **kwargs
    ):
        super().__init__()

        inner_dim = dim_head * heads
        context_dim = default(context_dim, query_dim)

        self.heads = heads
        self.dim_head = dim_head

        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
        )
        self.attention_op = None

        self.norm = GroupNorm(query_dim)
        nn.init.zeros_(self.to_out[0].weight)
        nn.init.zeros_(self.to_out[0].bias)

    def forward(
        self,
        x,
        context=None,
        mask=None,
    ):
        bt, c, h, w = x.shape
        h_ = self.norm(x)
        h_ = rearrange(h_, "b c h w -> b (h w) c")
        q = self.to_q(h_)

        b, c, l, h, w = context.shape
        context = rearrange(context, "b c l h w -> (b l) (h w) c")
        k = self.to_k(context)
        v = self.to_v(context)

        t = bt // b
        k = repeat(k, "(b l) d c -> (b t) (l d) c", l=l, t=t)
        v = repeat(v, "(b l) d c -> (b t) (l d) c", l=l, t=t)

        b, _, _ = q.shape
        q, k, v = map(
            lambda t: t.unsqueeze(3)
            .reshape(b, t.shape[1], self.heads, self.dim_head)
            .permute(0, 2, 1, 3)
            .reshape(b * self.heads, t.shape[1], self.dim_head)
            .contiguous(),
            (q, k, v),
        )

        out = chunked_attention(
            q, k, v, batch_chunk=1
        )

        if exists(mask):
            raise NotImplementedError

        out = (
            out.unsqueeze(0)
            .reshape(b, self.heads, out.shape[1], self.dim_head)
            .permute(0, 2, 1, 3)
            .reshape(b, out.shape[1], self.heads * self.dim_head)
        )
        out = self.to_out(out)
        out = rearrange(out, "bt (h w) c -> bt c h w", h=h, w=w, c=c)
        return x + out


class Attention(nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.in_channels = in_channels

        self.norm = GroupNorm(in_channels)
        self.q = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )
        self.k = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )
        self.v = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )
        self.proj_out = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )

    def attention(self, h_: torch.Tensor) -> torch.Tensor:
        h_ = self.norm(h_)
        q = self.q(h_)
        k = self.k(h_)
        v = self.v(h_)

        # compute attention
        B, C, H, W = q.shape
        q, k, v = map(lambda x: rearrange(x, "b c h w -> b (h w) c"), (q, k, v))

        q, k, v = map(
            lambda t: t.unsqueeze(3)
            .reshape(B, t.shape[1], 1, C)
            .permute(0, 2, 1, 3)
            .reshape(B * 1, t.shape[1], C)
            .contiguous(),
            (q, k, v),
        )

        out = chunked_attention(
            q, k, v, batch_chunk=1
        )

        out = (
            out.unsqueeze(0)
            .reshape(B, 1, out.shape[1], C)
            .permute(0, 2, 1, 3)
            .reshape(B, out.shape[1], C)
        )
        return rearrange(out, "b (h w) c -> b c h w", b=B, h=H, w=W, c=C)

    def forward(self, x, **kwargs):
        h_ = x
        h_ = self.attention(h_)
        h_ = self.proj_out(h_)
        return x + h_


class VideoDecoder(nn.Module):
    def __init__(
        self,
        *,
        ch,
        out_ch,
        ch_mult=(1, 2, 4, 8),
        num_res_blocks,
        attn_resolutions,
        dropout=0.0,
        resamp_with_conv=True,
        in_channels,
        resolution,
        z_channels,
        give_pre_end=False,
        tanh_out=False,
        use_linear_attn=False,
        attn_level=[2, 3],
        video_kernel_size=[3, 1, 1],
        alpha: float = 0.0,
        merge_strategy: str = "learned",
        **kwargs,
    ):
        super().__init__()
        self.video_kernel_size = video_kernel_size
        self.alpha = alpha
        self.merge_strategy = merge_strategy
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        self.give_pre_end = give_pre_end
        self.tanh_out = tanh_out
        self.attn_level = attn_level
        # compute in_ch_mult, block_in and curr_res at lowest res
        in_ch_mult = (1,) + tuple(ch_mult)
        block_in = ch * ch_mult[self.num_resolutions - 1]
        curr_res = resolution // 2 ** (self.num_resolutions - 1)
        self.z_shape = (1, z_channels, curr_res, curr_res)

        # z to block_in
        self.conv_in = torch.nn.Conv2d(
            z_channels, block_in, kernel_size=3, stride=1, padding=1
        )

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = VideoResBlock(
            in_channels=block_in,
            out_channels=block_in,
            temb_channels=self.temb_ch,
            dropout=dropout,
            video_kernel_size=self.video_kernel_size,
            alpha=self.alpha,
            merge_strategy=self.merge_strategy,
        )
        self.mid.attn_1 = Attention(block_in)
        self.mid.block_2 = VideoResBlock(
            in_channels=block_in,
            out_channels=block_in,
            temb_channels=self.temb_ch,
            dropout=dropout,
            video_kernel_size=self.video_kernel_size,
            alpha=self.alpha,
            merge_strategy=self.merge_strategy,
        )

        # upsampling
        self.up = nn.ModuleList()
        self.attn_refinement = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks + 1):
                block.append(
                    VideoResBlock(
                        in_channels=block_in,
                        out_channels=block_out,
                        temb_channels=self.temb_ch,
                        dropout=dropout,
                        video_kernel_size=self.video_kernel_size,
                        alpha=self.alpha,
                        merge_strategy=self.merge_strategy,
                    )
                )
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn.append(Attention(block_in))
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
                up.upsample = Upsample(block_in, resamp_with_conv)
                curr_res = curr_res * 2
            self.up.insert(0, up)  # prepend to get consistent order

            if i_level in self.attn_level:
                self.attn_refinement.insert(0, AttentionCombiner(block_in))
            else:
                self.attn_refinement.insert(0, ConvCombiner(block_in))
        # end
        self.norm_out = GroupNorm(block_in)
        self.attn_refinement.append(ConvCombiner(block_in))
        self.conv_out = DecoderConv3D(
            block_in, out_ch, kernel_size=3, stride=1, padding=1, video_kernel_size=self.video_kernel_size
        )

    def forward(self, z, ref_context=None, **kwargs):
        ## ref_context: b c 2 h w, 2 means starting and ending frame
        # assert z.shape[1:] == self.z_shape[1:]
        self.last_z_shape = z.shape
        # timestep embedding
        temb = None

        # z to block_in
        h = self.conv_in(z)

        # middle
        h = self.mid.block_1(h, temb, **kwargs)
        h = self.mid.attn_1(h, **kwargs)
        h = self.mid.block_2(h, temb, **kwargs)

        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            for i_block in range(self.num_res_blocks + 1):
                h = self.up[i_level].block[i_block](h, temb, **kwargs)
                if len(self.up[i_level].attn) > 0:
                    h = self.up[i_level].attn[i_block](h, **kwargs)
            if ref_context:
                h = self.attn_refinement[i_level](x=h, context=ref_context[i_level])
            if i_level != 0:
                h = self.up[i_level].upsample(h)

        # end
        if self.give_pre_end:
            return h

        h = self.norm_out(h)
        h = nonlinearity(h)
        if ref_context:
            # print(h.shape, ref_context[i_level].shape) #torch.Size([8, 128, 256, 256]) torch.Size([1, 128, 2, 256, 256])
            h = self.attn_refinement[-1](x=h, context=ref_context[-1])
        h = self.conv_out(h, **kwargs)
        if self.tanh_out:
            h = torch.tanh(h)
        return h


class TimeStackBlock(torch.nn.Module):
    def __init__(
        self,
        channels: int,
        emb_channels: int,
        dropout: float,
        out_channels: int = None,
        use_conv: bool = False,
        use_scale_shift_norm: bool = False,
        dims: int = 2,
        use_checkpoint: bool = False,
        up: bool = False,
        down: bool = False,
        kernel_size: int = 3,
        exchange_temb_dims: bool = False,
        skip_t_emb: bool = False,
    ):
        super().__init__()
        self.channels = channels
        self.emb_channels = emb_channels
        self.dropout = dropout
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.use_checkpoint = use_checkpoint
        self.use_scale_shift_norm = use_scale_shift_norm
        self.exchange_temb_dims = exchange_temb_dims

        if isinstance(kernel_size, list):
            padding = [k // 2 for k in kernel_size]
        else:
            padding = kernel_size // 2

        self.in_layers = nn.Sequential(
            normalization(channels),
            nn.SiLU(),
            conv_nd(dims, channels, self.out_channels, kernel_size, padding=padding),
        )

        self.updown = up or down

        if up:
            self.h_upd = Upsample(channels, False, dims)
            self.x_upd = Upsample(channels, False, dims)
        elif down:
            self.h_upd = Downsample(channels, False, dims)
            self.x_upd = Downsample(channels, False, dims)
        else:
            self.h_upd = self.x_upd = nn.Identity()

        self.skip_t_emb = skip_t_emb
        self.emb_out_channels = (
            2 * self.out_channels if use_scale_shift_norm else self.out_channels
        )
        if self.skip_t_emb:
            # print(f"Skipping timestep embedding in {self.__class__.__name__}")
            assert not self.use_scale_shift_norm
            self.emb_layers = None
            self.exchange_temb_dims = False
        else:
            self.emb_layers = nn.Sequential(
                nn.SiLU(),
                linear(
                    emb_channels,
                    self.emb_out_channels,
                ),
            )

        self.out_layers = nn.Sequential(
            normalization(self.out_channels),
            nn.SiLU(),
            nn.Dropout(p=dropout),
            zero_module(
                conv_nd(
                    dims,
                    self.out_channels,
                    self.out_channels,
                    kernel_size,
                    padding=padding,
                )
            ),
        )

        if self.out_channels == channels:
            self.skip_connection = nn.Identity()
        elif use_conv:
            self.skip_connection = conv_nd(
                dims, channels, self.out_channels, kernel_size, padding=padding
            )
        else:
            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)

    def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor:
        if self.updown:
            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
            h = in_rest(x)
            h = self.h_upd(h)
            x = self.x_upd(x)
            h = in_conv(h)
        else:
            h = self.in_layers(x)

        if self.skip_t_emb:
            emb_out = torch.zeros_like(h)
        else:
            emb_out = self.emb_layers(emb).type(h.dtype)
        while len(emb_out.shape) < len(h.shape):
            emb_out = emb_out[..., None]
        if self.use_scale_shift_norm:
            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
            scale, shift = torch.chunk(emb_out, 2, dim=1)
            h = out_norm(h) * (1 + scale) + shift
            h = out_rest(h)
        else:
            if self.exchange_temb_dims:
                emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
            h = h + emb_out
            h = self.out_layers(h)
        return self.skip_connection(x) + h


class VideoResBlock(ResnetBlock):
    def __init__(
        self,
        out_channels,
        *args,
        dropout=0.0,
        video_kernel_size=3,
        alpha=0.0,
        merge_strategy="learned",
        **kwargs,
    ):
        super().__init__(out_channels=out_channels, dropout=dropout, *args, **kwargs)
        if video_kernel_size is None:
            video_kernel_size = [3, 1, 1]
        self.time_stack = TimeStackBlock(
            channels=out_channels,
            emb_channels=0,
            dropout=dropout,
            dims=3,
            use_scale_shift_norm=False,
            use_conv=False,
            up=False,
            down=False,
            kernel_size=video_kernel_size,
            use_checkpoint=True,
            skip_t_emb=True,
        )

        self.merge_strategy = merge_strategy
        if self.merge_strategy == "fixed":
            self.register_buffer("mix_factor", torch.Tensor([alpha]))
        elif self.merge_strategy == "learned":
            self.register_parameter(
                "mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))
            )
        else:
            raise ValueError(f"unknown merge strategy {self.merge_strategy}")

    def get_alpha(self, bs):
        if self.merge_strategy == "fixed":
            return self.mix_factor
        elif self.merge_strategy == "learned":
            return torch.sigmoid(self.mix_factor)
        else:
            raise NotImplementedError()

    def forward(self, x, temb, skip_video=False, timesteps=None):
        assert isinstance(timesteps, int)

        b, c, h, w = x.shape

        x = super().forward(x, temb)

        if not skip_video:
            x_mix = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)

            x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)

            x = self.time_stack(x, temb)

            alpha = self.get_alpha(bs=b // timesteps)
            x = alpha * x + (1.0 - alpha) * x_mix

            x = rearrange(x, "b c t h w -> (b t) c h w")
        return x


class DecoderConv3D(torch.nn.Conv2d):
    def __init__(self, in_channels, out_channels, video_kernel_size=3, *args, **kwargs):
        super().__init__(in_channels, out_channels, *args, **kwargs)
        if isinstance(video_kernel_size, list):
            padding = [int(k // 2) for k in video_kernel_size]
        else:
            padding = int(video_kernel_size // 2)

        self.time_mix_conv = torch.nn.Conv3d(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=video_kernel_size,
            padding=padding,
        )

    def forward(self, input, timesteps, skip_video=False):
        x = super().forward(input)
        if skip_video:
            return x
        x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
        x = self.time_mix_conv(x)
        return rearrange(x, "b c t h w -> (b t) c h w")


class VideoAutoencoderKL(torch.nn.Module, PyTorchModelHubMixin):
    def __init__(self,
                 double_z=True,
                 z_channels=4,
                 resolution=256,
                 in_channels=3,
                 out_ch=3,
                 ch=128,
                 ch_mult=[],
                 num_res_blocks=2,
                 attn_resolutions=[],
                 dropout=0.0,
                 ):
        super().__init__()
        self.encoder = Encoder(double_z=double_z, z_channels=z_channels, resolution=resolution, in_channels=in_channels,
                               out_ch=out_ch, ch=ch, ch_mult=ch_mult, num_res_blocks=num_res_blocks,
                               attn_resolutions=attn_resolutions, dropout=dropout)
        self.decoder = VideoDecoder(double_z=double_z, z_channels=z_channels, resolution=resolution,
                                    in_channels=in_channels, out_ch=out_ch, ch=ch, ch_mult=ch_mult,
                                    num_res_blocks=num_res_blocks, attn_resolutions=attn_resolutions, dropout=dropout)
        self.quant_conv = torch.nn.Conv2d(2 * z_channels, 2 * z_channels, 1)
        self.post_quant_conv = torch.nn.Conv2d(z_channels, z_channels, 1)
        self.scale_factor = 0.18215

    def encode(self, x, return_hidden_states=False, **kwargs):
        if return_hidden_states:
            h, hidden = self.encoder(x, return_hidden_states)
            moments = self.quant_conv(h)
            posterior = DiagonalGaussianDistribution(moments)
            return posterior, hidden
        else:
            h = self.encoder(x)
            moments = self.quant_conv(h)
            posterior = DiagonalGaussianDistribution(moments)
            return posterior, None

    def decode(self, z, **kwargs):
        if len(kwargs) == 0:
            z = self.post_quant_conv(z)
        dec = self.decoder(z, **kwargs)
        return dec

    @property
    def device(self):
        return next(self.parameters()).device

    @property
    def dtype(self):
        return next(self.parameters()).dtype
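A minimal encode-path sketch for the video VAE above, not part of the commit. The constructor arguments below are assumed toy values chosen so the example is small and self-contained; the real configuration is loaded from the pretrained repo via PyTorchModelHubMixin. Frames are passed with time flattened into the batch dimension, and decoding additionally expects a `timesteps` keyword so the temporal layers know the frame count.

# --- usage sketch (not part of the commit); config values are assumptions ---
import torch
from diffusers_vdm.vae import VideoAutoencoderKL

vae = VideoAutoencoderKL(ch=64, ch_mult=[1, 2, 4, 4], num_res_blocks=1,
                         attn_resolutions=[], resolution=256,
                         in_channels=3, out_ch=3, z_channels=4)

frames = torch.randn(2, 3, 64, 64)          # (b*t) c h w, frames flattened into the batch
posterior, _ = vae.encode(frames)
z = posterior.sample() * vae.scale_factor    # (b*t, 4, 8, 8) latents
print(z.shape)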
gradio_app.py
ADDED
@@ -0,0 +1,324 @@
import os

os.environ['HF_HOME'] = os.path.join(os.path.dirname(__file__), 'hf_download')
result_dir = os.path.join('./', 'results')
os.makedirs(result_dir, exist_ok=True)


import functools
import os
import random
import gradio as gr
import numpy as np
import torch
import wd14tagger
import memory_management
import uuid

from PIL import Image
from diffusers_helper.code_cond import unet_add_coded_conds
from diffusers_helper.cat_cond import unet_add_concat_conds
from diffusers_helper.k_diffusion import KDiffusionSampler
from diffusers import AutoencoderKL, UNet2DConditionModel
from diffusers.models.attention_processor import AttnProcessor2_0
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers_vdm.pipeline import LatentVideoDiffusionPipeline
from diffusers_vdm.utils import resize_and_center_crop, save_bcthw_as_mp4
import spaces


class ModifiedUNet(UNet2DConditionModel):
    @classmethod
    def from_config(cls, *args, **kwargs):
        m = super().from_config(*args, **kwargs)
        unet_add_concat_conds(unet=m, new_channels=4)
        unet_add_coded_conds(unet=m, added_number_count=1)
        return m


model_name = 'lllyasviel/paints_undo_single_frame'
tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder").to(torch.float16)
vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae").to(torch.bfloat16)  # bfloat16 vae
unet = ModifiedUNet.from_pretrained(model_name, subfolder="unet").to(torch.float16)

unet.set_attn_processor(AttnProcessor2_0())
vae.set_attn_processor(AttnProcessor2_0())

video_pipe = LatentVideoDiffusionPipeline.from_pretrained(
    'lllyasviel/paints_undo_multi_frame',
    fp16=True
)

memory_management.unload_all_models([
    video_pipe.unet, video_pipe.vae, video_pipe.text_encoder, video_pipe.image_projection, video_pipe.image_encoder,
    unet, vae, text_encoder
])

k_sampler = KDiffusionSampler(
    unet=unet,
    timesteps=1000,
    linear_start=0.00085,
    linear_end=0.020,
    linear=True
)


def find_best_bucket(h, w, options):
    min_metric = float('inf')
    best_bucket = None
    for (bucket_h, bucket_w) in options:
        metric = abs(h * bucket_w - w * bucket_h)
        if metric <= min_metric:
            min_metric = metric
            best_bucket = (bucket_h, bucket_w)
    return best_bucket


@torch.inference_mode()
def encode_cropped_prompt_77tokens(txt: str):
    memory_management.load_models_to_gpu(text_encoder)
    cond_ids = tokenizer(txt,
                         padding="max_length",
                         max_length=tokenizer.model_max_length,
                         truncation=True,
                         return_tensors="pt").input_ids.to(device=text_encoder.device)
    text_cond = text_encoder(cond_ids, attention_mask=None).last_hidden_state
    return text_cond


@torch.inference_mode()
def pytorch2numpy(imgs):
    results = []
    for x in imgs:
        y = x.movedim(0, -1)
        y = y * 127.5 + 127.5
        y = y.detach().float().cpu().numpy().clip(0, 255).astype(np.uint8)
        results.append(y)
    return results


@torch.inference_mode()
def numpy2pytorch(imgs):
    h = torch.from_numpy(np.stack(imgs, axis=0)).float() / 127.5 - 1.0
    h = h.movedim(-1, 1)
    return h


def resize_without_crop(image, target_width, target_height):
    pil_image = Image.fromarray(image)
    resized_image = pil_image.resize((target_width, target_height), Image.LANCZOS)
    return np.array(resized_image)


@torch.inference_mode()
@spaces.GPU
def interrogator_process(x):
    return wd14tagger.default_interrogator(x)


@torch.inference_mode()
@spaces.GPU
def process(input_fg, prompt, input_undo_steps, image_width, image_height, seed, steps, n_prompt, cfg,
            progress=gr.Progress()):
    rng = torch.Generator(device=memory_management.gpu).manual_seed(int(seed))

    memory_management.load_models_to_gpu(vae)
    fg = resize_and_center_crop(input_fg, image_width, image_height)
    concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor

    memory_management.load_models_to_gpu(text_encoder)
    conds = encode_cropped_prompt_77tokens(prompt)
    unconds = encode_cropped_prompt_77tokens(n_prompt)

    memory_management.load_models_to_gpu(unet)
    fs = torch.tensor(input_undo_steps).to(device=unet.device, dtype=torch.long)
    initial_latents = torch.zeros_like(concat_conds)
    concat_conds = concat_conds.to(device=unet.device, dtype=unet.dtype)
    latents = k_sampler(
        initial_latent=initial_latents,
        strength=1.0,
        num_inference_steps=steps,
        guidance_scale=cfg,
        batch_size=len(input_undo_steps),
        generator=rng,
        prompt_embeds=conds,
        negative_prompt_embeds=unconds,
        cross_attention_kwargs={'concat_conds': concat_conds, 'coded_conds': fs},
        same_noise_in_batch=True,
        progress_tqdm=functools.partial(progress.tqdm, desc='Generating Key Frames')
    ).to(vae.dtype) / vae.config.scaling_factor

    memory_management.load_models_to_gpu(vae)
    pixels = vae.decode(latents).sample
    pixels = pytorch2numpy(pixels)
    pixels = [fg] + pixels + [np.zeros_like(fg) + 255]

    return pixels


@torch.inference_mode()
def process_video_inner(image_1, image_2, prompt, seed=123, steps=25, cfg_scale=7.5, fs=3, progress_tqdm=None):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    frames = 16

    target_height, target_width = find_best_bucket(
        image_1.shape[0], image_1.shape[1],
        options=[(320, 512), (384, 448), (448, 384), (512, 320)]
    )

    image_1 = resize_and_center_crop(image_1, target_width=target_width, target_height=target_height)
    image_2 = resize_and_center_crop(image_2, target_width=target_width, target_height=target_height)
    input_frames = numpy2pytorch([image_1, image_2])
    input_frames = input_frames.unsqueeze(0).movedim(1, 2)

    memory_management.load_models_to_gpu(video_pipe.text_encoder)
    positive_text_cond = video_pipe.encode_cropped_prompt_77tokens(prompt)
    negative_text_cond = video_pipe.encode_cropped_prompt_77tokens("")

    memory_management.load_models_to_gpu([video_pipe.image_projection, video_pipe.image_encoder])
    input_frames = input_frames.to(device=video_pipe.image_encoder.device, dtype=video_pipe.image_encoder.dtype)
    positive_image_cond = video_pipe.encode_clip_vision(input_frames)
    positive_image_cond = video_pipe.image_projection(positive_image_cond)
    negative_image_cond = video_pipe.encode_clip_vision(torch.zeros_like(input_frames))
    negative_image_cond = video_pipe.image_projection(negative_image_cond)

    memory_management.load_models_to_gpu([video_pipe.vae])
    input_frames = input_frames.to(device=video_pipe.vae.device, dtype=video_pipe.vae.dtype)
    input_frame_latents, vae_hidden_states = video_pipe.encode_latents(input_frames, return_hidden_states=True)
    first_frame = input_frame_latents[:, :, 0]
    last_frame = input_frame_latents[:, :, 1]
    concat_cond = torch.stack([first_frame] + [torch.zeros_like(first_frame)] * (frames - 2) + [last_frame], dim=2)

    memory_management.load_models_to_gpu([video_pipe.unet])
    latents = video_pipe(
        batch_size=1,
        steps=int(steps),
        guidance_scale=cfg_scale,
        positive_text_cond=positive_text_cond,
        negative_text_cond=negative_text_cond,
        positive_image_cond=positive_image_cond,
        negative_image_cond=negative_image_cond,
        concat_cond=concat_cond,
        fs=fs,
        progress_tqdm=progress_tqdm
    )

    memory_management.load_models_to_gpu([video_pipe.vae])
    video = video_pipe.decode_latents(latents, vae_hidden_states)
    return video, image_1, image_2


@torch.inference_mode()
@spaces.GPU
def process_video(keyframes, prompt, steps, cfg, fps, seed, progress=gr.Progress()):
    result_frames = []
    cropped_images = []

    for i, (im1, im2) in enumerate(zip(keyframes[:-1], keyframes[1:])):
        im1 = np.array(Image.open(im1[0]))
        im2 = np.array(Image.open(im2[0]))
        frames, im1, im2 = process_video_inner(
            im1, im2, prompt, seed=seed + i, steps=steps, cfg_scale=cfg, fs=3,
            progress_tqdm=functools.partial(progress.tqdm, desc=f'Generating Videos ({i + 1}/{len(keyframes) - 1})')
        )
        result_frames.append(frames[:, :, :-1, :, :])
        cropped_images.append([im1, im2])

    video = torch.cat(result_frames, dim=2)
    video = torch.flip(video, dims=[2])

    uuid_name = str(uuid.uuid4())
    output_filename = os.path.join(result_dir, uuid_name + '.mp4')
    Image.fromarray(cropped_images[0][0]).save(os.path.join(result_dir, uuid_name + '.png'))
    video = save_bcthw_as_mp4(video, output_filename, fps=fps)
    video = [x.cpu().numpy() for x in video]
    return output_filename, video


block = gr.Blocks().queue()
with block:
    gr.Markdown('# Paints-Undo')

    with gr.Accordion(label='Step 1: Upload Image and Generate Prompt', open=True):
        with gr.Row():
            with gr.Column():
                input_fg = gr.Image(sources=['upload'], type="numpy", label="Image", height=512)
            with gr.Column():
                prompt_gen_button = gr.Button(value="Generate Prompt", interactive=False)
                prompt = gr.Textbox(label="Output Prompt", interactive=True)

    with gr.Accordion(label='Step 2: Generate Key Frames', open=True):
        with gr.Row():
            with gr.Column():
                input_undo_steps = gr.Dropdown(label="Operation Steps", value=[400, 600, 800, 900, 950, 999],
                                               choices=list(range(1000)), multiselect=True)
                seed = gr.Slider(label='Stage 1 Seed', minimum=0, maximum=50000, step=1, value=12345)
                image_width = gr.Slider(label="Image Width", minimum=256, maximum=1024, value=512, step=64)
                image_height = gr.Slider(label="Image Height", minimum=256, maximum=1024, value=640, step=64)
                steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=50, step=1)
                cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=3.0, step=0.01)
                n_prompt = gr.Textbox(label="Negative Prompt",
                                      value='lowres, bad anatomy, bad hands, cropped, worst quality')

            with gr.Column():
                key_gen_button = gr.Button(value="Generate Key Frames", interactive=False)
                result_gallery = gr.Gallery(height=512, object_fit='contain', label='Outputs', columns=4)

    with gr.Accordion(label='Step 3: Generate All Videos', open=True):
        with gr.Row():
            with gr.Column():
                i2v_input_text = gr.Text(label='Prompts', value='1girl, masterpiece, best quality')
                i2v_seed = gr.Slider(label='Stage 2 Seed', minimum=0, maximum=50000, step=1, value=123)
                i2v_cfg_scale = gr.Slider(minimum=1.0, maximum=15.0, step=0.5, label='CFG Scale', value=7.5,
                                          elem_id="i2v_cfg_scale")
                i2v_steps = gr.Slider(minimum=1, maximum=60, step=1, elem_id="i2v_steps",
                                      label="Sampling steps", value=50)
                i2v_fps = gr.Slider(minimum=1, maximum=30, step=1, elem_id="i2v_motion", label="FPS", value=4)
            with gr.Column():
                i2v_end_btn = gr.Button("Generate Video", interactive=False)
                i2v_output_video = gr.Video(label="Generated Video", elem_id="output_vid", autoplay=True,
                                            show_share_button=True, height=512)
        with gr.Row():
            i2v_output_images = gr.Gallery(height=512, label="Output Frames", object_fit="contain", columns=8)

    input_fg.change(lambda: ["", gr.update(interactive=True), gr.update(interactive=False), gr.update(interactive=False)],
                    outputs=[prompt, prompt_gen_button, key_gen_button, i2v_end_btn])

    prompt_gen_button.click(
        fn=interrogator_process,
        inputs=[input_fg],
        outputs=[prompt]
    ).then(lambda: [gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=False)],
           outputs=[prompt_gen_button, key_gen_button, i2v_end_btn])

    key_gen_button.click(
        fn=process,
        inputs=[input_fg, prompt, input_undo_steps, image_width, image_height, seed, steps, n_prompt, cfg],
        outputs=[result_gallery]
    ).then(lambda: [gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)],
           outputs=[prompt_gen_button, key_gen_button, i2v_end_btn])

    i2v_end_btn.click(
        inputs=[result_gallery, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_fps, i2v_seed],
        outputs=[i2v_output_video, i2v_output_images],
        fn=process_video
    )

    dbs = [
        ['./imgs/1.jpg', 12345, 123],
        ['./imgs/2.jpg', 37000, 12345],
        ['./imgs/3.jpg', 3000, 3000],
    ]

    gr.Examples(
        examples=dbs,
        inputs=[input_fg, seed, i2v_seed],
        examples_per_page=1024
    )

block.queue().launch(server_name='0.0.0.0')
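For reference, a quick check of the bucket selection used in process_video_inner above: the metric |h * bucket_w - w * bucket_h| is zero when the aspect ratios match exactly, so a landscape input is routed to a landscape bucket. This snippet assumes the find_best_bucket function defined in gradio_app.py is importable or pasted alongside it.

# --- usage sketch (not part of the commit) ---
# a 720x1280 (h x w) landscape image maps to the 320x512 bucket
print(find_best_bucket(720, 1280, [(320, 512), (384, 448), (448, 384), (512, 320)]))
# -> (320, 512)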
imgs/1.jpg
ADDED
imgs/2.jpg
ADDED
imgs/3.jpg
ADDED
memory_management.py
ADDED
@@ -0,0 +1,67 @@
import torch
from contextlib import contextmanager


high_vram = False
gpu = torch.device('cuda')
cpu = torch.device('cpu')

torch.zeros((1, 1)).to(gpu, torch.float32)
torch.cuda.empty_cache()

models_in_gpu = []


@contextmanager
def movable_bnb_model(m):
    if hasattr(m, 'quantization_method'):
        m.quantization_method_backup = m.quantization_method
        del m.quantization_method
    try:
        yield None
    finally:
        if hasattr(m, 'quantization_method_backup'):
            m.quantization_method = m.quantization_method_backup
            del m.quantization_method_backup
    return


def load_models_to_gpu(models):
    global models_in_gpu

    if not isinstance(models, (tuple, list)):
        models = [models]

    models_to_remain = [m for m in set(models) if m in models_in_gpu]
    models_to_load = [m for m in set(models) if m not in models_in_gpu]
    models_to_unload = [m for m in set(models_in_gpu) if m not in models_to_remain]

    if not high_vram:
        for m in models_to_unload:
            with movable_bnb_model(m):
                m.to(cpu)
            print('Unload to CPU:', m.__class__.__name__)
        models_in_gpu = models_to_remain

    for m in models_to_load:
        with movable_bnb_model(m):
            m.to(gpu)
        print('Load to GPU:', m.__class__.__name__)

    models_in_gpu = list(set(models_in_gpu + models))
    torch.cuda.empty_cache()
    return


def unload_all_models(extra_models=None):
    global models_in_gpu

    if extra_models is None:
        extra_models = []

    if not isinstance(extra_models, (tuple, list)):
        extra_models = [extra_models]

    models_in_gpu = list(set(models_in_gpu + extra_models))

    return load_models_to_gpu([])
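A short usage sketch of the swap-in/swap-out helpers above (the linear layers are placeholders for real models): with high_vram left False, only the models passed to the latest load_models_to_gpu call stay resident on the GPU. Note the module allocates on cuda at import time, so this requires a CUDA device.

# --- usage sketch (not part of the commit); requires a CUDA device ---
import torch
import memory_management

m1 = torch.nn.Linear(8, 8)
m2 = torch.nn.Linear(8, 8)

memory_management.load_models_to_gpu(m1)        # m1 -> GPU
memory_management.load_models_to_gpu([m2])      # m1 -> CPU, m2 -> GPU (low-VRAM mode)
memory_management.unload_all_models([m1, m2])   # everything back to CPU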
requirements.txt
ADDED
@@ -0,0 +1,19 @@
diffusers==0.28.0
transformers==4.41.1
gradio==4.31.5
bitsandbytes==0.43.1
accelerate==0.30.1
protobuf==3.20
opencv-python
tensorboardX
safetensors
pillow
einops
torch
peft
xformers
onnxruntime
av
torchvision
spaces
wd14tagger.py
ADDED
@@ -0,0 +1,105 @@
# https://huggingface.co/spaces/SmilingWolf/wd-v1-4-tags


import os
import csv
import numpy as np
import onnxruntime as ort

from PIL import Image
from onnxruntime import InferenceSession
from torch.hub import download_url_to_file


global_model = None
global_csv = None


def download_model(url, local_path):
    if os.path.exists(local_path):
        return local_path

    temp_path = local_path + '.tmp'
    download_url_to_file(url=url, dst=temp_path)
    os.rename(temp_path, local_path)
    return local_path


def default_interrogator(image, threshold=0.35, character_threshold=0.85, exclude_tags=""):
    global global_model, global_csv

    model_name = "wd-v1-4-moat-tagger-v2"

    model_onnx_filename = download_model(
        url=f'https://huggingface.co/lllyasviel/misc/resolve/main/{model_name}.onnx',
        local_path=f'./{model_name}.onnx',
    )

    model_csv_filename = download_model(
        url=f'https://huggingface.co/lllyasviel/misc/resolve/main/{model_name}.csv',
        local_path=f'./{model_name}.csv',
    )

    if global_model is not None:
        model = global_model
    else:
        # assert 'CUDAExecutionProvider' in ort.get_available_providers(), 'CUDA Install Failed!'
        # model = InferenceSession(model_onnx_filename, providers=['CUDAExecutionProvider'])
        model = InferenceSession(model_onnx_filename, providers=['CPUExecutionProvider'])
        global_model = model

    input = model.get_inputs()[0]
    height = input.shape[1]

    if isinstance(image, str):
        image = Image.open(image)  # RGB
    elif isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    else:
        image = image

    ratio = float(height) / max(image.size)
    new_size = tuple([int(x * ratio) for x in image.size])
    image = image.resize(new_size, Image.LANCZOS)
    square = Image.new("RGB", (height, height), (255, 255, 255))
    square.paste(image, ((height - new_size[0]) // 2, (height - new_size[1]) // 2))

    image = np.array(square).astype(np.float32)
    image = image[:, :, ::-1]  # RGB -> BGR
    image = np.expand_dims(image, 0)

    if global_csv is not None:
        csv_lines = global_csv
    else:
        csv_lines = []
        with open(model_csv_filename) as f:
            reader = csv.reader(f)
            next(reader)
            for row in reader:
                csv_lines.append(row)
        global_csv = csv_lines

    tags = []
    general_index = None
    character_index = None
    for line_num, row in enumerate(csv_lines):
        if general_index is None and row[2] == "0":
            general_index = line_num
        elif character_index is None and row[2] == "4":
            character_index = line_num
        tags.append(row[1])

    label_name = model.get_outputs()[0].name
    probs = model.run([label_name], {input.name: image})[0]

    result = list(zip(tags, probs[0]))

    general = [item for item in result[general_index:character_index] if item[1] > threshold]
    character = [item for item in result[character_index:] if item[1] > character_threshold]

    all = character + general
    remove = [s.strip() for s in exclude_tags.lower().split(",")]
    all = [tag for tag in all if tag[0] not in remove]

    res = ", ".join((item[0].replace("(", "\\(").replace(")", "\\)") for item in all)).replace('_', ' ')
    return res
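A minimal call into the tagger above: the first run downloads the ONNX model and tag CSV next to the script, later runs reuse the cached session and tag list. The random image here is only a placeholder; any RGB numpy array or image file path works.

# --- usage sketch (not part of the commit) ---
import numpy as np
from wd14tagger import default_interrogator

image = (np.random.rand(448, 448, 3) * 255).astype(np.uint8)  # placeholder RGB input
tags = default_interrogator(image, threshold=0.35, exclude_tags="simple background")
print(tags)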