gustproof committed
Commit fb3e84a · 1 Parent(s): 131e57e
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ *.ipynb
app.py ADDED
@@ -0,0 +1,108 @@
+ from lib.get_model import get_model, device
+ from lib.sampling import edm_sampler
+ from lib.embedding import extract_features
+ from lib.encoders import StabilityVAEEncoder
+ from lib.cond_gen import get_vae_decoder
+ from safetensors.torch import load_file
+ from torchvision.transforms import ToPILImage
+ import torch
+ import gradio as gr
+ import json
+
+
+ torch.set_grad_enabled(False)
+ net = get_model()
+
+ net.load_state_dict(load_file("model_weights/1girl-edm-xs-test-1.safetensors"))
+
+
+ cond_gen = get_vae_decoder().to(device)
+ cond_gen.load_state_dict(load_file("model_weights/condgen_vae_decoder.safetensors"))
+
+ stability_encoder = StabilityVAEEncoder()
+
+
+ def guided(net, scale=1):
+     def f(x, t, label):
+         if scale == 1:
+             return net(x, t, label)
+         return torch.lerp(net(x, t, net.uncond_emb), net(x, t, label), float(scale))
+
+     return f
+
+
+ @torch.no_grad()
+ def generate_image(label, guidance_scale, n_steps, seed):
+     label = torch.tensor(label)[None].to(device)
+     gen = torch.Generator(device).manual_seed(seed)
+     x = torch.randn((1, 4, 88, 64), device=device, generator=gen)
+     randn_like = lambda *a, **ka: torch.zeros_like(*a, **ka).normal_(generator=gen)
+     im = edm_sampler(
+         guided(net, guidance_scale), x, label, num_steps=n_steps, randn_like=randn_like
+     )
+     im = stability_encoder.decode(im)
+     return ToPILImage()(im[0])
+
+
+ with gr.Blocks() as demo:
+     selected = [0]
+     with gr.Row():
+         gr.Markdown(
+             """# 1girl-EDM2-XS-test-1 Demo
+ Demo of a 125M param model trained in 1 GPU-day for generating `1girl solo` images.
+ """
+         )
+     with gr.Row():
+         with gr.Column():
+             with gr.Group():
+                 btn = gr.Button("Generate", variant="primary")
+                 guidance = gr.Slider(1, 15, 5, step=0.1, label="Guidance scale")
+                 n_steps = gr.Slider(2, 35, 24, step=1, label="Inference steps")
+                 seed = gr.Slider(
+                     -1, 2147483647, -1, step=1, label="Random seed (-1: randomize)"
+                 )
+             with gr.Tab("Condition: auto") as auto_tab:
+                 gr.Markdown("Conditioning is generated with an external model")
+             with gr.Tab("Condition: from image") as img_tab:
+                 gr.Markdown(
+                     "Conditioning is extracted from the image with a [tagger](https://huggingface.co/SmilingWolf/wd-eva02-large-tagger-v3)"
+                 )
+                 ref_im = gr.Image(label="Reference image", type="pil")
+             with gr.Tab("Condition: precomputed") as txt_tab:
+                 gr.Markdown("Use a precomputed 1024D vector as the condition")
+                 ref_txt = gr.TextArea(
+                     label="Precomputed conditioning",
+                     placeholder="Copy & Paste from the output",
+                 )
+         with gr.Column():
+             out_im = gr.Image(label="Generated Image", show_download_button=True)
+             out_seed = gr.Textbox(label="Seed", show_copy_button=True)
+             out_emb = gr.TextArea(label="Condition vector", show_copy_button=True)
+
+     @torch.no_grad()
+     def get_label(tab_index, cond_img=None, cond_txt=None):
+         if tab_index == 0:
+             return cond_gen(torch.randn((1, 512), device=device))[0].detach().cpu()
+         if tab_index == 1:
+             return extract_features(cond_img, device)
+         return torch.tensor(json.loads(cond_txt))
+
+     def on_select(e: gr.SelectData):
+         selected[0] = e.index
+
+     for t in [auto_tab, img_tab, txt_tab]:
+         t.select(on_select)
+
+     def main(guidance, n_steps, seed, cond_img=None, cond_txt=None):
+         if seed < 0:
+             seed = torch.randint(0, 2147483647, ()).item()
+         label = get_label(selected[0], cond_img, cond_txt)
+         im = generate_image(label, guidance, n_steps, seed)
+         label_txt = json.dumps(label.numpy().astype(float).round(3).tolist())
+         return im, seed, label_txt
+
+     btn.click(
+         main, [guidance, n_steps, seed, ref_im, ref_txt], [out_im, out_seed, out_emb]
+     )
+
+ demo.launch()
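Note on `guided()`: it is a classifier-free-guidance wrapper, since `torch.lerp(uncond, cond, s)` equals `uncond + s * (cond - uncond)`; scale 1 returns the conditional prediction unchanged, and larger scales extrapolate away from the prediction made with the learned `net.uncond_emb`. A minimal sketch of that identity, using hypothetical stand-in tensors (not part of the committed code):

    import torch

    uncond = torch.tensor([0.10, -0.20])  # hypothetical unconditional denoiser output
    cond = torch.tensor([0.30, 0.40])     # hypothetical conditional denoiser output
    scale = 5.0
    assert torch.allclose(
        torch.lerp(uncond, cond, scale),   # what guided() computes
        uncond + scale * (cond - uncond),  # the usual CFG formulation
    )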
lib/cond_gen.py ADDED
@@ -0,0 +1,10 @@
+ from torch import nn
+
+ def get_vae_decoder():
+     return nn.Sequential(
+         nn.Linear(512, 512),
+         nn.SiLU(),
+         nn.Linear(512, 768),
+         nn.SiLU(),
+         nn.Linear(768, 1024),
+     )
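For context, `app.py` uses this MLP as the "Condition: auto" generator: it decodes a 512-d standard-normal latent into the 1024-d tag-embedding space the diffusion model is conditioned on (`label_dim=1024` in `lib/get_model.py`). A rough shape-only sketch, with untrained weights:

    import torch
    from lib.cond_gen import get_vae_decoder

    decoder = get_vae_decoder().eval()
    z = torch.randn(1, 512)  # VAE-style latent sampled from N(0, I)
    cond = decoder(z)        # conditioning vector for the diffusion model
    print(cond.shape)        # torch.Size([1, 1024])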
lib/embedding.py ADDED
@@ -0,0 +1,77 @@
+ # Modified from https://huggingface.co/spaces/SmilingWolf/wd-tagger/blob/main/app.py
+
+ import os
+ from PIL import Image
+ import timm
+ import torch
+
+ if torch.cuda.is_available():
+     os.environ["ONNX_MODE"] = "cuda"
+ EVA02_LARGE_MODEL_DSV3_REPO = "SmilingWolf/wd-eva02-large-tagger-v3"
+
+
+ class Predictor:
+     def __init__(self):
+         self.model_target_size = None
+         self.last_loaded_repo = None
+
+     def load_model(self, model_repo):
+         if model_repo == self.last_loaded_repo:
+             return
+
+         model = timm.create_model("hf-hub:" + model_repo).eval()
+         state_dict = timm.models.load_state_dict_from_hf(model_repo)
+         model.load_state_dict(state_dict)
+         self.transform = timm.data.create_transform(
+             **timm.data.resolve_data_config(model.pretrained_cfg, model=model)
+         )
+
+         self.model_target_size = self.transform.transforms[0].size
+         self.last_loaded_repo = model_repo
+         self.model = model
+
+     def prepare_image(self, image):
+         target_size = self.model_target_size
+
+         canvas = Image.new("RGBA", image.size, (255, 255, 255))
+         canvas.alpha_composite(image)
+         image = canvas.convert("RGB")
+
+         # Pad image to square
+         image_shape = image.size
+         max_dim = max(image_shape)
+         pad_left = (max_dim - image_shape[0]) // 2
+         pad_top = (max_dim - image_shape[1]) // 2
+
+         padded_image = Image.new("RGB", (max_dim, max_dim), (255, 255, 255))
+         padded_image.paste(image, (pad_left, pad_top))
+
+         # Resize
+         if max_dim != target_size:
+             padded_image = padded_image.resize(
+                 (target_size, target_size),
+                 Image.BICUBIC,
+             )
+         return self.transform(padded_image)[[2, 1, 0]].clone()
+
+     @torch.no_grad()
+     def predict(
+         self,
+         images,
+         model_repo,
+     ):
+         self.load_model(model_repo)
+         feat = self.model.forward_features(images)
+         feat_pooled = feat[:, self.model.num_prefix_tokens :].mean(dim=1)
+         return feat_pooled
+
+
+ predictor = Predictor()
+ predictor.load_model(EVA02_LARGE_MODEL_DSV3_REPO)
+
+
+ def extract_features(im: Image.Image, device):
+     predictor.model.to(device)
+     ims = predictor.prepare_image(im.convert("RGBA"))[None]
+     feat = predictor.predict(ims.to(device), EVA02_LARGE_MODEL_DSV3_REPO)
+     return feat[0].cpu()
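The tagger is used purely as a feature extractor: the classification head is never called, the non-prefix tokens from `forward_features` are mean-pooled into one vector per image, and the `[[2, 1, 0]]` index in `prepare_image` swaps RGB to BGR, presumably to mirror the tagger's original preprocessing. A minimal usage sketch (the file name is hypothetical, and the 1024-d output size is an assumption based on the EVA02-Large embedding width):

    import torch
    from PIL import Image
    from lib.embedding import extract_features

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    im = Image.open("reference.png")     # hypothetical input image
    feat = extract_features(im, device)  # pooled feature vector, expected shape (1024,)
    print(feat.shape)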
lib/encoders.py ADDED
@@ -0,0 +1,171 @@
+ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ #
+ # This work is licensed under a Creative Commons
+ # Attribution-NonCommercial-ShareAlike 4.0 International License.
+ # You should have received a copy of the license along with this
+ # work. If not, see http://creativecommons.org/licenses/by-nc-sa/4.0/
+
+ """Converting between pixel and latent representations of image data."""
+
+ import warnings
+ import numpy as np
+ import torch
+
+ warnings.filterwarnings('ignore', 'torch.utils._pytree._register_pytree_node is deprecated.')
+ warnings.filterwarnings('ignore', '`resume_download` is deprecated')
+
+
+
+ _constant_cache = dict()
+
+ def constant(value, shape=None, dtype=None, device=None, memory_format=None):
+     value = np.asarray(value)
+     if shape is not None:
+         shape = tuple(shape)
+     if dtype is None:
+         dtype = torch.get_default_dtype()
+     if device is None:
+         device = torch.device('cpu')
+     if memory_format is None:
+         memory_format = torch.contiguous_format
+
+     key = (value.shape, value.dtype, value.tobytes(), shape, dtype, device, memory_format)
+     tensor = _constant_cache.get(key, None)
+     if tensor is None:
+         tensor = torch.as_tensor(value.copy(), dtype=dtype, device=device)
+         if shape is not None:
+             tensor, _ = torch.broadcast_tensors(tensor, torch.empty(shape))
+         tensor = tensor.contiguous(memory_format=memory_format)
+         _constant_cache[key] = tensor
+     return tensor
+
+ #----------------------------------------------------------------------------
+ # Variant of constant() that inherits dtype and device from the given
+ # reference tensor by default.
+
+ def const_like(ref, value, shape=None, dtype=None, device=None, memory_format=None):
+     if dtype is None:
+         dtype = ref.dtype
+     if device is None:
+         device = ref.device
+     return constant(value, shape=shape, dtype=dtype, device=device, memory_format=memory_format)
+
+ #----------------------------------------------------------------------------
+ # Abstract base class for encoders/decoders that convert back and forth
+ # between pixel and latent representations of image data.
+ #
+ # Logically, "raw pixels" are first encoded into "raw latents" that are
+ # then further encoded into "final latents". Decoding, on the other hand,
+ # goes directly from the final latents to raw pixels. The final latents are
+ # used as inputs and outputs of the model, whereas the raw latents are
+ # stored in the dataset. This separation provides added flexibility in terms
+ # of performing just-in-time adjustments, such as data whitening, without
+ # having to construct a new dataset.
+ #
+ # All image data is represented as PyTorch tensors in NCHW order.
+ # Raw pixels are represented as 3-channel uint8.
+
+ class Encoder:
+     def __init__(self):
+         pass
+
+     def init(self, device): # force lazy init to happen now
+         pass
+
+     def __getstate__(self):
+         return self.__dict__
+
+     def encode(self, x): # raw pixels => final latents
+         return self.encode_latents(self.encode_pixels(x))
+
+     def encode_pixels(self, x): # raw pixels => raw latents
+         raise NotImplementedError # to be overridden by subclass
+
+     def encode_latents(self, x): # raw latents => final latents
+         raise NotImplementedError # to be overridden by subclass
+
+     def decode(self, x): # final latents => raw pixels
+         raise NotImplementedError # to be overridden by subclass
+
+ #----------------------------------------------------------------------------
+ # Standard RGB encoder that scales the pixel data into [-1, +1].
+
+ class StandardRGBEncoder(Encoder):
+     def __init__(self):
+         super().__init__()
+
+     def encode_pixels(self, x): # raw pixels => raw latents
+         return x
+
+     def encode_latents(self, x): # raw latents => final latents
+         return x.to(torch.float32) / 127.5 - 1
+
+     def decode(self, x): # final latents => raw pixels
+         return (x.to(torch.float32) * 127.5 + 128).clip(0, 255).to(torch.uint8)
+
+ #----------------------------------------------------------------------------
+ # Pre-trained VAE encoder from Stability AI.
+
+ class StabilityVAEEncoder(Encoder):
+     def __init__(self,
+         vae_name = 'stabilityai/sd-vae-ft-mse', # Name of the VAE to use.
+         raw_mean = [5.81, 3.25, 0.12, -2.15], # Assumed mean of the raw latents.
+         raw_std = [4.17, 4.62, 3.71, 3.28], # Assumed standard deviation of the raw latents.
+         final_mean = 0, # Desired mean of the final latents.
+         final_std = 0.5, # Desired standard deviation of the final latents.
+         batch_size = 8, # Batch size to use when running the VAE.
+     ):
+         super().__init__()
+         self.vae_name = vae_name
+         self.scale = np.float32(final_std) / np.float32(raw_std)
+         self.bias = np.float32(final_mean) - np.float32(raw_mean) * self.scale
+         self.batch_size = int(batch_size)
+         self._vae = None
+
+     def init(self, device): # force lazy init to happen now
+         super().init(device)
+         if self._vae is None:
+             self._vae = load_stability_vae(self.vae_name, device=device)
+         else:
+             self._vae.to(device)
+
+     def __getstate__(self):
+         return dict(super().__getstate__(), _vae=None) # do not pickle the vae
+
+     def _run_vae_encoder(self, x):
+         d = self._vae.encode(x)['latent_dist']
+         return torch.cat([d.mean, d.std], dim=1)
+
+     def _run_vae_decoder(self, x):
+         return self._vae.decode(x)['sample']
+
+     def encode_pixels(self, x): # raw pixels => raw latents
+         self.init(x.device)
+         x = x.to(torch.float32) / 255
+         x = torch.cat([self._run_vae_encoder(batch) for batch in x.split(self.batch_size)])
+         return x
+
+     def encode_latents(self, x): # raw latents => final latents
+         mean, std = x.to(torch.float32).chunk(2, dim=1)
+         x = mean + torch.randn_like(mean) * std
+         x = x * const_like(x, self.scale).reshape(1, -1, 1, 1)
+         x = x + const_like(x, self.bias).reshape(1, -1, 1, 1)
+         return x
+
+     def decode(self, x): # final latents => raw pixels
+         self.init(x.device)
+         x = x.to(torch.float32)
+         x = x - const_like(x, self.bias).reshape(1, -1, 1, 1)
+         x = x / const_like(x, self.scale).reshape(1, -1, 1, 1)
+         x = torch.cat([self._run_vae_decoder(batch) for batch in x.split(self.batch_size)])
+         x = x.clamp(0, 1).mul(255).to(torch.uint8)
+         return x
+
+ #----------------------------------------------------------------------------
+
+ def load_stability_vae(vae_name='stabilityai/sd-vae-ft-mse', device=torch.device('cpu')):
+     import diffusers # pip install diffusers # pyright: ignore [reportMissingImports]
+     vae = diffusers.models.AutoencoderKL.from_pretrained(vae_name)
+     return vae.eval().requires_grad_(False).to(device)
+
+ #----------------------------------------------------------------------------
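`StabilityVAEEncoder` stores raw latents as concatenated (mean, std) channels and whitens them with fixed per-channel statistics, i.e. `final = raw * scale + bias` with `scale = final_std / raw_std` and `bias = final_mean - raw_mean * scale`; `decode()` applies the inverse affine map before running the VAE decoder. A small numeric sketch of that round trip using the class defaults:

    import numpy as np

    raw_mean = np.float32([5.81, 3.25, 0.12, -2.15])
    raw_std = np.float32([4.17, 4.62, 3.71, 3.28])
    final_mean, final_std = np.float32(0), np.float32(0.5)

    scale = final_std / raw_std
    bias = final_mean - raw_mean * scale

    raw = np.float32([5.81, 3.25, 0.12, -2.15])      # a latent sitting exactly at the assumed mean
    final = raw * scale + bias                        # -> all zeros, the desired final mean
    assert np.allclose((final - bias) / scale, raw)   # decode() undoes the whitening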
lib/get_model.py ADDED
@@ -0,0 +1,13 @@
+ import torch
+ from lib.networks_edm2 import Precond
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ def get_model():
+     return Precond(
+         img_resolution=64,  # actually 88x64
+         img_channels=4,
+         label_dim=1024,
+         use_fp16=False,
+         model_channels=128,
+     ).to(device)
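The `# actually 88x64` comment refers to the latent grid: the sampler in `app.py` draws `torch.randn((1, 4, 88, 64))`, and `img_resolution` here mainly determines the per-level resolution labels and which levels get self-attention, since the U-Net itself is fully convolutional. Assuming the usual 8x spatial downsampling of the Stability VAE (not stated in this repo), an 88x64 latent decodes to roughly a 704x512 image:

    latent_h, latent_w = 88, 64
    vae_downscale = 8  # assumed factor for stabilityai/sd-vae-ft-mse
    print(latent_h * vae_downscale, latent_w * vae_downscale)  # 704 512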
lib/networks_edm2.py ADDED
@@ -0,0 +1,360 @@
+ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ #
+ # This work is licensed under a Creative Commons
+ # Attribution-NonCommercial-ShareAlike 4.0 International License.
+ # You should have received a copy of the license along with this
+ # work. If not, see http://creativecommons.org/licenses/by-nc-sa/4.0/
+
+ """Improved diffusion model architecture proposed in the paper
+ "Analyzing and Improving the Training Dynamics of Diffusion Models"."""
+
+ import numpy as np
+ import torch
+
+
+ #----------------------------------------------------------------------------
+ # Cached construction of constant tensors. Avoids CPU=>GPU copy when the
+ # same constant is used multiple times.
+
+ _constant_cache = dict()
+
+ def constant(value, shape=None, dtype=None, device=None, memory_format=None):
+     value = np.asarray(value)
+     if shape is not None:
+         shape = tuple(shape)
+     if dtype is None:
+         dtype = torch.get_default_dtype()
+     if device is None:
+         device = torch.device('cpu')
+     if memory_format is None:
+         memory_format = torch.contiguous_format
+
+     key = (value.shape, value.dtype, value.tobytes(), shape, dtype, device, memory_format)
+     tensor = _constant_cache.get(key, None)
+     if tensor is None:
+         tensor = torch.as_tensor(value.copy(), dtype=dtype, device=device)
+         if shape is not None:
+             tensor, _ = torch.broadcast_tensors(tensor, torch.empty(shape))
+         tensor = tensor.contiguous(memory_format=memory_format)
+         _constant_cache[key] = tensor
+     return tensor
+
+ #----------------------------------------------------------------------------
+ # Variant of constant() that inherits dtype and device from the given
+ # reference tensor by default.
+
+ def const_like(ref, value, shape=None, dtype=None, device=None, memory_format=None):
+     if dtype is None:
+         dtype = ref.dtype
+     if device is None:
+         device = ref.device
+     return constant(value, shape=shape, dtype=dtype, device=device, memory_format=memory_format)
+
+ #----------------------------------------------------------------------------
+ # Normalize given tensor to unit magnitude with respect to the given
+ # dimensions. Default = all dimensions except the first.
+
+ def normalize(x, dim=None, eps=1e-4):
+     if dim is None:
+         dim = list(range(1, x.ndim))
+     norm = torch.linalg.vector_norm(x, dim=dim, keepdim=True, dtype=torch.float32)
+     norm = torch.add(eps, norm, alpha=np.sqrt(norm.numel() / x.numel()))
+     return x / norm.to(x.dtype)
+
+ #----------------------------------------------------------------------------
+ # Upsample or downsample the given tensor with the given filter,
+ # or keep it as is.
+
+ def resample(x, f=[1,1], mode='keep'):
+     if mode == 'keep':
+         return x
+     f = np.float32(f)
+     assert f.ndim == 1 and len(f) % 2 == 0
+     pad = (len(f) - 1) // 2
+     f = f / f.sum()
+     f = np.outer(f, f)[np.newaxis, np.newaxis, :, :]
+     f = const_like(x, f)
+     c = x.shape[1]
+     if mode == 'down':
+         return torch.nn.functional.conv2d(x, f.tile([c, 1, 1, 1]), groups=c, stride=2, padding=(pad,))
+     assert mode == 'up'
+     return torch.nn.functional.conv_transpose2d(x, (f * 4).tile([c, 1, 1, 1]), groups=c, stride=2, padding=(pad,))
+
+ #----------------------------------------------------------------------------
+ # Magnitude-preserving SiLU (Equation 81).
+
+ def mp_silu(x):
+     return torch.nn.functional.silu(x) / 0.596
+
+ #----------------------------------------------------------------------------
+ # Magnitude-preserving sum (Equation 88).
+
+ def mp_sum(a, b, t=0.5):
+     return a.lerp(b, t) / np.sqrt((1 - t) ** 2 + t ** 2)
+
+ #----------------------------------------------------------------------------
+ # Magnitude-preserving concatenation (Equation 103).
+
+ def mp_cat(a, b, dim=1, t=0.5):
+     Na = a.shape[dim]
+     Nb = b.shape[dim]
+     C = np.sqrt((Na + Nb) / ((1 - t) ** 2 + t ** 2))
+     wa = C / np.sqrt(Na) * (1 - t)
+     wb = C / np.sqrt(Nb) * t
+     return torch.cat([wa * a , wb * b], dim=dim)
+
+ #----------------------------------------------------------------------------
+ # Magnitude-preserving Fourier features (Equation 75).
+
+ class MPFourier(torch.nn.Module):
+     def __init__(self, num_channels, bandwidth=1):
+         super().__init__()
+         self.register_buffer('freqs', 2 * np.pi * torch.randn(num_channels) * bandwidth)
+         self.register_buffer('phases', 2 * np.pi * torch.rand(num_channels))
+
+     def forward(self, x):
+         y = x.to(torch.float32)
+         y = y.ger(self.freqs.to(torch.float32))
+         y = y + self.phases.to(torch.float32)
+         y = y.cos() * np.sqrt(2)
+         return y.to(x.dtype)
+
+ #----------------------------------------------------------------------------
+ # Magnitude-preserving convolution or fully-connected layer (Equation 47)
+ # with force weight normalization (Equation 66).
+
+ class MPConv(torch.nn.Module):
+     def __init__(self, in_channels, out_channels, kernel):
+         super().__init__()
+         self.out_channels = out_channels
+         self.weight = torch.nn.Parameter(torch.randn(out_channels, in_channels, *kernel))
+
+     def forward(self, x, gain=1):
+         w = self.weight.to(torch.float32)
+         if self.training:
+             with torch.no_grad():
+                 self.weight.copy_(normalize(w)) # forced weight normalization
+         w = normalize(w) # traditional weight normalization
+         w = w * (gain / np.sqrt(w[0].numel())) # magnitude-preserving scaling
+         w = w.to(x.dtype)
+         if w.ndim == 2:
+             return x @ w.t()
+         assert w.ndim == 4
+         return torch.nn.functional.conv2d(x, w, padding=(w.shape[-1]//2,))
+
+ #----------------------------------------------------------------------------
+ # U-Net encoder/decoder block with optional self-attention (Figure 21).
+
+ class Block(torch.nn.Module):
+     def __init__(self,
+         in_channels, # Number of input channels.
+         out_channels, # Number of output channels.
+         emb_channels, # Number of embedding channels.
+         flavor = 'enc', # Flavor: 'enc' or 'dec'.
+         resample_mode = 'keep', # Resampling: 'keep', 'up', or 'down'.
+         resample_filter = [1,1], # Resampling filter.
+         attention = False, # Include self-attention?
+         channels_per_head = 64, # Number of channels per attention head.
+         dropout = 0, # Dropout probability.
+         res_balance = 0.3, # Balance between main branch (0) and residual branch (1).
+         attn_balance = 0.3, # Balance between main branch (0) and self-attention (1).
+         clip_act = 256, # Clip output activations. None = do not clip.
+     ):
+         super().__init__()
+         self.out_channels = out_channels
+         self.flavor = flavor
+         self.resample_filter = resample_filter
+         self.resample_mode = resample_mode
+         self.num_heads = out_channels // channels_per_head if attention else 0
+         self.dropout = dropout
+         self.res_balance = res_balance
+         self.attn_balance = attn_balance
+         self.clip_act = clip_act
+         self.emb_gain = torch.nn.Parameter(torch.zeros([]))
+         self.conv_res0 = MPConv(out_channels if flavor == 'enc' else in_channels, out_channels, kernel=[3,3])
+         self.emb_linear = MPConv(emb_channels, out_channels, kernel=[])
+         self.conv_res1 = MPConv(out_channels, out_channels, kernel=[3,3])
+         self.conv_skip = MPConv(in_channels, out_channels, kernel=[1,1]) if in_channels != out_channels else None
+         self.attn_qkv = MPConv(out_channels, out_channels * 3, kernel=[1,1]) if self.num_heads != 0 else None
+         self.attn_proj = MPConv(out_channels, out_channels, kernel=[1,1]) if self.num_heads != 0 else None
+
+     def forward(self, x, emb):
+         # Main branch.
+         x = resample(x, f=self.resample_filter, mode=self.resample_mode)
+         if self.flavor == 'enc':
+             if self.conv_skip is not None:
+                 x = self.conv_skip(x)
+             x = normalize(x, dim=1) # pixel norm
+
+         # Residual branch.
+         y = self.conv_res0(mp_silu(x))
+         c = self.emb_linear(emb, gain=self.emb_gain) + 1
+         y = mp_silu(y * c.unsqueeze(2).unsqueeze(3).to(y.dtype))
+         if self.training and self.dropout != 0:
+             y = torch.nn.functional.dropout(y, p=self.dropout)
+         y = self.conv_res1(y)
+
+         # Connect the branches.
+         if self.flavor == 'dec' and self.conv_skip is not None:
+             x = self.conv_skip(x)
+         x = mp_sum(x, y, t=self.res_balance)
+
+         # Self-attention.
+         # Note: torch.nn.functional.scaled_dot_product_attention() could be used here,
+         # but we haven't done sufficient testing to verify that it produces identical results.
+         if self.num_heads != 0:
+             y = self.attn_qkv(x)
+             y = y.reshape(y.shape[0], self.num_heads, -1, 3, y.shape[2] * y.shape[3])
+             q, k, v = normalize(y, dim=2).unbind(3) # pixel norm & split
+             w = torch.einsum('nhcq,nhck->nhqk', q, k / np.sqrt(q.shape[2])).softmax(dim=3)
+             y = torch.einsum('nhqk,nhck->nhcq', w, v)
+             y = self.attn_proj(y.reshape(*x.shape))
+             x = mp_sum(x, y, t=self.attn_balance)
+
+         # Clip activations.
+         if self.clip_act is not None:
+             x = x.clip_(-self.clip_act, self.clip_act)
+         return x
+
+
+ class TagEncoder(torch.nn.Module):
+     def __init__(self, din, dout):
+         super().__init__()
+         self.din = din
+         self.linear1 = MPConv(din, dout, [])
+         self.linear2 = MPConv(dout, dout, [])
+         self.out_gain = torch.nn.Parameter(torch.tensor(0.0))
+
+     def forward(self, x):
+         x = mp_silu(self.linear1(x))
+         return self.din**-0.5 * self.linear2(x, gain=self.out_gain)
+
+
+ #----------------------------------------------------------------------------
+ # EDM2 U-Net model (Figure 21).
+
+ class UNet(torch.nn.Module):
+     def __init__(self,
+         img_resolution, # Image resolution.
+         img_channels, # Image channels.
+         label_dim, # Class label dimensionality. 0 = unconditional.
+         model_channels = 192, # Base multiplier for the number of channels.
+         channel_mult = [1,2,3,4], # Per-resolution multipliers for the number of channels.
+         channel_mult_noise = None, # Multiplier for noise embedding dimensionality. None = select based on channel_mult.
+         channel_mult_emb = None, # Multiplier for final embedding dimensionality. None = select based on channel_mult.
+         num_blocks = 3, # Number of residual blocks per resolution.
+         attn_resolutions = [16,8], # List of resolutions with self-attention.
+         label_balance = 0.5, # Balance between noise embedding (0) and class embedding (1).
+         concat_balance = 0.5, # Balance between skip connections (0) and main path (1).
+         **block_kwargs, # Arguments for Block.
+     ):
+         super().__init__()
+         cblock = [model_channels * x for x in channel_mult]
+         cnoise = model_channels * channel_mult_noise if channel_mult_noise is not None else cblock[0]
+         cemb = model_channels * channel_mult_emb if channel_mult_emb is not None else max(cblock)
+         self.label_balance = label_balance
+         self.concat_balance = concat_balance
+         self.out_gain = torch.nn.Parameter(torch.zeros([]))
+
+         # Embedding.
+         self.emb_fourier = MPFourier(cnoise)
+         self.emb_noise = MPConv(cnoise, cemb, kernel=[])
+         self.emb_label = TagEncoder(label_dim, cemb)
+         if type(num_blocks) is int:
+             num_blocks = [num_blocks for _ in cblock]
+         # Encoder.
+         self.enc = torch.nn.ModuleDict()
+         cout = img_channels + 1
+         for level, channels in enumerate(cblock):
+             res = img_resolution >> level
+             if level == 0:
+                 cin = cout
+                 cout = channels
+                 self.enc[f'{res}x{res}_conv'] = MPConv(cin, cout, kernel=[3,3])
+             else:
+                 self.enc[f'{res}x{res}_down'] = Block(cout, cout, cemb, flavor='enc', resample_mode='down', **block_kwargs)
+             for idx in range(num_blocks[level]):
+                 cin = cout
+                 cout = channels
+                 self.enc[f'{res}x{res}_block{idx}'] = Block(cin, cout, cemb, flavor='enc', attention=(res in attn_resolutions), **block_kwargs)
+
+         # Decoder.
+         self.dec = torch.nn.ModuleDict()
+         skips = [block.out_channels for block in self.enc.values()]
+         for level, channels in reversed(list(enumerate(cblock))):
+             res = img_resolution >> level
+             if level == len(cblock) - 1:
+                 self.dec[f'{res}x{res}_in0'] = Block(cout, cout, cemb, flavor='dec', attention=True, **block_kwargs)
+                 self.dec[f'{res}x{res}_in1'] = Block(cout, cout, cemb, flavor='dec', **block_kwargs)
+             else:
+                 self.dec[f'{res}x{res}_up'] = Block(cout, cout, cemb, flavor='dec', resample_mode='up', **block_kwargs)
+             for idx in range(num_blocks[level] + 1):
+                 cin = cout + skips.pop()
+                 cout = channels
+                 self.dec[f'{res}x{res}_block{idx}'] = Block(cin, cout, cemb, flavor='dec', attention=(res in attn_resolutions), **block_kwargs)
+         self.out_conv = MPConv(cout, img_channels, kernel=[3,3])
+
+     def forward(self, x, noise_labels, class_labels):
+         # Embedding.
+         emb = self.emb_noise(self.emb_fourier(noise_labels))
+         if self.emb_label is not None:
+             emb = mp_sum(emb, self.emb_label(class_labels * np.sqrt(class_labels.shape[1])), t=self.label_balance)
+         emb = mp_silu(emb)
+
+         # Encoder.
+         x = torch.cat([x, torch.ones_like(x[:, :1])], dim=1)
+         skips = []
+         for name, block in self.enc.items():
+             x = block(x) if 'conv' in name else block(x, emb)
+             skips.append(x)
+
+         # Decoder.
+         for name, block in self.dec.items():
+             if 'block' in name:
+                 x = mp_cat(x, skips.pop(), t=self.concat_balance)
+             x = block(x, emb)
+         x = self.out_conv(x, gain=self.out_gain)
+         return x
+
+ #----------------------------------------------------------------------------
+ # Preconditioning and uncertainty estimation.
+
+ class Precond(torch.nn.Module):
+     def __init__(self,
+         img_resolution, # Image resolution.
+         img_channels, # Image channels.
+         label_dim, # Class label dimensionality. 0 = unconditional.
+         use_fp16 = True, # Run the model at FP16 precision?
+         sigma_data = 0.5, # Expected standard deviation of the training data.
+         **unet_kwargs, # Keyword arguments for UNet.
+     ):
+         super().__init__()
+         self.img_resolution = img_resolution
+         self.img_channels = img_channels
+         self.label_dim = label_dim
+         self.use_fp16 = use_fp16
+         self.sigma_data = sigma_data
+         self.unet = UNet(img_resolution=img_resolution, img_channels=img_channels, label_dim=label_dim, **unet_kwargs)
+         self.uncond_emb = torch.nn.Parameter(torch.randn((1024,)))
+
+
+     def forward(self, x, sigma, class_labels=None, force_fp32=False, return_logvar=False, **unet_kwargs):
+         x = x.to(torch.float32)
+         sigma = sigma.to(torch.float32).reshape(-1, 1, 1, 1)
+         class_labels = None if self.label_dim == 0 else torch.zeros([1, self.label_dim], device=x.device) if class_labels is None else class_labels.to(torch.float32).reshape(-1, self.label_dim)
+         dtype = torch.float16 if (self.use_fp16 and not force_fp32 and x.device.type == 'cuda') else torch.float32
+
+         # Preconditioning weights.
+         c_skip = self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2)
+         c_out = sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2).sqrt()
+         c_in = 1 / (self.sigma_data ** 2 + sigma ** 2).sqrt()
+         c_noise = sigma.flatten().log() / 4
+
+         # Run the model.
+         x_in = (c_in * x).to(dtype)
+         F_x = self.unet(x_in, c_noise, class_labels, **unet_kwargs)
+         D_x = c_skip * x + c_out * F_x.to(torch.float32)
+
+         return D_x
+
+ #----------------------------------------------------------------------------
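`Precond.forward` is the standard EDM preconditioning around the raw U-Net output F: D(x; sigma) = c_skip * x + c_out * F(c_in * x, c_noise), with c_skip = sigma_data^2 / (sigma^2 + sigma_data^2), c_out = sigma * sigma_data / sqrt(sigma^2 + sigma_data^2), c_in = 1 / sqrt(sigma^2 + sigma_data^2), and c_noise = ln(sigma) / 4. A quick numeric look at the limits (illustrative sigma values only):

    import numpy as np

    sigma_data = 0.5
    for sigma in (0.002, 0.5, 80.0):
        c_skip = sigma_data**2 / (sigma**2 + sigma_data**2)
        c_out = sigma * sigma_data / np.sqrt(sigma**2 + sigma_data**2)
        c_in = 1 / np.sqrt(sigma_data**2 + sigma**2)
        # Low noise: c_skip ~ 1, so the input passes through almost unchanged.
        # High noise: c_skip ~ 0 and c_in rescales the input toward unit variance.
        print(f"sigma={sigma:g} c_skip={c_skip:.4f} c_out={c_out:.4f} c_in={c_in:.4f}")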
lib/sampling.py ADDED
@@ -0,0 +1,63 @@
+ import torch
+ import numpy as np
+
+
+ def edm_sampler(
+     net,
+     noise,
+     labels=None,
+     gnet=None,
+     num_steps=32,
+     sigma_min=0.002,
+     sigma_max=80,
+     rho=7,
+     guidance=1,
+     S_churn=0,
+     S_min=0,
+     S_max=float("inf"),
+     S_noise=1,
+     dtype=torch.float32,
+     randn_like=torch.randn_like,
+ ):
+     # Guided denoiser.
+     def denoise(x, t):
+         Dx = net(x, t, labels).to(dtype)
+         if guidance == 1:
+             return Dx
+         ref_Dx = gnet(x, t).to(dtype)
+         return ref_Dx.lerp(Dx, guidance)
+
+     # Time step discretization.
+     step_indices = torch.arange(num_steps, dtype=dtype, device=noise.device)
+     t_steps = (
+         sigma_max ** (1 / rho)
+         + step_indices
+         / (num_steps - 1)
+         * (sigma_min ** (1 / rho) - sigma_max ** (1 / rho))
+     ) ** rho
+     t_steps = torch.cat([t_steps, torch.zeros_like(t_steps[:1])]) # t_N = 0
+
+     # Main sampling loop.
+     x_next = noise.to(dtype) * t_steps[0]
+     for i, (t_cur, t_next) in enumerate(zip(t_steps[:-1], t_steps[1:])): # 0, ..., N-1
+         x_cur = x_next
+
+         # Increase noise temporarily.
+         if S_churn > 0 and S_min <= t_cur <= S_max:
+             gamma = min(S_churn / num_steps, np.sqrt(2) - 1)
+             t_hat = t_cur + gamma * t_cur
+             x_hat = x_cur + (t_hat**2 - t_cur**2).sqrt() * S_noise * randn_like(x_cur)
+         else:
+             t_hat = t_cur
+             x_hat = x_cur
+
+         # Euler step.
+         d_cur = (x_hat - denoise(x_hat, t_hat)) / t_hat
+         x_next = x_hat + (t_next - t_hat) * d_cur
+
+         # Apply 2nd order correction.
+         if i < num_steps - 1:
+             d_prime = (x_next - denoise(x_next, t_next)) / t_next
+             x_next = x_hat + (t_next - t_hat) * (0.5 * d_cur + 0.5 * d_prime)
+
+     return x_next
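The time-step discretization above is the Karras rho-schedule: sigma_i = (sigma_max^(1/rho) + i/(N-1) * (sigma_min^(1/rho) - sigma_max^(1/rho)))^rho for i = 0..N-1, with a final sigma_N = 0 appended so the last Euler step lands on the clean sample. A standalone sketch of the same schedule using the function's default parameters:

    import torch

    num_steps, sigma_min, sigma_max, rho = 32, 0.002, 80.0, 7
    i = torch.arange(num_steps, dtype=torch.float64)
    t_steps = (
        sigma_max ** (1 / rho)
        + i / (num_steps - 1) * (sigma_min ** (1 / rho) - sigma_max ** (1 / rho))
    ) ** rho
    print(t_steps[0].item(), t_steps[-1].item())  # starts at 80.0, ends near 0.002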
model_weights/1girl-edm-xs-test-1.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:745cdbc08a10dea8a2bffa51559f741f7010b81faa194cfcc6ea62b98ef329bf
+ size 499977592
model_weights/condgen_vae_decoder.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3fc2e77e8584fdb207b768398fdc96693772cfbb378f4b7c6adc58fa08116cef
+ size 5776840