PommesPeter committed (verified)
Commit 15222c4 · Parent(s): 8a566c0

Update models/model.py

Files changed (1): models/model.py (+16 -19)
models/model.py CHANGED

@@ -14,8 +14,8 @@ import torch.nn.functional as F
 logger = logging.getLogger(__name__)
 
 
-def modulate(x, shift, scale):
-    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+def modulate(x, scale):
+    return x * (1 + scale.unsqueeze(1))
 
 
 #############################################################################
@@ -533,16 +533,17 @@ class TransformerBlock(nn.Module):
             ffn_dim_multiplier=ffn_dim_multiplier,
         )
         self.layer_id = layer_id
-        self.attention_norm = RMSNorm(dim, eps=norm_eps)
         self.attention_norm1 = RMSNorm(dim, eps=norm_eps)
-        self.ffn_norm = RMSNorm(dim, eps=norm_eps)
+        self.attention_norm2 = RMSNorm(dim, eps=norm_eps)
+
         self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
+        self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)
 
         self.adaLN_modulation = nn.Sequential(
             nn.SiLU(),
             nn.Linear(
                 min(dim, 1024),
-                6 * dim,
+                4 * dim,
                 bias=True,
             ),
         )
@@ -571,14 +572,11 @@ class TransformerBlock(nn.Module):
 
         """
         if adaln_input is not None:
-            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
-                self.adaLN_modulation(adaln_input).chunk(6, dim=1)
-            )
+            scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).chunk(4, dim=1)
 
-            x = x + self.attention_norm1(
-                gate_msa.unsqueeze(1)
-                * self.attention(
-                    modulate(self.attention_norm(x), shift_msa, scale_msa),
+            x = x + gate_msa.unsqueeze(1).tanh() * self.attention_norm2(
+                self.attention(
+                    modulate(self.attention_norm1(x), scale_msa),
                     x_mask,
                     freqs_cis,
                     self.attention_y_norm(y),
@@ -586,10 +584,9 @@ class TransformerBlock(nn.Module):
                 )
             )
             d = x.shape[-1]
-            x = x + self.ffn_norm1(
-                gate_mlp.unsqueeze(1)
-                * self.feed_forward(
-                    modulate(self.ffn_norm(x), shift_mlp, scale_mlp).view(-1, d),
+            x = x + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(
+                self.feed_forward(
+                    modulate(self.ffn_norm1(x), scale_mlp).view(-1, d),
                 ).view(*x.shape)
             )
 
@@ -633,14 +630,14 @@ class ParallelFinalLayer(nn.Module):
             nn.SiLU(),
             nn.Linear(
                 min(hidden_size, 1024),
-                2 * hidden_size,
+                hidden_size,
                 bias=True,
             ),
         )
 
     def forward(self, x, c):
-        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
-        x = modulate(self.norm_final(x), shift, scale)
+        scale = self.adaLN_modulation(c)
+        x = modulate(self.norm_final(x), scale)
         x = self.linear(x)
         return x
 
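For reference, the block-level change replaces DiT-style shift/scale/gate modulation with scale-only modulation, tanh-clamped gates, and a second ("sandwich") RMSNorm applied to each sub-layer's output. Below is a minimal, self-contained sketch of that pattern; `SimpleRMSNorm` and the `nn.Linear` stand-ins for `attention` / `feed_forward` are illustrative placeholders, not the modules defined in models/model.py.

```python
import torch
import torch.nn as nn


class SimpleRMSNorm(nn.Module):
    """Placeholder RMSNorm (no learnable weight), standing in for the repo's RMSNorm."""

    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)


def modulate(x, scale):
    # Scale-only modulation, matching the updated helper: no shift term.
    return x * (1 + scale.unsqueeze(1))


class GatedSandwichBlock(nn.Module):
    """Illustrative block: x + tanh(gate) * norm2(sublayer(modulate(norm1(x), scale)))."""

    def __init__(self, dim: int, cond_dim: int):
        super().__init__()
        self.attention = nn.Linear(dim, dim)     # placeholder for the real attention sub-layer
        self.feed_forward = nn.Linear(dim, dim)  # placeholder for the real feed-forward sub-layer
        self.attention_norm1 = SimpleRMSNorm(dim)
        self.attention_norm2 = SimpleRMSNorm(dim)
        self.ffn_norm1 = SimpleRMSNorm(dim)
        self.ffn_norm2 = SimpleRMSNorm(dim)
        # Four chunks (scale_msa, gate_msa, scale_mlp, gate_mlp) => 4 * dim outputs.
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(cond_dim, 4 * dim, bias=True),
        )

    def forward(self, x, adaln_input):
        scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).chunk(4, dim=1)
        # Pre-norm with scale-only modulation going in, RMSNorm plus a tanh-bounded gate coming out.
        x = x + gate_msa.unsqueeze(1).tanh() * self.attention_norm2(
            self.attention(modulate(self.attention_norm1(x), scale_msa))
        )
        x = x + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(
            self.feed_forward(modulate(self.ffn_norm1(x), scale_mlp))
        )
        return x


block = GatedSandwichBlock(dim=64, cond_dim=64)
out = block(torch.randn(2, 16, 64), torch.randn(2, 64))  # (batch, seq_len, dim), (batch, cond_dim)
print(out.shape)  # torch.Size([2, 16, 64])
```

The tanh keeps each gate in (-1, 1), so a residual branch can be attenuated but never amplified by the gate itself.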
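The final-layer change follows the same scheme: the conditioning MLP now emits a single scale (hidden_size outputs instead of 2 * hidden_size) and modulate is called without a shift. A short sketch under the same caveats, with nn.LayerNorm standing in for the layer's norm_final:

```python
import torch
import torch.nn as nn


def modulate(x, scale):
    # Same scale-only helper as in the sketch above.
    return x * (1 + scale.unsqueeze(1))


class ScaleOnlyFinalLayer(nn.Module):
    """Illustrative final layer: scale-only adaLN followed by the output projection."""

    def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False)
        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(min(hidden_size, 1024), hidden_size, bias=True),
        )

    def forward(self, x, c):
        scale = self.adaLN_modulation(c)          # single scale vector, no shift and no chunk
        x = modulate(self.norm_final(x), scale)
        x = self.linear(x)
        return x


layer = ScaleOnlyFinalLayer(hidden_size=64, patch_size=2, out_channels=4)
print(layer(torch.randn(2, 16, 64), torch.randn(2, 64)).shape)  # torch.Size([2, 16, 16])
```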