Update modeling_quiet.py
modeling_quiet.py (+4 -4)
@@ -167,18 +167,18 @@ class QuietRMSNorm(nn.Module):
 
 # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Quiet
 class QuietRotaryEmbedding(nn.Module):
-    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, max_thought_tokens=2):
         super().__init__()
 
         self.dim = dim
-        self.max_position_embeddings = max_position_embeddings
+        self.max_position_embeddings = max_position_embeddings + max_thought_tokens
         self.base = base
         inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
 
         # Build here to make `torch.jit.trace` work.
         self._set_cos_sin_cache(
-            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+            seq_len=max_position_embeddings + max_thought_tokens, device=self.inv_freq.device, dtype=torch.get_default_dtype()
         )
 
     def _set_cos_sin_cache(self, seq_len, device, dtype):
@@ -186,7 +186,6 @@ class QuietRotaryEmbedding(nn.Module):
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
 
         freqs = torch.outer(t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
@@ -313,6 +312,7 @@ class QuietAttention(nn.Module):
             self.head_dim,
             max_position_embeddings=self.max_position_embeddings,
             base=self.rope_theta,
+            max_thought_tokens=2,
         )
 
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
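For reference, here is a minimal, self-contained sketch of the patched class and its effect: the cos/sin cache is now sized to max_position_embeddings + max_thought_tokens, so rotary positions consumed by appended thought tokens still fall inside the precomputed cache. The _set_cos_sin_cache body below is reconstructed from the unchanged context lines and the upstream Llama implementation this file is copied from (the self.max_seq_len_cached = seq_len line is an assumption, as it is not shown in this diff).

import torch
import torch.nn as nn


class QuietRotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, max_thought_tokens=2):
        super().__init__()
        self.dim = dim
        # Reserve extra positions so thought tokens appended during generation
        # stay within the precomputed cos/sin cache.
        self.max_position_embeddings = max_position_embeddings + max_thought_tokens
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings + max_thought_tokens,
            device=self.inv_freq.device,
            dtype=torch.get_default_dtype(),
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len  # assumed, following the standard Llama implementation
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
        freqs = torch.outer(t, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)


# Quick check: with the default max_thought_tokens=2, the cache covers 2050 positions.
rope = QuietRotaryEmbedding(dim=128, max_position_embeddings=2048)
print(rope.cos_cached.shape)  # torch.Size([2050, 128])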