Crystalcareai committed
Commit 9172f24 · verified · 1 Parent(s): e7aeafc

Update modeling_quiet.py

Files changed (1)
  1. modeling_quiet.py +55 -40
modeling_quiet.py CHANGED
@@ -147,6 +147,8 @@ def _get_unpad_data(attention_mask):
         cu_seqlens,
         max_seqlen_in_batch,
     )
+
+
 # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Quiet
 class QuietRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
@@ -167,18 +169,18 @@ class QuietRMSNorm(nn.Module):

 # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Quiet
 class QuietRotaryEmbedding(nn.Module):
-    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, max_thought_tokens=2):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()

         self.dim = dim
-        self.max_position_embeddings = max_position_embeddings + max_thought_tokens
+        self.max_position_embeddings = max_position_embeddings
         self.base = base
         inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)

         # Build here to make `torch.jit.trace` work.
         self._set_cos_sin_cache(
-            seq_len=max_position_embeddings + max_thought_tokens, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
         )

     def _set_cos_sin_cache(self, seq_len, device, dtype):
@@ -186,6 +188,7 @@ class QuietRotaryEmbedding(nn.Module):
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)

         freqs = torch.outer(t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
         self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
@@ -231,18 +234,13 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     Returns:
         `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
     """
-    print(f"cos shape: {cos.shape}")
-    print(f"position_ids shape: {position_ids.shape}")
-    print(f"position_ids values: {position_ids}")
-    print(f"unsqueeze_dim: {unsqueeze_dim}")
-    assert torch.all(position_ids >= 0), "position_ids must be non-negative"
-    assert torch.all(position_ids < cos.shape[0]), f"position_ids must be less than the size of cos ({cos.shape[0]})"
     cos = cos[position_ids].unsqueeze(unsqueeze_dim)
     sin = sin[position_ids].unsqueeze(unsqueeze_dim)
     q_embed = (q * cos) + (rotate_half(q) * sin)
     k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed

+
 class QuietMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -283,8 +281,8 @@ class QuietAttention(nn.Module):
         self.layer_idx = layer_idx
         if layer_idx is None:
             logger.warning_once(
-                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
-                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                 "when creating this class."
             )

@@ -312,7 +310,6 @@ class QuietAttention(nn.Module):
             self.head_dim,
             max_position_embeddings=self.max_position_embeddings,
             base=self.rope_theta,
-            max_thought_tokens=2,
         )

     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
@@ -370,36 +367,54 @@ class QuietAttention(nn.Module):
                 f" {attn_weights.size()}"
             )

+        print("Before applying attention mask:")
+        print("attention_mask shape:", attention_mask.shape if attention_mask is not None else None)
+        print("attn_weights shape:", attn_weights.shape)
+
         if attention_mask is not None:
+            print("Applying attention mask")
             if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                 raise ValueError(
                     f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                 )
-
             attn_weights = attn_weights + attention_mask
-
+
+        print("After applying attention mask:")
+        print("attn_weights shape:", attn_weights.shape)
+
         # upcast attention to fp32
         attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+
+        print("After softmax:")
+        print("attn_weights shape:", attn_weights.shape)
+
         attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+
+        print("After dropout:")
+        print("attn_weights shape:", attn_weights.shape)
+
         attn_output = torch.matmul(attn_weights, value_states)
-
+
+        print("After matmul with value states:")
+        print("attn_output shape:", attn_output.shape)
+
         if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
             raise ValueError(
                 f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                 f" {attn_output.size()}"
             )
-
+
         attn_output = attn_output.transpose(1, 2).contiguous()
         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
-
         attn_output = self.o_proj(attn_output)
-
+
+        print("Final attn_output shape:", attn_output.shape)
+
         if not output_attentions:
             attn_weights = None
-
+
         return attn_output, attn_weights, past_key_value

-
 class QuietFlashAttention2(QuietAttention):
     """
     Quiet flash attention module. This module inherits from `QuietAttention` as the weights of the module stays
@@ -576,7 +591,7 @@ class QuietFlashAttention2(QuietAttention):
             attention_mask (`torch.Tensor`):
                 The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                 position of padding tokens and 1 for the position of non-padding tokens.
-            dropout (`int`, *optional*):
+            dropout (`float`):
                 Attention dropout
             softmax_scale (`float`, *optional*):
                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
@@ -694,7 +709,8 @@ class QuietFlashAttention2(QuietAttention):
         )


-# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Quiet
+# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Quiet
+# TODO @Arthur no longer copied from LLama after static cache
 class QuietSdpaAttention(QuietAttention):
     """
     Quiet attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
@@ -768,14 +784,14 @@ class QuietSdpaAttention(QuietAttention):
             query_states,
             key_states,
             value_states,
-            attn_mask=attention_mask.to(query_states.device) if attention_mask is not None else None,
+            attn_mask=attention_mask,
             dropout_p=self.attention_dropout if self.training else 0.0,
             # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
             is_causal=self.is_causal and attention_mask is None and q_len > 1,
         )

         attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = attn_output.view(bsz, q_len, self.hidden_size)

         attn_output = self.o_proj(attn_output)

@@ -1095,7 +1111,7 @@ class QuietModel(QuietPreTrainedModel):
             past_key_values_length,
             sliding_window=self.config.sliding_window,
         )
-        print(f"Prepared 4D causal attention mask. Shape: {attention_mask.shape}")
+
         hidden_states = inputs_embeds

         # decoder layers
@@ -1318,11 +1334,16 @@ class QuietForCausalLM(QuietPreTrainedModel):
         original_input_ids = input_ids.clone()
         original_attention_mask = attention_mask.clone() if attention_mask is not None else None

+        # Append the start thought token to the input sequence
         # Append the start thought token to the input sequence
         start_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|startthought|>")
         input_ids = torch.cat([input_ids, torch.tensor([[start_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
         seq_len += 1

+        # Update the position_ids tensor
+        position_ids = position_ids[:, :-1]  # Remove the last position
+        position_ids = torch.cat([position_ids, torch.full((batch_size, 1), seq_len - 1, dtype=torch.long, device=position_ids.device)], dim=-1)
+
         # Update the attention mask
         if attention_mask is not None:
             attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
@@ -1344,7 +1365,6 @@ class QuietForCausalLM(QuietPreTrainedModel):
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
         )
-        print(f"Passing attention mask to the model. Shape: {attention_mask.shape}")
         new_key_values = outputs.past_key_values

         hidden_states = outputs[0]
@@ -1365,10 +1385,15 @@ class QuietForCausalLM(QuietPreTrainedModel):
             attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)

         # Append the end thought token to the input sequence
+        # Append the end thought token to the input sequence
         end_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|endthought|>")
         input_ids = torch.cat([input_ids, torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
         seq_len += 1

+        # Update the position_ids tensor
+        position_ids = position_ids[:, :-1]  # Remove the last position
+        position_ids = torch.cat([position_ids, torch.full((batch_size, 1), seq_len - 1, dtype=torch.long, device=position_ids.device)], dim=-1)
+
         # Update the attention mask
         if attention_mask is not None:
             attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
@@ -1603,6 +1628,8 @@ class QuietForCausalLM(QuietPreTrainedModel):
         base_embeddings = self.model.embed_tokens.weight
         if self.train_only_thinking_embedding:
             base_embeddings = base_embeddings.detach()
+        if position_ids is None:
+            position_ids = torch.arange(seq_len, dtype=torch.long, device=input_ids.device).unsqueeze(0).expand(batch_size, -1)
         # # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         fwd_iters = 1 if self.original_mode else self.n_ahead + self.n_ahead_talk - 1
         for ahead_idx in range(fwd_iters):
@@ -1882,9 +1909,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
                     if len(attention_mask.shape) == 2:
                         breakpoint()
                     else:
-                        original_attention = attention_mask[..., :attention_mask.shape[-2], :attention_mask.shape[-1]]
-                        print(f"Original attention shape: {original_attention.shape}")
-
+                        original_attention = attention_mask[..., :attention_mask.shape[-2]]
                         if self.use_upper_triangular:
                             new_attention = original_attention
                         else:
@@ -1900,20 +1925,10 @@ class QuietForCausalLM(QuietPreTrainedModel):
                            ).to(attention_mask.dtype)

                            new_attention = new_attention.view(1, 1, seq_len, seq_len).repeat(input_ids.shape[0], 1, 1, 1)
-                            print(f"New attention shape: {new_attention.shape}")
-
                            new_attention = new_attention * original_attention
                            new_attention[new_attention == 0] = attention_mask.min()
                            new_attention[new_attention == 1] = attention_mask.max()
-
-                            print(f"Original attention shape before concatenation: {original_attention.shape}")
-                            print(f"New attention shape before concatenation: {new_attention.shape}")
-
-                            if self.use_upper_triangular:
-                                attention_mask = original_attention
-                            else:
-                                attention_mask = new_attention
-                            print(f"Attention mask shape after concatenation: {attention_mask.shape}")
+                            attention_mask = torch.cat([attention_mask, new_attention], dim=-1)
                past_key_values = outputs.past_key_values
                position_ids = position_ids + 1

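Note: the `position_ids` lines added above follow the same append-one-token bookkeeping already used for `input_ids` and the padding mask around the `<|startthought|>` / `<|endthought|>` tokens. Below is a minimal, self-contained sketch of that pattern with toy shapes and a placeholder token id (not the model's real values); the hunk above additionally drops the last existing position before appending.

import torch

# Illustrative shapes and token id; the real id comes from
# self.tokenizer.convert_tokens_to_ids("<|startthought|>").
batch_size, seq_len = 2, 5
thought_token_id = 32001  # placeholder value

input_ids = torch.randint(0, 32000, (batch_size, seq_len))
attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
position_ids = torch.arange(seq_len, dtype=torch.long).unsqueeze(0).expand(batch_size, -1)

# Append one special token to every sequence in the batch.
input_ids = torch.cat([input_ids, torch.full((batch_size, 1), thought_token_id, dtype=input_ids.dtype)], dim=-1)
seq_len += 1

# Extend the padding mask with a column of ones for the appended token.
attention_mask = torch.cat([attention_mask, torch.ones(batch_size, 1, dtype=attention_mask.dtype)], dim=-1)

# Give the appended token the next position index (seq_len - 1 after the increment).
position_ids = torch.cat([position_ids, torch.full((batch_size, 1), seq_len - 1, dtype=torch.long)], dim=-1)

assert input_ids.shape == attention_mask.shape == position_ids.shape == (batch_size, seq_len)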
 
 
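Note: the shape prints added in QuietAttention.forward trace the standard eager-attention flow (scores plus an additive mask, fp32 softmax, dropout, value matmul). Here is a minimal sketch of that flow under toy sizes; it uses a plain causal mask rather than the model's prepared sliding-window mask, and is an illustration only, not the module's code.

import torch
import torch.nn as nn

# Toy sizes; the modeling code works with (bsz, num_heads, q_len, kv_seq_len).
bsz, num_heads, q_len, kv_seq_len, head_dim = 1, 2, 4, 4, 8
query = torch.randn(bsz, num_heads, q_len, head_dim)
key = torch.randn(bsz, num_heads, kv_seq_len, head_dim)
value = torch.randn(bsz, num_heads, kv_seq_len, head_dim)

attn_weights = torch.matmul(query, key.transpose(2, 3)) / (head_dim ** 0.5)

# Additive mask: 0 where attending is allowed, a large negative number where it is not.
causal = torch.triu(torch.full((q_len, kv_seq_len), torch.finfo(attn_weights.dtype).min), diagonal=1)
attention_mask = causal.view(1, 1, q_len, kv_seq_len)

attn_weights = attn_weights + attention_mask
# Upcast to fp32 for the softmax, then cast back, mirroring the modeling code.
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
attn_output = torch.matmul(attn_weights, value)
print(attn_output.shape)  # torch.Size([1, 2, 4, 8])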