Crystalcareai committed
Commit f5e1b24 · verified · 1 Parent(s): 5454cb2

Update modeling_quiet.py

Files changed (1)
  1. modeling_quiet.py +22 -53
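This commit replaces the hand-rolled resizing of an already-4D attention mask with the standard Transformers mask helpers. As background, here is a minimal, self-contained sketch of what `_prepare_4d_causal_attention_mask` does with a plain 2D padding mask; it is not part of the commit, it assumes a transformers release in the 4.36-4.38 range where this private helper used by modeling_quiet.py exists, and the sliding-window value below is purely illustrative.

import torch
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask

batch_size, seq_length, hidden_size = 2, 5, 8
past_key_values_length = 0
# 2D mask: 1 = attend, 0 = padding (second sequence is left-padded by two tokens)
attention_mask = torch.tensor([[1, 1, 1, 1, 1],
                               [0, 0, 1, 1, 1]])
inputs_embeds = torch.zeros(batch_size, seq_length, hidden_size)

causal_mask = _prepare_4d_causal_attention_mask(
    attention_mask,
    (batch_size, seq_length),
    inputs_embeds,
    past_key_values_length,
    sliding_window=4096,  # illustrative value; the model reads config.sliding_window
)
print(causal_mask.shape)  # torch.Size([2, 1, 5, 5]): (batch, 1, query_len, key_len)
print(causal_mask[1, 0])  # large negative values on padded/future positions, 0 elsewhere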
modeling_quiet.py CHANGED

@@ -1071,58 +1071,29 @@ class QuietModel(QuietPreTrainedModel):
                     " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
                 )
 
-        if self._attn_implementation == "flash_attention_2":
-            # 2d mask is passed through the layers
-            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
-        elif self._attn_implementation == "sdpa" and not output_attentions:
-            if attention_mask.dim() == 2:
-                # output_attentions=True can not be supported when using SDPA, and we fall back on
-                # the manual implementation that requires a 4D causal mask in all cases.
-                attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-                    attention_mask,
-                    (batch_size, seq_length),
-                    inputs_embeds,
-                    past_key_values_length,
-                )
-            else:
-                # Resize the attention mask if necessary
-                if attention_mask.shape[-1] < seq_length:
-                    # Pad the attention mask with ones to match the sequence length
-                    padding = torch.ones(
-                        (attention_mask.shape[0], attention_mask.shape[1], attention_mask.shape[2], seq_length - attention_mask.shape[-1]),
-                        dtype=attention_mask.dtype,
-                        device=attention_mask.device
-                    )
-                    attention_mask = torch.cat([attention_mask, padding], dim=-1)
-                elif attention_mask.shape[-1] > seq_length:
-                    # Truncate the attention mask to match the sequence length
-                    attention_mask = attention_mask[:, :, :, :seq_length]
-        else:
-            if attention_mask is None or attention_mask.dim() == 2:
-                # 4d mask is passed through the layers
-                attention_mask = _prepare_4d_causal_attention_mask(
-                    attention_mask,
-                    (batch_size, seq_length),
-                    inputs_embeds,
-                    past_key_values_length,
-                    sliding_window=self.config.sliding_window,
-                )
-            else:
-                # Resize the attention mask if necessary
-                if attention_mask.shape[-1] < seq_length:
-                    # Pad the attention mask with ones to match the sequence length
-                    padding = torch.ones(
-                        (attention_mask.shape[0], attention_mask.shape[1], attention_mask.shape[2], seq_length - attention_mask.shape[-1]),
-                        dtype=attention_mask.dtype,
-                        device=attention_mask.device
-                    )
-                    attention_mask = torch.cat([attention_mask, padding], dim=-1)
-                elif attention_mask.shape[-1] > seq_length:
-                    # Truncate the attention mask to match the sequence length
-                    attention_mask = attention_mask[:, :, :, :seq_length]
+        if self._attn_implementation == "flash_attention_2":
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self._attn_implementation == "sdpa" and not output_attentions and attention_mask.dim() == 2 and False:
+            # output_attentions=True can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+            )
+        elif attention_mask is None or attention_mask.dim() == 2:
+            # 4d mask is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+                sliding_window=self.config.sliding_window,
+            )
 
-        # Assign the value to hidden_states after the attention mask preparation
-        hidden_states = inputs_embeds
+        hidden_states = inputs_embeds
 
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
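Most of the lines deleted in the hunk above were two copies of the same manual resize of an already-4D mask, applied whenever its key dimension drifted away from `seq_length`. Condensed into one place, the removed logic amounted to the following sketch; the helper name is hypothetical and does not exist in the repository.

import torch

def _resize_4d_mask(attention_mask: torch.Tensor, seq_length: int) -> torch.Tensor:
    # Condensed form of the deleted pad/truncate branches, for reference only.
    key_len = attention_mask.shape[-1]
    if key_len < seq_length:
        # Pad the key dimension with ones, as the removed code did.
        padding = torch.ones(
            (*attention_mask.shape[:3], seq_length - key_len),
            dtype=attention_mask.dtype,
            device=attention_mask.device,
        )
        attention_mask = torch.cat([attention_mask, padding], dim=-1)
    elif key_len > seq_length:
        # Truncate the key dimension down to seq_length.
        attention_mask = attention_mask[:, :, :, :seq_length]
    return attention_mask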
@@ -1912,7 +1883,6 @@ class QuietForCausalLM(QuietPreTrainedModel):
                 inputs_embeds = inputs_embeds.view(probabilities.size(0), probabilities.size(1), -1).to(self.model.embed_tokens.weight.dtype)
                 inputs_embeds = inputs_embeds.view(probabilities.size(0), probabilities.size(1), -1).to(self.model.embed_tokens.weight.dtype)
 
-                # Update the attention mask when new tokens are added
                 if len(attention_mask.shape) == 2:
                     breakpoint()
                 else:
@@ -1935,7 +1905,6 @@ class QuietForCausalLM(QuietPreTrainedModel):
                     new_attention = new_attention * original_attention
                     new_attention[new_attention == 0] = attention_mask.min()
                     new_attention[new_attention == 1] = attention_mask.max()
-                    attention_mask = torch.cat([original_attention, new_attention], dim=-1)
                     attention_mask = torch.cat([attention_mask, new_attention], dim=-1)
                     past_key_values = outputs.past_key_values
                     position_ids = position_ids + 1
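The two single-line hunks above drop a stale comment and a duplicated concatenation. With the duplicate `torch.cat([original_attention, new_attention], dim=-1)` gone, the accumulated 4D mask grows by exactly one key column per appended token. A toy sketch of the shape arithmetic, assuming a (batch, heads, query_len, key_len) layout and that these lines run inside the per-token loop suggested by the surrounding `past_key_values` and `position_ids` updates:

import torch

# Toy shapes only; names mirror the diff, values are made up for illustration.
batch, heads, query_len, key_len = 1, 1, 4, 4
attention_mask = torch.zeros(batch, heads, query_len, key_len)   # accumulated additive mask
new_attention = torch.zeros(batch, heads, query_len, 1)          # column for the newly appended token

# Post-commit: append a single key column per step.
attention_mask = torch.cat([attention_mask, new_attention], dim=-1)
print(attention_mask.shape)  # torch.Size([1, 1, 4, 5])

# Pre-commit, `attention_mask = torch.cat([original_attention, new_attention], dim=-1)`
# ran immediately before this cat, so each iteration reset the mask to original_attention
# and then widened it by two columns rather than one.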
 