GradientGuru committed
Commit 92dd329 • 1 parent: 85278de
cache alibi_mask to accelerate training
Files changed: modeling_baichuan.py (+8 -2)
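The change makes `BaichuanModel.forward` reuse the ALiBi attention bias across training steps instead of rebuilding it on every call: the mask depends only on the sequence length, not on the batch contents, and during training the sequence length is usually fixed by the data pipeline, so one tensor can serve every step. Below is a minimal sketch of the pattern in isolation; the actual change follows in the diff. `AlibiMaskCache` and `build_alibi_mask` are hypothetical names standing in for `BaichuanModel` and its `get_alibi_mask`, and the mask construction here is a simplified placeholder (a causal additive bias without the per-head ALiBi slopes):

```python
import torch
from torch import nn


class AlibiMaskCache(nn.Module):
    """Minimal sketch of the caching pattern this commit introduces."""

    def __init__(self, num_heads: int):
        super().__init__()
        self.num_heads = num_heads
        self.alibi_mask = None  # built lazily, reused while the length matches

    def build_alibi_mask(self, seq_len: int, dtype, device) -> torch.Tensor:
        # Placeholder for get_alibi_mask: a (num_heads, seq_len, seq_len)
        # additive causal bias. The real builder also applies ALiBi slopes.
        bias = torch.full((seq_len, seq_len), float("-inf"), dtype=dtype, device=device)
        bias = torch.triu(bias, diagonal=1)  # 0 on/below the diagonal, -inf above
        return bias.expand(self.num_heads, -1, -1)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        seq_len = hidden_states.shape[1]
        if self.training:
            # Rebuild only when the cache is empty or the length changed.
            if self.alibi_mask is None or self.alibi_mask.shape[-1] != seq_len:
                self.alibi_mask = self.build_alibi_mask(
                    seq_len, hidden_states.dtype, hidden_states.device
                )
            return self.alibi_mask
        # At inference the length grows by one token per decoding step,
        # so a cache would be invalidated every call; build the mask fresh.
        return self.build_alibi_mask(seq_len, hidden_states.dtype, hidden_states.device)
```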
modeling_baichuan.py
CHANGED
```diff
@@ -249,7 +249,8 @@ class BaichuanModel(BaichuanPreTrainedModel):
         self.gradient_checkpointing = config.gradient_checkpointing
         self.post_init()
         self.max_cache_pos = config.model_max_length
-        self.first_run = True
+        self.first_run = True
+        self.alibi_mask = None

     def get_input_embeddings(self):
         return self.embed_tokens
@@ -306,8 +307,13 @@ class BaichuanModel(BaichuanPreTrainedModel):
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)

+        if self.training:
+            if self.alibi_mask is None or self.alibi_mask.shape[-1] != seq_length_with_past:
+                self.alibi_mask = self.get_alibi_mask(inputs_embeds, seq_length_with_past)
+            alibi_mask = self.alibi_mask
+        else:
+            alibi_mask = self.get_alibi_mask(inputs_embeds, seq_length_with_past)

-        alibi_mask = self.get_alibi_mask(inputs_embeds, seq_length_with_past)
         if attention_mask is not None:
             if len(attention_mask.shape) == 2:
                 expanded_mask = attention_mask.to(alibi_mask.dtype)
```
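Two design points in the diff are worth noting. The cache is keyed only on `shape[-1]`, i.e. on `seq_length_with_past`, so it is rebuilt whenever the training length changes and otherwise served as-is; and it is gated on `self.training` because at inference `seq_length_with_past` grows with the KV cache on every decoding step, which would invalidate the cache each call. A hypothetical micro-benchmark against the `AlibiMaskCache` sketch above (names and sizes illustrative, not taken from the repository) shows where the saving comes from:

```python
import time
import torch

cache = AlibiMaskCache(num_heads=40)   # head count illustrative
x = torch.randn(2, 2048, 128)          # (batch, seq_len, head_dim), sizes illustrative

cache.train()                          # cached path: mask built once, then reused
t0 = time.perf_counter()
for _ in range(100):
    mask = cache(x)
print("training (cached):   ", time.perf_counter() - t0)

cache.eval()                           # uncached path: mask rebuilt on every call
t0 = time.perf_counter()
for _ in range(100):
    mask = cache(x)
print("inference (uncached):", time.perf_counter() - t0)
```

One caveat the diff accepts: the cached mask is a plain Python attribute rather than a registered buffer, so it is not saved in checkpoints; since it starts out as `None`, it is simply rebuilt on the first training step after loading.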