baichuan-inc/Baichuan2-13B-Chat

Text Generation

text-generation-inference

Inference Endpoints

Model card | Files and versions | Community

Update modeling_baichuan.py

#12

by JaheimLee - opened Sep 13, 2023

base: refs/heads/main ← from: refs/pr/12

Discussion Files changed

Files changed (1) hide show

modeling_baichuan.py +10 -7

modeling_baichuan.py CHANGED Viewed

@@ -30,7 +30,8 @@ except ImportError:
     logger.warning(
         "Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers\npip install xformers."
     )
 def _get_interleave(n):
     def _get_interleave_power_of_2(n):
@@ -173,12 +174,14 @@ class BaichuanAttention(torch.nn.Module):
         past_key_value = (key_states, value_states) if use_cache else None
         if xops is not None and self.training:
             attn_weights = None
-            # query_states = query_states.transpose(1, 2)
-            # key_states = key_states.transpose(1, 2)
-            # value_states = value_states.transpose(1, 2)
-            # attn_output = xops.memory_efficient_attention(
-            #     query_states, key_states, value_states, attn_bias=attention_mask
-            # )
             with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True):
                 attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = attention_mask)
             attn_output = attn_output.transpose(1, 2)

     logger.warning(
         "Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers\npip install xformers."
     )
+pytorch_major_version = int(torch.__version__.split('.')[0])
 def _get_interleave(n):
     def _get_interleave_power_of_2(n):
         past_key_value = (key_states, value_states) if use_cache else None
         if xops is not None and self.training:
             attn_weights = None
+            query_states = query_states.transpose(1, 2)
+            key_states = key_states.transpose(1, 2)
+            value_states = value_states.transpose(1, 2)
+            attn_output = xops.memory_efficient_attention(
+                query_states, key_states, value_states, attn_bias=attention_mask
+            )
+        elif pytorch_major_version >= 2:
+            attn_weights = None
             with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True):
                 attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask = attention_mask)
             attn_output = attn_output.transpose(1, 2)