fixed flash_attention backward_compat
modeling_decilm.py

@@ -385,7 +385,6 @@ class DeciLMAttention(nn.Module):
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         bsz, q_len, _ = hidden_states.size()
-
         if self.config.pretraining_tp > 1:
             key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
             query_slices = self.q_proj.weight.split(
@@ -497,7 +496,6 @@ class DeciLMFlashAttention2(DeciLMAttention):
                 "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
                 "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
             )
-
         output_attentions = False
 
         bsz, q_len, _ = hidden_states.size()
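Note on the `static` cache error kept as context above: the sketch below shows the workaround the message suggests, loading the model with `attn_implementation="sdpa"` when a static KV cache is wanted. It is a minimal, hedged example; the checkpoint path is a placeholder and the from_pretrained/generate options are the standard transformers APIs, not code from this repository.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "path/to/decilm-checkpoint"  # placeholder, not a real checkpoint name

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",  # static cache is not compatible with flash_attention_2
    trust_remote_code=True,
)

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
# cache_implementation="static" requests the static KV cache in recent transformers releases.
outputs = model.generate(**inputs, max_new_tokens=16, cache_implementation="static")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))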
transformers_4_44_2__modeling_flash_attention_utils_backward_compat.py

@@ -15,12 +15,18 @@
 
 import inspect
 import os
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Union
+
 
 import torch
 import torch.nn.functional as F
 
-from
+from functools import lru_cache
+import importlib.metadata
+import importlib.util
+from packaging import version
+
+from transformers.utils import is_flash_attn_2_available
 
 
 if is_flash_attn_2_available():
@@ -32,6 +38,46 @@ if is_flash_attn_2_available():
     raise "Unable to import flash_attn"
 
 
+def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[Tuple[bool, str], bool]:
+    # Check if the package spec exists and grab its version to avoid importing a local directory
+    package_exists = importlib.util.find_spec(pkg_name) is not None
+    package_version = "N/A"
+    if package_exists:
+        try:
+            # Primary method to get the package version
+            package_version = importlib.metadata.version(pkg_name)
+        except importlib.metadata.PackageNotFoundError:
+            # Fallback method: Only for "torch" and versions containing "dev"
+            if pkg_name == "torch":
+                try:
+                    package = importlib.import_module(pkg_name)
+                    temp_version = getattr(package, "__version__", "N/A")
+                    # Check if the version contains "dev"
+                    if "dev" in temp_version:
+                        package_version = temp_version
+                        package_exists = True
+                    else:
+                        package_exists = False
+                except ImportError:
+                    # If the package can't be imported, it's not available
+                    package_exists = False
+            else:
+                # For packages other than "torch", don't attempt the fallback and set as not available
+                package_exists = False
+    if return_version:
+        return package_exists, package_version
+    else:
+        return package_exists
+
+
+@lru_cache()
+def is_flash_attn_greater_or_equal(library_version: str):
+    if not _is_package_available("flash_attn"):
+        return False
+
+    return version.parse(importlib.metadata.version("flash_attn")) >= version.parse(library_version)
+
+
 def _get_unpad_data(attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, int]:
     """
     Retrieves indexing data required to repad unpadded (ragged) tensors.