Commit e54ca32
Parent(s): 0f23f59
Update modeling_florence2.py

modeling_florence2.py  CHANGED  (+46 -32)
@@ -33,11 +33,8 @@ from transformers.utils import (
     ModelOutput,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
-    is_flash_attn_2_available,
     logging,
     replace_return_docstrings,
-    is_flash_attn_2_available,
-    is_flash_attn_greater_or_equal_2_10,
 )
 from .configuration_florence2 import Florence2Config
 from .configuration_florence2 import Florence2LanguageConfig
@@ -58,9 +55,52 @@ from transformers.modeling_outputs import (
     Seq2SeqModelOutput,
 )

+def flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=False):
+    # Standard scaled dot-product attention
+    d_k = q.size(-1)
+    scores = torch.matmul(q, k.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=q.dtype))
+
+    if causal:
+        mask = torch.triu(torch.ones_like(scores), diagonal=1)
+        scores = scores.masked_fill(mask.bool(), float('-inf'))
+
+    attn = F.softmax(scores, dim=-1)
+    if dropout_p > 0:
+        attn = F.dropout(attn, p=dropout_p)
+
+    return torch.matmul(attn, v)
+
+def flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout_p=0.0, softmax_scale=None, causal=False):
+    # For simplicity, we'll just call the non-varlen version
+    return flash_attn_func(q, k, v, dropout_p, softmax_scale, causal)
+
+# Dummy classes to mimic flash_attn.bert_padding
+class DummyIndexFirstAxis:
+    @staticmethod
+    def __call__(x, index):
+        return x[index]
+
+class DummyPadInput:
+    @staticmethod
+    def __call__(x, indices, batch_size, seqlen):
+        return x
+
+class DummyUnpadInput:
+    @staticmethod
+    def __call__(x, indices):
+        return x, indices, x.shape[1]
+
+index_first_axis = DummyIndexFirstAxis()
+pad_input = DummyPadInput()
+unpad_input = DummyUnpadInput()
+
+def is_flash_attn_2_available():
+    return True  # Always return True
+
+# Replace the is_flash_attn_greater_or_equal_2_10 function
+def is_flash_attn_greater_or_equal_2_10():
+    return True  # Always return True

-if is_flash_attn_2_available():
-    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa

 logger = logging.get_logger(__name__)

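Note on the fallback added above: flash_attn_func here is plain scaled dot-product attention. It always scales by 1/sqrt(head_dim) (the softmax_scale argument is accepted but never applied), and it attends over the last two dimensions, so it effectively assumes a (batch, num_heads, seq_len, head_dim) layout, whereas the upstream flash_attn.flash_attn_func expects (batch, seq_len, num_heads, head_dim). A minimal sanity check against PyTorch's built-in scaled dot-product attention, not part of this commit and assuming the 4-D layout just described, could look like:

import torch
import torch.nn.functional as F

# Hypothetical check (not in the commit): compare the fallback above against
# torch.nn.functional.scaled_dot_product_attention, causal mask, no dropout.
batch, num_heads, seq_len, head_dim = 2, 4, 16, 32
q = torch.randn(batch, num_heads, seq_len, head_dim)
k = torch.randn(batch, num_heads, seq_len, head_dim)
v = torch.randn(batch, num_heads, seq_len, head_dim)

out_fallback = flash_attn_func(q, k, v, dropout_p=0.0, causal=True)
out_reference = F.scaled_dot_product_attention(q, k, v, is_causal=True)
print(torch.allclose(out_fallback, out_reference, atol=1e-4))  # expected to print True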
@@ -1049,36 +1089,10 @@ class Florence2FlashAttention2(Florence2Attention):
             softmax_scale (`float`, *optional*):
                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
         """
-        if not self._flash_attn_uses_top_left_mask:
-            causal = self.is_causal
-        else:
-            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
-            causal = self.is_causal and query_length != 1

         # Contains at least one padding token in the sequence
         if attention_mask is not None:
-
-            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
-                query_states, key_states, value_states, attention_mask, query_length
-            )
-
-            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
-            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
-
-            attn_output_unpad = flash_attn_varlen_func(
-                query_states,
-                key_states,
-                value_states,
-                cu_seqlens_q=cu_seqlens_q,
-                cu_seqlens_k=cu_seqlens_k,
-                max_seqlen_q=max_seqlen_in_batch_q,
-                max_seqlen_k=max_seqlen_in_batch_k,
-                dropout_p=dropout,
-                softmax_scale=softmax_scale,
-                causal=causal,
-            )
-
-            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+            return super()._flash_attn_forward(query_states, key_states, value_states, attention_mask, query_length, dropout, softmax_scale)
         else:
             attn_output = flash_attn_func(
                 query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
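The removed branch unpadded the batch and called flash_attn_varlen_func with cumulative sequence lengths; since the replacement flash_attn_varlen_func above ignores its cu_seqlens_*/max_seqlen_* arguments, the padded case is now delegated to the parent class via super()._flash_attn_forward(...) instead. For context, a small, hypothetical illustration of the bookkeeping the deleted path relied on (mirroring the cu_seqlens_q and max_seqlen_in_batch_q variables in the removed code):

import torch

# Two sequences of lengths 3 and 2, right-padded to length 4.
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])

# Per-sequence lengths, cumulative offsets with a leading 0, and the batch maximum,
# in the form flash_attn_varlen_func consumed in the removed branch.
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
cu_seqlens = torch.nn.functional.pad(
    torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
)
max_seqlen_in_batch = int(seqlens_in_batch.max())

print(seqlens_in_batch)     # tensor([3, 2], dtype=torch.int32)
print(cu_seqlens)           # tensor([0, 3, 5], dtype=torch.int32)
print(max_seqlen_in_batch)  # 3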