Fix for FlashAttention RuntimeError & Triton multi-GPU fix.

#17
positional_embedding.py CHANGED
@@ -269,10 +269,10 @@ class RotaryEmbedding(torch.nn.Module):
         return (
             apply_rotary_pos_emb(
                 q, cos_cached[seqlen_offset:seq_len], sin_cached[seqlen_offset:seq_len], seq_dimension=seq_dimension
-            ),
+            ).to(q.dtype),
             apply_rotary_pos_emb(
                 k, cos_cached[seqlen_offset:seq_len], sin_cached[seqlen_offset:seq_len], seq_dimension=seq_dimension
-            ),
+            ).to(q.dtype),
         )
 
     @classmethod
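
Note on the positional_embedding.py change: `apply_rotary_pos_emb` multiplies `q`/`k` against the fp32 `cos_cached`/`sin_cached` buffers, so PyTorch type promotion can return fp32 tensors even when the model runs in fp16/bf16; the FlashAttention kernels then raise a RuntimeError because they only accept fp16/bf16 inputs. Casting both results back to `q.dtype` restores the expected dtype. A minimal standalone sketch of the failure mode and the fix (the simplified `apply_rotary_pos_emb_demo` helper and the shapes are illustrative, not this repo's code):

import torch

def apply_rotary_pos_emb_demo(x, cos, sin):
    # Simplified stand-in for apply_rotary_pos_emb: multiplying an fp16 tensor by an
    # fp32 cos/sin cache promotes the result to fp32 under PyTorch's promotion rules.
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)

q = torch.randn(1, 8, 128, 64, dtype=torch.float16)   # (batch, heads, seq, head_dim), illustrative shape
cos = torch.randn(128, 32, dtype=torch.float32)       # fp32 rotary cache
sin = torch.randn(128, 32, dtype=torch.float32)

rotated = apply_rotary_pos_emb_demo(q, cos, sin)
print(rotated.dtype)              # torch.float32 -> fp16/bf16-only attention kernels reject this
print(rotated.to(q.dtype).dtype)  # torch.float16 -> matches q again, as in the diff above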
triton_flash_blocksparse_attn.py CHANGED
@@ -611,30 +611,31 @@ def _forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale, BL
     # print(f'> {q.shape=}, {k.shape=}, {layout_crow_indices.shape}, {layout_col_indices.shape}, {layout_crow_indices.stride()}, \
     #   {layout_col_indices.stride()}, {layout_crow_indices=}, {layout_col_indices=}')
 
-    _fwd_kernel[grid](
-        q, k, v, sm_scale,
-        layout_crow_indices,
-        layout_col_indices,
-        layout_crow_indices.stride(0), layout_crow_indices.stride(1),
-        layout_col_indices.stride(0), layout_col_indices.stride(1),
-        tmp, L, m,
-        o,
-        q.stride(0), q.stride(1), q.stride(2), q.stride(3),
-        k.stride(0), k.stride(1), k.stride(2), k.stride(3),
-        v.stride(0), v.stride(1), v.stride(2), v.stride(3),
-        o.stride(0), o.stride(1), o.stride(2), o.stride(3),
-        q.shape[0], q.shape[1], k.shape[2],
-        k.shape[2] - q.shape[2],
-        q_rounded_len,
-        BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
-        BLOCK_DMODEL=BLOCK_DMODEL,
-        EVEN_M_BLOCK=q.shape[2] % BLOCK_M == 0,
-        EVEN_N_BLOCK=k.shape[2] % BLOCK_N == 0 ,
-        INFERENCE=inference,
-        NUM_DBLOCKS=q.shape[-1] // BLOCK_DMODEL,
-        num_warps=num_warps,
-        num_stages=num_stages,
-    )
+    with torch.cuda.device(q.device.index):
+        _fwd_kernel[grid](
+            q, k, v, sm_scale,
+            layout_crow_indices,
+            layout_col_indices,
+            layout_crow_indices.stride(0), layout_crow_indices.stride(1),
+            layout_col_indices.stride(0), layout_col_indices.stride(1),
+            tmp, L, m,
+            o,
+            q.stride(0), q.stride(1), q.stride(2), q.stride(3),
+            k.stride(0), k.stride(1), k.stride(2), k.stride(3),
+            v.stride(0), v.stride(1), v.stride(2), v.stride(3),
+            o.stride(0), o.stride(1), o.stride(2), o.stride(3),
+            q.shape[0], q.shape[1], k.shape[2],
+            k.shape[2] - q.shape[2],
+            q_rounded_len,
+            BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
+            BLOCK_DMODEL=BLOCK_DMODEL,
+            EVEN_M_BLOCK=q.shape[2] % BLOCK_M == 0,
+            EVEN_N_BLOCK=k.shape[2] % BLOCK_N == 0 ,
+            INFERENCE=inference,
+            NUM_DBLOCKS=q.shape[-1] // BLOCK_DMODEL,
+            num_warps=num_warps,
+            num_stages=num_stages,
+        )
     if inference:
         L, m = None, None
 
@@ -991,37 +992,38 @@ def blocksparse_flash_attn_padded_fwd(
 
     grid = (len(q_start_sids), n_heads)
 
-    _fwd_kernel_batch_inference[grid](
-        q, k, v, out,
-        sm_scale,
-        q_batch_starts,
-        q_batch_ends,
-        k_batch_starts,
-        k_batch_ends,
-        q_batch_ids,
-        q_start_sids,
-
-        *q.stride(),
-        *k.stride(),
-        *v.stride(),
-        *out.stride(),
-
-        layout_crow_indices,
-        layout_col_indices,
-        *layout_crow_indices.stride(),
-        *layout_col_indices.stride(),
-
-        q_k_ratio,
-        HAS_BATCH_DIM = True,
-        D_HEAD = head_size,
-        BLOCK_M = block_size,
-        BLOCK_N = block_size,
-        BLOCK_D = block_d,
-        BLOCK_M_LOADING = 16 if q_len == 1 else block_size, # smaller for decoding
-        EVEN_D = block_d == head_size,
-        num_warps = 1 if q_len == 1 else 4,
-        num_stages = 3
-    )
+    with torch.cuda.device(q.device.index):
+        _fwd_kernel_batch_inference[grid](
+            q, k, v, out,
+            sm_scale,
+            q_batch_starts,
+            q_batch_ends,
+            k_batch_starts,
+            k_batch_ends,
+            q_batch_ids,
+            q_start_sids,
+
+            *q.stride(),
+            *k.stride(),
+            *v.stride(),
+            *out.stride(),
+
+            layout_crow_indices,
+            layout_col_indices,
+            *layout_crow_indices.stride(),
+            *layout_col_indices.stride(),
+
+            q_k_ratio,
+            HAS_BATCH_DIM = True,
+            D_HEAD = head_size,
+            BLOCK_M = block_size,
+            BLOCK_N = block_size,
+            BLOCK_D = block_d,
+            BLOCK_M_LOADING = 16 if q_len == 1 else block_size, # smaller for decoding
+            EVEN_D = block_d == head_size,
+            num_warps = 1 if q_len == 1 else 4,
+            num_stages = 3
+        )
 
     return out
 
@@ -1093,37 +1095,38 @@ def blocksparse_flash_attn_varlen_fwd(
 
     grid = (len(q_start_sids), n_heads)
 
-    _fwd_kernel_batch_inference[grid](
-        q, k, v, out,
-        sm_scale,
-        cu_seqlens_q[:-1],
-        cu_seqlens_q[1:],
-        cu_seqlens_k[:-1],
-        cu_seqlens_k[1:],
-        q_batch_ids,
-        q_start_sids,
-
-        0, *q.stride(),
-        0, *k.stride(),
-        0, *v.stride(),
-        0, *out.stride(),
-
-        layout_crow_indices,
-        layout_col_indices,
-        *layout_crow_indices.stride(),
-        *layout_col_indices.stride(),
-
-        q_k_ratio,
-        HAS_BATCH_DIM = False,
-        D_HEAD = head_size,
-        BLOCK_M = block_size,
-        BLOCK_N = block_size,
-        BLOCK_D = block_d,
-        BLOCK_M_LOADING = 16 if decoding_only else block_size, # smaller for decoding
-        EVEN_D = block_d == head_size,
-        num_warps = 1 if decoding_only else 4,
-        num_stages = 3
-    )
+    with torch.cuda.device(q.device.index):
+        _fwd_kernel_batch_inference[grid](
+            q, k, v, out,
+            sm_scale,
+            cu_seqlens_q[:-1],
+            cu_seqlens_q[1:],
+            cu_seqlens_k[:-1],
+            cu_seqlens_k[1:],
+            q_batch_ids,
+            q_start_sids,
+
+            0, *q.stride(),
+            0, *k.stride(),
+            0, *v.stride(),
+            0, *out.stride(),
+
+            layout_crow_indices,
+            layout_col_indices,
+            *layout_crow_indices.stride(),
+            *layout_col_indices.stride(),
+
+            q_k_ratio,
+            HAS_BATCH_DIM = False,
+            D_HEAD = head_size,
+            BLOCK_M = block_size,
+            BLOCK_N = block_size,
+            BLOCK_D = block_d,
+            BLOCK_M_LOADING = 16 if decoding_only else block_size, # smaller for decoding
+            EVEN_D = block_d == head_size,
+            num_warps = 1 if decoding_only else 4,
+            num_stages = 3
+        )
 
     return out
 
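
Note on the triton_flash_blocksparse_attn.py change: Triton launches a kernel on whichever CUDA device is current at call time, not on the device that owns the tensors, so on a multi-GPU machine a shard placed on cuda:1 can fail or touch the wrong GPU's memory while cuda:0 is still the current device. Wrapping each launch in `with torch.cuda.device(q.device.index):` pins the current device to the query tensor's device for the duration of the call. A minimal sketch of the same pattern with a toy kernel (the `_scale_kernel`/`scale` names are illustrative, not part of this repo):

import torch
import triton
import triton.language as tl

@triton.jit
def _scale_kernel(x_ptr, out_ptr, n_elements, scale, BLOCK: tl.constexpr):
    # Toy kernel: out = x * scale, used only to illustrate the launch pattern.
    pid = tl.program_id(0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n_elements
    x = tl.load(x_ptr + offs, mask=mask)
    tl.store(out_ptr + offs, x * scale, mask=mask)

def scale(x: torch.Tensor, scale_val: float) -> torch.Tensor:
    out = torch.empty_like(x)
    n = x.numel()
    grid = (triton.cdiv(n, 1024),)
    # Triton launches on the *current* CUDA device. If `x` lives on cuda:1 while the
    # current device is still cuda:0 (the usual default), the launch targets the wrong
    # GPU. Pinning the current device to the tensor's device, as this PR does for the
    # blocksparse kernels above, makes the launch follow the data.
    with torch.cuda.device(x.device.index):
        _scale_kernel[grid](x, out, n, scale_val, BLOCK=1024)
    return out

# y = scale(torch.randn(4096, device="cuda:1"), 2.0)  # works even when cuda:0 is current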