Qwen
/

Qwen-1_8B-Chat-Int8

Text Generation

8-bit precision

Model card | Files and versions | Community

xingzhang committed on Dec 7, 2023

Commit

4572596

•

1 Parent(s): dfa6051

update modeling_qwen.py

Files changed (1) hide show

modeling_qwen.py +1 -1

modeling_qwen.py CHANGED Viewed

@@ -520,7 +520,7 @@ class QWenAttention(nn.Module):
             if not self.use_cache_quantization and SUPPORT_TORCH2:
                 if attention_mask is not None:
-                    attention_mask = attention_mask.expand(-1, -1, key_size, -1)
                     if causal_mask is not None:
                         attention_mask = attention_mask.masked_fill(~causal_mask, torch.finfo(query.dtype).min)
                 else:

             if not self.use_cache_quantization and SUPPORT_TORCH2:
                 if attention_mask is not None:
+                    attention_mask = attention_mask.expand(-1, -1, query.size(2), -1)
                     if causal_mask is not None:
                         attention_mask = attention_mask.masked_fill(~causal_mask, torch.finfo(query.dtype).min)
                 else: