Markus28 committed
Commit e6bd226
1 Parent(s): f669876

fix: use attention dropout during training

Files changed (1): modeling_bert.py (+2 -1)
modeling_bert.py CHANGED
@@ -357,7 +357,8 @@ class JinaBertSelfAttention(nn.Module):
         if self.attn_implementation == 'torch' and scaled_dot_product_attention is not None:
             b, _, s, _ = query_layer.shape
             new_bias = attention_mask + bias
-            attn = scaled_dot_product_attention(query_layer, key_layer, value_layer, new_bias)
+            dropout_p = self.dropout.p if self.training else 0.0
+            attn = scaled_dot_product_attention(query_layer, key_layer, value_layer, new_bias, dropout_p=dropout_p)
             attn = attn.permute(0, 2, 1, 3).contiguous()
             return (attn.view(b, s, self.all_head_size),)
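
For context, a minimal, self-contained sketch of the pattern this commit applies. torch.nn.functional.scaled_dot_product_attention is a plain function with no access to the module's train/eval state, so the caller has to gate dropout_p on self.training; passing self.dropout.p unconditionally would keep dropping attention weights at inference time. This is not the actual JinaBert module: the class name SdpaSelfAttention, the qkv projection, and the mask handling are illustrative assumptions, only the dropout_p gating mirrors the diff.

import torch
import torch.nn as nn
import torch.nn.functional as F

class SdpaSelfAttention(nn.Module):
    """Illustrative self-attention built on PyTorch's fused SDPA kernel."""

    def __init__(self, dim, num_heads, dropout=0.1):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.qkv = nn.Linear(dim, 3 * dim)
        self.dropout = nn.Dropout(dropout)  # .p holds the probability, as in the diff

    def forward(self, x, attn_mask=None):
        b, s, d = x.shape
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        # (b, s, d) -> (b, num_heads, s, head_dim)
        q, k, v = (t.view(b, s, self.num_heads, self.head_dim).transpose(1, 2)
                   for t in (q, k, v))
        # The commit's fix: apply dropout inside SDPA only while training,
        # mirroring how nn.Dropout disables itself under model.eval().
        dropout_p = self.dropout.p if self.training else 0.0
        attn = F.scaled_dot_product_attention(q, k, v, attn_mask, dropout_p=dropout_p)
        # (b, num_heads, s, head_dim) -> (b, s, d), matching the permute+view in the diff
        return attn.transpose(1, 2).reshape(b, s, d)

# Quick check: dropout is active in train mode, disabled in eval mode.
m = SdpaSelfAttention(dim=64, num_heads=4)
x = torch.randn(2, 10, 64)
m.train(); y_train = m(x)
m.eval(); y_eval = m(x)

The manual gate is needed because the fused kernel takes a bare float rather than a dropout module: nn.Dropout performs the same training-mode check internally, but that check is lost once only .p is forwarded.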