feat: implement task type embeddings
#1
by Markus28
- opened
- configuration_bert.py +4 -0
- modeling_bert.py +14 -2
configuration_bert.py
CHANGED

@@ -81,6 +81,8 @@ class JinaBertConfig(PretrainedConfig):
         fused_dropout_add_ln=False,
         fused_bias_fc=False,
         pad_vocab_size_multiple=1,
+        num_tasks=0,
+        use_flash_attn=True,
         **kwargs,
     ):
         assert 'position_embedding_type' not in kwargs
@@ -106,3 +108,5 @@ class JinaBertConfig(PretrainedConfig):
         self.fused_dropout_add_ln = fused_dropout_add_ln
         self.fused_bias_fc = fused_bias_fc
         self.pad_vocab_size_multiple = pad_vocab_size_multiple
+        self.num_tasks = num_tasks
+        self.use_flash_attn = use_flash_attn
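As a rough usage sketch of the two new config fields (the import path and the remaining config defaults are assumptions about this repo, not part of the diff):

    # Hypothetical usage of the new JinaBertConfig fields from this PR.
    # The import path is an assumption; point it at this repo's configuration_bert.py.
    from configuration_bert import JinaBertConfig

    config = JinaBertConfig(
        num_tasks=4,           # reserve 4 task-type embedding rows (default is 0)
        use_flash_attn=False,  # opt out of the FlashAttention mixer path (default is True)
    )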
modeling_bert.py
CHANGED

@@ -59,6 +59,7 @@ logger = logging.getLogger(__name__)
 
 
 def create_mixer_cls(config, cross_attn=False, return_residual=False):
+    use_flash_attn = getattr(config, "use_flash_attn", False)
     fused_bias_fc = getattr(config, "fused_bias_fc", False)
     window_size = getattr(config, "window_size", (-1, -1))
     mixer_cls = partial(
@@ -68,7 +69,7 @@ def create_mixer_cls(config, cross_attn=False, return_residual=False):
         dropout=config.attention_probs_dropout_prob,
         causal=False,
         fused_bias_fc=fused_bias_fc,
-        use_flash_attn=True,
+        use_flash_attn=use_flash_attn,
         return_residual=return_residual,
         use_alibi=True,
         window_size=window_size,
@@ -151,6 +152,7 @@ def _init_weights(module, initializer_range=0.02):
 class BertEncoder(nn.Module):
     def __init__(self, config: JinaBertConfig):
         super().__init__()
+        self.use_flash_attn = getattr(config, "use_flash_attn", False)
         self.layers = nn.ModuleList(
             [create_block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
         )
@@ -171,7 +173,7 @@ class BertEncoder(nn.Module):
         This means that we only compute the last layer output for these tokens.
         subset_mask: (batch, seqlen), dtype=torch.bool
         """
-        if key_padding_mask is None:
+        if key_padding_mask is None or not self.use_flash_attn:
             mixer_kwargs = (
                 {"key_padding_mask": key_padding_mask} if key_padding_mask is not None else None
             )
@@ -340,14 +342,21 @@ class BertModel(BertPreTrainedModel):
         self.emb_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.encoder = BertEncoder(config)
         self.pooler = BertPooler(config) if add_pooling_layer else None
+        self.task_type_embeddings = nn.Embedding(config.num_tasks, config.hidden_size)
 
         self.apply(partial(_init_weights, initializer_range=config.initializer_range))
+        # We now initialize the task embeddings to 0; We do not use task types during
+        # pretraining. When we start using task types during embedding training,
+        # we want the model to behave exactly as in pretraining (i.e. task types
+        # have no effect).
+        nn.init.zeros_(self.task_type_embeddings.weight)
 
     def forward(
         self,
         input_ids,
         position_ids=None,
         token_type_ids=None,
+        task_type_ids=None,
         attention_mask=None,
         masked_tokens_mask=None,
     ):
@@ -359,6 +368,9 @@ class BertModel(BertPreTrainedModel):
         hidden_states = self.embeddings(
             input_ids, position_ids=position_ids, token_type_ids=token_type_ids
         )
+        if task_type_ids is not None:
+            hidden_states = hidden_states + self.task_type_embeddings(task_type_ids)
+
         # TD [2022-12:18]: Don't need to force residual in fp32
         # BERT puts embedding LayerNorm before embedding dropout.
         if not self.fused_dropout_add_ln:
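For reference, a minimal self-contained sketch (toy shapes, not the actual model) of why the zero-initialized task embedding table keeps the forward pass identical to pretraining until those rows are trained:

    import torch
    import torch.nn as nn

    # Illustration only: a zero-initialized embedding lookup returns zero vectors,
    # so adding task_type_embeddings(task_type_ids) leaves hidden_states unchanged.
    hidden_size, num_tasks = 768, 4
    task_type_embeddings = nn.Embedding(num_tasks, hidden_size)
    nn.init.zeros_(task_type_embeddings.weight)

    hidden_states = torch.randn(2, 6, hidden_size)       # (batch, seqlen, hidden)
    task_type_ids = torch.zeros(2, 6, dtype=torch.long)  # every token tagged with task 0

    with_task = hidden_states + task_type_embeddings(task_type_ids)
    assert torch.equal(with_task, hidden_states)         # no-op until the table is trained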