das #9
by jupyterjazz · opened

Files changed:
- README.md +0 -33
- config.json +0 -40
- configuration_bert.py +6 -6
- convert_v2_weights.py +0 -151
- mha.py +0 -4
- mlp.py +0 -47
- modeling_bert.py +27 -145
- modeling_lora.py +15 -75
- tokenizer.py +88 -0
README.md
DELETED
@@ -1,33 +0,0 @@

# BERT with Flash-Attention

### Installing dependencies
To run the model on GPU, you need to install Flash Attention.
You may either install from pypi (which may not work with fused-dense), or from source.
To install from source, clone the GitHub repository:
```console
git clone git@github.com:Dao-AILab/flash-attention.git
```
The code provided here should work with commit `43950dd`.
Change to the cloned repo and install:
```console
cd flash-attention && python setup.py install
```
This will compile the flash-attention kernel, which will take some time.

If you would like to use fused MLPs (e.g. to use activation checkpointing),
you may also install fused-dense from source:
```console
cd csrc/fused_dense_lib && python setup.py install
```

### Configuration
The config adds some new parameters:
- `use_flash_attn`: If `True`, always use flash attention. If `None`, use flash attention when a GPU is available. If `False`, never use flash attention (works on CPU).
- `window_size`: Size (left and right) of the local attention window. If `(-1, -1)`, use global attention.
- `dense_seq_output`: If `True`, we only pass the hidden states of the masked-out tokens (around 15%) to the classifier heads. I set this to true for pretraining.
- `fused_mlp`: Whether to use fused-dense. Useful to reduce VRAM in combination with activation checkpointing.
- `mlp_checkpoint_lvl`: One of `{0, 1, 2}`. Increasing this increases the amount of activation checkpointing within the MLP. Keep this at 0 for pretraining and use gradient accumulation instead. For embedding training, increase this as much as needed.
- `last_layer_subset`: If `True`, we only need to compute the last layer for a subset of tokens. I left this at false.
- `use_qk_norm`: Whether or not to use QK-normalization.
- `num_loras`: Number of LoRAs to use when initializing a `BertLoRA` model. Has no effect on other models.
config.json
DELETED
@@ -1,40 +0,0 @@

```json
{
  "_name_or_path": "jinaai/jina-bert-flash-implementation",
  "auto_map": {
    "AutoConfig": "jinaai/jina-bert-flash-implementation--configuration_bert.JinaBertConfig",
    "AutoModel": "jinaai/jina-bert-flash-implementation--modeling_bert.BertModel",
    "AutoModelForPreTraining": "jinaai/jina-bert-flash-implementation--modeling_bert.BertForPreTraining",
    "AutoModelForMaskedLM": "jinaai/jina-bert-flash-implementation--modeling_bert.BertForPreTraining"
  },
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dense_seq_output": false,
  "emb_pooler": null,
  "fused_bias_fc": false,
  "fused_dropout_add_ln": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "last_layer_subset": false,
  "layer_norm_eps": 1e-12,
  "mlp_checkpoint_lvl": 0,
  "mlp_type": "glu",
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_loras": 5,
  "pad_token_id": 0,
  "pad_vocab_size_multiple": 1,
  "torch_dtype": "float16",
  "transformers_version": "4.39.3",
  "type_vocab_size": 2,
  "use_flash_attn": true,
  "use_qk_norm": false,
  "vocab_size": 30528,
  "window_size": [
    -1,
    -1
  ]
}
```
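Aside (my illustration, not part of the diff): with the `auto_map` entries above, the repository resolves through the Transformers auto classes; `trust_remote_code=True` is required because the model code lives in the repo itself.

```python
from transformers import AutoModel

# auto_map routes AutoModel to modeling_bert.BertModel inside the repo,
# so executing remote code must be explicitly allowed.
model = AutoModel.from_pretrained(
    'jinaai/jina-bert-flash-implementation', trust_remote_code=True
)
```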
configuration_bert.py
CHANGED

```diff
@@ -75,24 +75,24 @@ class JinaBertConfig(PretrainedConfig):
         pad_token_id=0,
         window_size=(-1, -1),
         dense_seq_output=False,
-        mlp_type='mlp',
+        fused_mlp=False,
         mlp_checkpoint_lvl=0,
         last_layer_subset=False,
         fused_dropout_add_ln=False,
         fused_bias_fc=False,
         pad_vocab_size_multiple=1,
+        num_tasks=0,
         use_flash_attn=True,
         use_qk_norm=True,
         emb_pooler=None,
         classifier_dropout=None,
-        num_loras=5,
         **kwargs,
     ):
         assert 'position_embedding_type' not in kwargs
         assert 'max_position_embeddings' not in kwargs
         super().__init__(pad_token_id=pad_token_id, **kwargs)

-        if mlp_type == 'fused_mlp' and hidden_act not in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]:
+        if fused_mlp and hidden_act not in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]:
             raise ValueError('Fused MLP only supports approximate gelu')

         self.vocab_size = vocab_size
@@ -108,14 +108,14 @@ class JinaBertConfig(PretrainedConfig):
         self.layer_norm_eps = layer_norm_eps
         self.window_size = window_size
         self.dense_seq_output = dense_seq_output
-        self.mlp_type = mlp_type
+        self.fused_mlp = fused_mlp
         self.mlp_checkpoint_lvl = mlp_checkpoint_lvl
         self.last_layer_subset = last_layer_subset
         self.fused_dropout_add_ln = fused_dropout_add_ln
         self.fused_bias_fc = fused_bias_fc
         self.pad_vocab_size_multiple = pad_vocab_size_multiple
+        self.num_tasks = num_tasks
         self.use_flash_attn = use_flash_attn
         self.use_qk_norm = use_qk_norm
         self.emb_pooler = emb_pooler
         self.classifier_dropout = classifier_dropout
-        self.num_loras = num_loras
```
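For orientation, a short sketch of how the two new flags behave (my example, not part of the PR; the import path and config defaults are assumed): `fused_mlp` gates the approximate-gelu check in `__init__`, and `num_tasks` sizes the task-embedding table that `modeling_bert.py` adds below.

```python
from configuration_bert import JinaBertConfig  # import path assumed for illustration

cfg = JinaBertConfig(fused_mlp=True, hidden_act='gelu_pytorch_tanh', num_tasks=3)
print(cfg.fused_mlp, cfg.num_tasks)  # True 3

try:
    JinaBertConfig(fused_mlp=True, hidden_act='gelu')  # plain gelu is rejected
except ValueError as err:
    print(err)  # Fused MLP only supports approximate gelu
```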
convert_v2_weights.py
DELETED
@@ -1,151 +0,0 @@

```python
import re
from collections import OrderedDict
from transformers import AutoModel, AutoTokenizer
from .configuration_bert import JinaBertConfig
import torch
import torch.nn.functional as F  # needed for F.pad below; missing from the original script
from .modeling_bert import BertModel

def remap_state_dict(state_dict, config: JinaBertConfig):
    """
    Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
    """

    # LayerNorm
    def key_mapping_ln_gamma_beta(key):
        key = re.sub(r"LayerNorm.gamma$", "LayerNorm.weight", key)
        key = re.sub(r"LayerNorm.beta$", "LayerNorm.bias", key)
        return key

    state_dict = OrderedDict((key_mapping_ln_gamma_beta(k), v) for k, v in state_dict.items())

    # Layers
    def key_mapping_layers(key):
        return re.sub(r"^encoder.layer.", "encoder.layers.", key)

    state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items())

    # LayerNorm
    def key_mapping_ln(key):
        key = re.sub(r"^embeddings.LayerNorm.", "emb_ln.", key)
        key = re.sub(
            r"^encoder.layers.(\d+).attention.output.LayerNorm.(weight|bias)",
            r"encoder.layers.\1.norm1.\2",
            key,
        )
        key = re.sub(
            r"^encoder.layers.(\d+).output.LayerNorm.(weight|bias)",
            r"encoder.layers.\1.norm2.\2",
            key,
        )
        key = re.sub(
            r"^cls.predictions.transform.LayerNorm.(weight|bias)",
            r"cls.predictions.transform.layer_norm.\1",
            key,
        )
        return key

    state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())

    # MLP
    def key_mapping_mlp(key):
        key = re.sub(
            r"^encoder.layers.(\d+).intermediate.dense.(weight|bias)",
            r"encoder.layers.\1.mlp.fc1.\2",
            key,
        )
        key = re.sub(
            r"^encoder.layers.(\d+).output.dense.(weight|bias)",
            r"encoder.layers.\1.mlp.fc2.\2",
            key,
        )
        return key

    state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())

    # Attention
    last_layer_subset = getattr(config, "last_layer_subset", False)
    for d in range(config.num_hidden_layers):
        Wq = state_dict.pop(f"encoder.layers.{d}.attention.self.query.weight")
        Wk = state_dict.pop(f"encoder.layers.{d}.attention.self.key.weight")
        Wv = state_dict.pop(f"encoder.layers.{d}.attention.self.value.weight")
        bq = state_dict.pop(f"encoder.layers.{d}.attention.self.query.bias")
        bk = state_dict.pop(f"encoder.layers.{d}.attention.self.key.bias")
        bv = state_dict.pop(f"encoder.layers.{d}.attention.self.value.bias")
        if not (last_layer_subset and d == config.num_hidden_layers - 1):
            state_dict[f"encoder.layers.{d}.mixer.Wqkv.weight"] = torch.cat(
                [Wq, Wk, Wv], dim=0
            )
            state_dict[f"encoder.layers.{d}.mixer.Wqkv.bias"] = torch.cat([bq, bk, bv], dim=0)
        else:
            state_dict[f"encoder.layers.{d}.mixer.Wq.weight"] = Wq
            state_dict[f"encoder.layers.{d}.mixer.Wkv.weight"] = torch.cat([Wk, Wv], dim=0)
            state_dict[f"encoder.layers.{d}.mixer.Wq.bias"] = bq
            state_dict[f"encoder.layers.{d}.mixer.Wkv.bias"] = torch.cat([bk, bv], dim=0)

    def key_mapping_attn(key):
        return re.sub(
            r"^encoder.layers.(\d+).attention.output.dense.(weight|bias)",
            r"encoder.layers.\1.mixer.out_proj.\2",
            key,
        )

    state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())

    def key_mapping_decoder_bias(key):
        return re.sub(r"^cls.predictions.bias", "cls.predictions.decoder.bias", key)

    state_dict = OrderedDict((key_mapping_decoder_bias(k), v) for k, v in state_dict.items())

    # Word embedding
    pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
    if pad_vocab_size_multiple > 1:
        word_embeddings = state_dict["embeddings.word_embeddings.weight"]
        state_dict["embeddings.word_embeddings.weight"] = F.pad(
            word_embeddings, (0, 0, 0, config.vocab_size - word_embeddings.shape[0])
        )
        decoder_weight = state_dict["cls.predictions.decoder.weight"]
        state_dict["cls.predictions.decoder.weight"] = F.pad(
            decoder_weight, (0, 0, 0, config.vocab_size - decoder_weight.shape[0])
        )
        # If the vocab was padded, we want to set the decoder bias for those padded indices to be
        # strongly negative (i.e. the decoder shouldn't predict those indices).
        # TD [2022-05-09]: I don't think it affects the MLPerf training.
        decoder_bias = state_dict["cls.predictions.decoder.bias"]
        state_dict["cls.predictions.decoder.bias"] = F.pad(
            decoder_bias, (0, config.vocab_size - decoder_bias.shape[0]), value=-100.0
        )

    # LayerNorm
    def key_mapping_layernorm(key):
        return re.sub(r'^encoder.layers.(\d+).mlp.layernorm.(weight|bias)', r"encoder.layers.\1.norm2.\2", key)

    state_dict = OrderedDict((key_mapping_layernorm(k), v) for k, v in state_dict.items())

    return state_dict


v2_model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)
config = JinaBertConfig(vocab_size=30528, use_qk_norm=False, mlp_type='glu', hidden_act='gelu')
state_dict = v2_model.state_dict()
new_state_dict = remap_state_dict(state_dict, config)
flash_model = BertModel(config)
flash_model.load_state_dict(new_state_dict)


torch.save(new_state_dict, 'converted_weights.bin')
print(config.to_json_string())


"""
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-en')
inp = tokenizer.batch_encode_plus(['Hello world', 'How is the weather today?', 'It is raining a lot in Berlin'], return_tensors='pt', padding=True).to('cuda')
v2_model.eval()
flash_model.eval()
v2_model = v2_model.to('cuda', torch.float16)
flash_model = flash_model.to('cuda', torch.float16)
output_v2 = v2_model(**inp)
output_flash = flash_model(**inp)
x = output_v2.last_hidden_state
y = output_flash.last_hidden_state
print(torch.abs(x - y))
"""
```
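The deleted script is essentially a chain of regex renames over checkpoint keys. A standalone illustration of one such rename (my example, using the same pattern as `key_mapping_ln` above):

```python
import re

# HF-style parameter name -> flash_attn layout, as remap_state_dict does
key = "encoder.layers.3.attention.output.LayerNorm.weight"
new_key = re.sub(
    r"^encoder.layers.(\d+).attention.output.LayerNorm.(weight|bias)",
    r"encoder.layers.\1.norm1.\2",
    key,
)
print(new_key)  # encoder.layers.3.norm1.weight
```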
mha.py
CHANGED

```diff
@@ -514,10 +514,6 @@ class MHA(nn.Module):
             alibi_slopes = torch.tensor(get_alibi_slopes(num_heads), device=device)
         else:
             alibi_slopes = None
-
-        if isinstance(window_size, list):
-            window_size = tuple(window_size)
-
         if window_size != (-1, -1):
             assert use_flash_attn, "Local (sliding window) attention code path requires flash_attn"
```
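Worth noting (an observation, not a claim about the PR's motivation): the removed coercion mattered whenever `window_size` arrived as a list, e.g. from a JSON config, because a list never compares equal to a tuple in Python:

```python
window_size = [-1, -1]                  # how config.json deserializes the tuple
print(window_size != (-1, -1))          # True: list vs tuple, so the local-attention branch triggers
print(tuple(window_size) != (-1, -1))   # False once coerced, as the removed lines did
```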
mlp.py
CHANGED

```diff
@@ -27,53 +27,6 @@ except ImportError:
     FusedMLP, ParallelFusedMLP = None, None


-class GLUMLP(nn.Module):
-    def __init__(
-        self,
-        in_features,
-        hidden_features,
-        activation,
-        use_flash_attn,
-        return_residual=False,
-        hidden_dropout_prob=0.1
-    ):
-        super().__init__()
-        self.hidden_features = hidden_features
-        self.gated_layers = nn.Linear(
-            in_features, hidden_features * 2, bias=False
-        )
-        if activation == 'relu':
-            self.act = nn.ReLU()
-        elif activation == 'gelu':
-            self.act = nn.GELU()
-        else:
-            raise ValueError(
-                f"activation {activation} not supported"
-            )
-        self.wo = nn.Linear(hidden_features, in_features)
-        self.dropout = nn.Dropout(hidden_dropout_prob)
-        self.return_residual = return_residual
-        self.use_flash_attn = use_flash_attn
-        #self.layernorm = nn.LayerNorm(in_features, eps=layer_norm_eps)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        residual_connection = hidden_states
-        # compute the activation
-        hidden_states = self.gated_layers(hidden_states)
-        if self.use_flash_attn:
-            gated = hidden_states[:, : self.hidden_features]
-            non_gated = hidden_states[:, self.hidden_features :]
-        else:
-            gated = hidden_states[:, :, : self.hidden_features]
-            non_gated = hidden_states[:, :, self.hidden_features :]
-        hidden_states = self.act(gated) * non_gated
-        hidden_states = self.dropout(hidden_states)
-        # multiply by the second matrix
-        hidden_states = self.wo(hidden_states)
-        # add the residual connection and post-LN
-        # hidden_states = self.layernorm(hidden_states + residual_connection)
-        return hidden_states if not self.return_residual else (hidden_states, residual_connection)
-
 class Mlp(nn.Module):
     def __init__(
         self,
```
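For readers skimming the deletion: `GLUMLP` projected to twice the hidden width, then gated one half with the activated other half. A standalone sketch of that gating (my example, using the flash-attn layout where sequences are packed into a 2-D tensor):

```python
import torch
import torch.nn as nn

hidden_features = 4
x = torch.randn(3, hidden_features * 2)   # output of the gated_layers projection
gated = x[:, :hidden_features]            # half passed through the activation
non_gated = x[:, hidden_features:]        # half used as the multiplicative gate
out = nn.GELU()(gated) * non_gated        # elementwise gating, as in GLUMLP.forward
print(out.shape)                          # torch.Size([3, 4])
```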
modeling_bert.py
CHANGED

```diff
@@ -39,7 +39,7 @@ from .bert_padding import (
 from .block import Block
 from .embedding import BertEmbeddings
 from .mha import MHA
-from .mlp import FusedMLP, GLUMLP, Mlp
+from .mlp import FusedMLP, Mlp

 try:
     from flash_attn.ops.fused_dense import FusedDense
@@ -81,23 +81,19 @@ def create_mixer_cls(config, cross_attn=False, return_residual=False):
         return_residual=return_residual,
         use_alibi=True,
         window_size=window_size,
-        qk_norm=use_qk_norm,
-        checkpointing=False,
+        qk_norm=use_qk_norm
     )
     return mixer_cls


 def create_mlp_cls(config, layer_idx=None, return_residual=False):
     inner_dim = config.intermediate_size
-    mlp_type = config.mlp_type
-
-    if mlp_type == 'fused_mlp':
+    fused_mlp = getattr(config, "fused_mlp", False)
+    if fused_mlp:
         assert config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"], (
             "fused_mlp only " "supports approximate gelu"
         )
-    if mlp_type == 'glu':
-        assert config.hidden_act in ('relu', 'gelu')
-    if mlp_type == 'mlp':
+    if not fused_mlp:
         approximate = (
             "tanh"
             if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
@@ -109,16 +105,7 @@ def create_mlp_cls(config, layer_idx=None, return_residual=False):
             activation=partial(F.gelu, approximate=approximate),
             return_residual=return_residual,
         )
-    elif mlp_type == 'glu':
-        mlp_cls = partial(
-            GLUMLP,
-            hidden_features=inner_dim,
-            activation=config.hidden_act,
-            use_flash_attn=config.use_flash_attn,
-            hidden_dropout_prob=config.hidden_dropout_prob,
-            return_residual=return_residual,
-        )
-    elif mlp_type == 'fused_mlp':
+    else:
         if FusedMLP is None:
             raise ImportError("fused_dense is not installed")
         mlp_checkpoint_lvl = getattr(config, "mlp_checkpoint_lvl", 0)
@@ -132,8 +119,6 @@ def create_mlp_cls(config, layer_idx=None, return_residual=False):
             checkpoint_lvl=mlp_checkpoint_lvl,
             return_residual=return_residual,
         )
-    else:
-        raise NotImplementedError
     return mlp_cls
@@ -167,7 +152,7 @@ def _init_weights(module, initializer_range=0.02):
         nn.init.normal_(module.weight, std=initializer_range)
         if module.bias is not None:
             nn.init.zeros_(module.bias)
-    elif isinstance(module, nn.Embedding):
+    elif isinstance(module, nn.Embedding) and not getattr(module, "skip_init", False):
         nn.init.normal_(module.weight, std=initializer_range)
         if module.padding_idx is not None:
             nn.init.zeros_(module.weight[module.padding_idx])
@@ -189,6 +174,8 @@ class BertEncoder(nn.Module):
     @gradient_checkpointing.setter
     def gradient_checkpointing(self, value):
         self._grad_checkpointing = value
+        for block in self.layers:
+            block.mixer.checkpointing = value

     def forward(self, hidden_states, key_padding_mask=None, subset_mask=None):
         """If subset_mask is not None, we only want output for the subset of the sequence.
@@ -200,15 +187,7 @@ class BertEncoder(nn.Module):
             {"key_padding_mask": key_padding_mask.bool()} if key_padding_mask is not None else None
         )
         for layer in self.layers:
-            if self._grad_checkpointing:
-                hidden_states = torch.utils.checkpoint.checkpoint(
-                    layer,
-                    hidden_states,
-                    use_reentrant=False,
-                    mixer_kwargs=mixer_kwargs
-                )
-            else:
-                hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
+            hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
         if subset_mask is not None:
             hidden_states = hidden_states[subset_mask]
         else:
@@ -219,27 +198,11 @@ class BertEncoder(nn.Module):
             mixer_kwargs = {"cu_seqlens": cu_seqlens, "max_seqlen": max_seqlen_in_batch}
             if subset_mask is None:
                 for layer in self.layers:
-                    if self._grad_checkpointing:
-                        hidden_states = torch.utils.checkpoint.checkpoint(
-                            layer,
-                            hidden_states,
-                            use_reentrant=False,
-                            mixer_kwargs=mixer_kwargs
-                        )
-                    else:
-                        hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
+                    hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
                 hidden_states = pad_input(hidden_states, indices, batch, seqlen)
             else:
                 for layer in self.layers[:-1]:
-                    if self._grad_checkpointing:
-                        hidden_states = torch.utils.checkpoint.checkpoint(
-                            layer,
-                            hidden_states,
-                            use_reentrant=False,
-                            mixer_kwargs=mixer_kwargs
-                        )
-                    else:
-                        hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
+                    hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
                 if key_padding_mask is not None:
                     subset_idx = torch.nonzero(
                         subset_mask[key_padding_mask], as_tuple=False
@@ -265,15 +228,7 @@ class BertEncoder(nn.Module):
                     "cu_seqlens_k": cu_seqlens,
                     "max_seqlen_k": max_seqlen_in_batch,
                 }
-                if self._grad_checkpointing:
-                    torch.utils.checkpoint.checkpoint(
-                        self.layers[-1],
-                        hidden_states_subset,
-                        use_reentrant=False,
-                        mixer_kwargs=mixer_kwargs
-                    )
-                else:
-                    hidden_states = self.layers[-1](hidden_states_subset, mixer_kwargs=mixer_kwargs)
+                hidden_states = self.layers[-1](hidden_states_subset, mixer_kwargs=mixer_kwargs)
         return hidden_states
@@ -396,16 +351,24 @@ class BertModel(BertPreTrainedModel):
         self.emb_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.encoder = BertEncoder(config)
         self.pooler = BertPooler(config) if add_pooling_layer else None
+        self.task_type_embeddings = nn.Embedding(config.num_tasks, config.hidden_size)

         self.emb_pooler = config.emb_pooler
         self._name_or_path = config._name_or_path
         if self.emb_pooler is not None:
             from transformers import AutoTokenizer

             self.tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)
         else:
             self.tokenizer = None

+        # We now initialize the task embeddings to 0; We do not use task types during
+        # pretraining. When we start using task types during embedding training,
+        # we want the model to behave exactly as in pretraining (i.e. task types
+        # have no effect).
+        nn.init.zeros_(self.task_type_embeddings.weight)
+        self.task_type_embeddings.skip_init = True
+        # The following code should skip the embeddings layer
         self.apply(partial(_init_weights, initializer_range=config.initializer_range))

     def forward(
@@ -413,9 +376,9 @@ class BertModel(BertPreTrainedModel):
         input_ids,
         position_ids=None,
         token_type_ids=None,
+        task_type_ids=None,
         attention_mask=None,
         masked_tokens_mask=None,
-        return_dict=True,
     ):
         """If masked_tokens_mask is not None (i.e. last_layer_subset == True in BertForPreTraining),
         we only want the output for the masked tokens. This means that we only compute the last
@@ -425,6 +388,8 @@ class BertModel(BertPreTrainedModel):
         hidden_states = self.embeddings(
             input_ids, position_ids=position_ids, token_type_ids=token_type_ids
         )
+        if task_type_ids is not None:
+            hidden_states = hidden_states + self.task_type_embeddings(task_type_ids)

         # TD [2022-12:18]: Don't need to force residual in fp32
         # BERT puts embedding LayerNorm before embedding dropout.
@@ -464,9 +429,6 @@ class BertModel(BertPreTrainedModel):
             sequence_output = sequence_output[masked_tokens_mask[subset_mask]]
         pooled_output = self.pooler(pool_input, pool=False) if self.pooler is not None else None

-        if not return_dict:
-            return (sequence_output, pooled_output)
-
         return BaseModelOutputWithPoolingAndCrossAttentions(
             last_hidden_state=sequence_output,
             pooler_output=pooled_output,
@@ -522,7 +484,7 @@ class BertModel(BertPreTrainedModel):
             self.emb_pooler = 'mean'
             from transformers import AutoTokenizer

             self.tokenizer = AutoTokenizer.from_pretrained(self._name_or_path)
         if self.emb_pooler != 'mean':
             raise NotImplementedError
@@ -723,84 +685,4 @@ class BertForPreTraining(BertPreTrainedModel):
             loss=total_loss,
             prediction_logits=prediction_scores,
             seq_relationship_logits=seq_relationship_score,
         )
-
-
-class BertForMaskedLM(BertPreTrainedModel):
-    def __init__(self, config: JinaBertConfig):
-        super().__init__(config)
-        # If dense_seq_output, we only need to pass the hidden states for the masked out tokens
-        # (around 15%) to the classifier heads.
-        self.dense_seq_output = getattr(config, "dense_seq_output", False)
-        # If last_layer_subset, we only need the compute the last layer for a subset of tokens
-        # (e.g., the tokens we need to compute the masked LM loss and the next-sentence prediction).
-        self.last_layer_subset = getattr(config, "last_layer_subset", False)
-        if self.last_layer_subset:
-            assert self.dense_seq_output, "last_layer_subset requires dense_seq_output"
-        use_xentropy = getattr(config, "use_xentropy", False)
-        if use_xentropy and CrossEntropyLoss is None:
-            raise ImportError("xentropy_cuda is not installed")
-        loss_cls = (
-            nn.CrossEntropyLoss
-            if not use_xentropy
-            else partial(CrossEntropyLoss, inplace_backward=True)
-        )
-
-        self.bert = BertModel(config)
-        self.cls = BertPreTrainingHeads(config)
-        self.mlm_loss = loss_cls(ignore_index=0)
-
-        # Initialize weights and apply final processing
-        self.apply(partial(_init_weights, initializer_range=config.initializer_range))
-        self.tie_weights()
-
-    def tie_weights(self):
-        self.cls.predictions.decoder.weight = self.bert.embeddings.word_embeddings.weight
-
-    def get_input_embeddings(self):
-        return self.bert.embeddings.word_embeddings
-
-    def forward(
-        self,
-        input_ids,
-        position_ids=None,
-        token_type_ids=None,
-        attention_mask=None,
-        labels=None
-    ):
-        masked_tokens_mask = labels > 0 if (self.last_layer_subset and labels is not None) else None
-        outputs = self.bert(
-            input_ids,
-            position_ids=position_ids,
-            token_type_ids=token_type_ids,
-            attention_mask=attention_mask.bool() if attention_mask is not None else None,
-            masked_tokens_mask=masked_tokens_mask,
-        )
-        sequence_output, pooled_output = outputs.last_hidden_state, outputs.pooler_output
-        if self.dense_seq_output and labels is not None:
-            masked_token_idx = torch.nonzero(labels.flatten() > 0, as_tuple=False).flatten()
-            if not self.last_layer_subset:
-                sequence_output = index_first_axis(
-                    rearrange(sequence_output, "b s d -> (b s) d"), masked_token_idx
-                )
-        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
-
-        if (
-            self.dense_seq_output and labels is not None
-        ):  # prediction_scores are already flattened
-            masked_lm_loss = self.mlm_loss(
-                prediction_scores, labels.flatten()[masked_token_idx]
-            ).float()
-        elif labels is not None:
-            masked_lm_loss = self.mlm_loss(
-                rearrange(prediction_scores, "... v -> (...) v"),
-                rearrange(labels, "... -> (...)"),
-            ).float()
-        else:
-            raise ValueError('MLM labels must not be None')
-
-        return BertForPreTrainingOutput(
-            loss=masked_lm_loss,
-            prediction_logits=prediction_scores,
-            seq_relationship_logits=seq_relationship_score,
-        )
```
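The zero-initialized task embeddings deserve a quick illustration (mine, not from the PR): because the table starts at zero and the `skip_init` attribute shields it from `_init_weights`, adding a task embedding is a no-op until the table is trained, which is exactly the pretraining-compatibility argument in the comment above.

```python
import torch
import torch.nn as nn

hidden_size, num_tasks = 8, 3
task_emb = nn.Embedding(num_tasks, hidden_size)
nn.init.zeros_(task_emb.weight)           # as BertModel.__init__ does
task_emb.skip_init = True                 # _init_weights now leaves it alone

hidden_states = torch.randn(2, 5, hidden_size)    # (batch, seq, hidden)
task_type_ids = torch.full((2, 5), 2)             # task id 2 for every token
out = hidden_states + task_emb(task_type_ids)
print(torch.equal(out, hidden_states))            # True while the weights stay zero
```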
modeling_lora.py
CHANGED

```diff
@@ -65,8 +65,6 @@ class LoRAParametrization(nn.Module):
         fan_in_fan_out = layer_type == "embedding"
         self.swap = (lambda x: (x[1], x[0])) if fan_in_fan_out else (lambda x: x)

-        # For the officially "correct" LoRA initialization, check here: https://github.com/microsoft/LoRA
-        # TODO: Ensure that the initialization here is correct
         if layer_type == "linear":
             self.lora_A = nn.Parameter(
                 initialized_weights((rank, fan_in), num_adaptions, init="kaiming")
@@ -196,64 +194,30 @@ class LoRAParametrization(nn.Module):
             ),
         )

-    @staticmethod
-    def select_task_for_layer(layer: nn.Module, task_idx: Optional[int] = None):
+    @classmethod
+    def select_task_for_layer(cls, layer: nn.Module, task_idx: Optional[int] = None):
         if isinstance(layer, LoRAParametrization):
             layer.current_task = task_idx

-    @staticmethod
-    def merge_lora_into_layer(layer: nn.Module):
-        if hasattr(layer, "parametrizations"):
-            for attr_name in layer.parametrizations.keys():
-                parametrize.remove_parametrizations(layer, attr_name, leave_parametrized=True)
-

 class BertLoRA(BertPreTrainedModel):
-    def __init__(self, config: JinaBertConfig, bert: Optional[BertModel] = None, add_pooling_layer=True):
+    def __init__(self, config: JinaBertConfig, bert: Optional[BertModel] = None, add_pooling_layer=True, num_adaptions=1):
         super().__init__(config)
         if bert is None:
             self.bert = BertModel(config, add_pooling_layer=add_pooling_layer)
         else:
             self.bert = bert
-        self._is_merged = False
-        self._num_adaptions = config.num_loras
-        self._register_lora(self._num_adaptions)
-        self.main_params_trainable = False
-        self._task_idx = None
-        # By default, we select the first LoRA
-        self.current_task = 0
-
-    @property
-    def main_params_trainable(self):
-        return self._main_params_trainable
-
-    @main_params_trainable.setter
-    def main_params_trainable(self, val: bool):
-        """Whether the main parameters (i.e. those that are not LoRA) should be trainable.
-
-        This method sets the `requires_grad_` attribute of the main weights
-        and controls which parameters are returned in `self.parameters()`.
-
-        :param val: Whether or not to make the parameters trainable.
-        :return: None
-        """
-        self._main_params_trainable = val
+        self._register_lora(num_adaptions)
         for name, param in super().named_parameters():
             if "lora" not in name:
-                param.requires_grad_(val)
+                param.requires_grad_(False)
+        self.current_task = 0

     @classmethod
-    def from_bert(cls, *args, **kwargs):
+    def from_bert(cls, *args, num_adaptions=1, **kwargs):
         bert = BertModel.from_pretrained(*args, **kwargs)
         config = JinaBertConfig.from_pretrained(*args, **kwargs)
-        return cls(config, bert=bert)
-
-    def merge_lora(self):
-        """Merges currently selected LoRA into main weights."""
-        if self._is_merged:
-            raise Exception('LoRA has already been merged, cannot merge again')
-        self._is_merged = True
-        self.apply(LoRAParametrization.merge_lora_into_layer)
+        return cls(config, bert=bert, num_adaptions=num_adaptions)

     @classmethod
     def from_pretrained(
@@ -270,13 +234,7 @@ class BertLoRA(BertPreTrainedModel):
         use_safetensors: bool = None,
         **kwargs,
     ):
-        """
-        TODO: choose between from_bert and super().from_pretrained
-
-        We want to be able to load both a pretrained BertModel, and a trained
-        BertLoRA via this method. To this end, we need to check which of these
-        models we are expected to load.
-        """
+        # TODO: choose between from_bert and super().from_pretrained
         return cls.from_bert(pretrained_model_name_or_path)

     def _register_lora(self, num_adaptions=1, rank=4, lora_dropout_p=0.0, lora_alpha=1):
@@ -292,34 +250,16 @@ class BertLoRA(BertPreTrainedModel):

     @property
     def current_task(self):
-        """ Which LoRA is currently selected
-        :return: Integer or None (when LoRA is disabled)
-        """
         return self._task_idx

     @current_task.setter
     def current_task(self, task_idx: Union[None, int]):
-        """
-        :param task_idx: Which LoRA to use
-        :return:
-        """
-        if self._is_merged:
-            raise Exception('LoRA has been merged, cannot select new task')
-        assert task_idx is None or 0 <= task_idx < self._num_adaptions
-        if self._task_idx != task_idx:
-            # In this case, we need to update the LoRAs everywhere
-            self._task_idx = task_idx
-            self.apply(
-                partial(LoRAParametrization.select_task_for_layer, task_idx=task_idx)
-            )
+        self._task_idx = task_idx
+        self.apply(
+            partial(LoRAParametrization.select_task_for_layer, task_idx=task_idx)
+        )

-    def forward(self, *args, current_task: Optional[int] = None, **kwargs):
-        if current_task is None or current_task >= 0:
-            self.current_task = current_task
+    def forward(self, *args, **kwargs):
         return self.bert(*args, **kwargs)

     def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
@@ -332,5 +272,5 @@ class BertLoRA(BertPreTrainedModel):
         for name, param in super().named_parameters(
             prefix=prefix, recurse=recurse, remove_duplicate=remove_duplicate
         ):
-            if "lora" in name or self.main_params_trainable:
+            if "lora" in name:
                 yield name, param
```
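The task-selection mechanism relies on `nn.Module.apply`, which visits every submodule; `select_task_for_layer` then flips `current_task` only on `LoRAParametrization` instances. A self-contained sketch of that dispatch (the class below is a minimal stand-in, not the PR's full implementation):

```python
from functools import partial
from typing import Optional

import torch.nn as nn


class LoRAParametrization(nn.Module):
    def __init__(self):
        super().__init__()
        self.current_task = None

    @classmethod
    def select_task_for_layer(cls, layer: nn.Module, task_idx: Optional[int] = None):
        # apply() calls this on every submodule; only parametrizations react
        if isinstance(layer, LoRAParametrization):
            layer.current_task = task_idx


model = nn.Sequential(nn.Linear(4, 4), LoRAParametrization())
model.apply(partial(LoRAParametrization.select_task_for_layer, task_idx=2))
print(model[1].current_task)  # 2: only the parametrization module was touched
```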
tokenizer.py
ADDED
@@ -0,0 +1,88 @@

```python
import torch
import numpy as np
from transformers import RobertaTokenizer, BatchEncoding, RobertaTokenizerFast
import warnings


def get_tokenizer(parent_class):
    class TokenizerClass(parent_class):
        def __init__(self, *args, **kwargs):
            """
            This class dynamically extends a given tokenizer class from the HF
            Transformers library (RobertaTokenizer or RobertaTokenizerFast).
            The task_type_ids are used to pass instruction information to the model.
            A task_type should either be an integer or a sequence of integers with the same
            length as the batch size.
            """
            super().__init__(*args, **kwargs)

        def __call__(self, *args, task_type=None, **kwargs):
            batch_encoding = super().__call__(*args, **kwargs)
            if task_type is not None:
                batch_encoding = self._add_task_type_ids(batch_encoding, task_type, kwargs.get('return_tensors'))
            return batch_encoding

        def _batch_encode_plus(self, *args, task_type=None, **kwargs):
            batch_encoding = super()._batch_encode_plus(*args, **kwargs)
            if task_type is not None:
                batch_encoding = self._add_task_type_ids(batch_encoding, task_type, kwargs.get('return_tensors'))
            return batch_encoding

        def _encode_plus(self, *args, task_type=None, **kwargs):
            batch_encoding = super()._encode_plus(*args, **kwargs)
            if task_type is not None:
                batch_encoding = self._add_task_type_ids(batch_encoding, task_type, kwargs.get('return_tensors'))
            return batch_encoding

        @classmethod
        def _add_task_type_ids(cls, batch_encoding, task_type, tensor_type):
            return BatchEncoding(
                {
                    'task_type_ids': cls._get_task_type_ids(batch_encoding, task_type),
                    **batch_encoding,
                },
                tensor_type=tensor_type,
            )

        @staticmethod
        def _get_task_type_ids(batch_encoding: BatchEncoding, task_type):

            def apply_task_type(m, x):
                x = torch.tensor(x)
                assert (
                    len(x.shape) == 0 or x.shape[0] == m.shape[0]
                ), 'The shape of task_type does not match the size of the batch.'
                return m * x if len(x.shape) == 0 else m * x[:, None]

            if isinstance(batch_encoding['input_ids'], torch.Tensor):
                shape = batch_encoding['input_ids'].shape
                return apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
            else:
                try:
                    shape = torch.tensor(batch_encoding['input_ids']).shape
                except Exception:  # bare `except:` in the submitted file; narrowed here
                    raise ValueError(
                        "Unable to create tensor, you should probably "
                        "activate truncation and/or padding with "
                        "'padding=True' 'truncation=True' to have batched "
                        "tensors with the same length."
                    )
                if isinstance(batch_encoding['input_ids'], list):
                    return (
                        apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
                    ).tolist()
                elif isinstance(batch_encoding['input_ids'], np.ndarray):  # was np.array, which is a function, not a type
                    return (
                        apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
                    ).numpy()
                else:
                    warnings.warn(
                        'input_ids is not a torch tensor, numpy array, or list. Returning torch tensor'
                    )
                    return apply_task_type(torch.ones(shape, dtype=torch.long), task_type)

    return TokenizerClass


JinaTokenizer = get_tokenizer(RobertaTokenizer)
JinaTokenizerFast = get_tokenizer(RobertaTokenizerFast)
```
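A usage sketch for the new wrapper (my example; the checkpoint name is a placeholder, any RoBERTa-style vocabulary works): `task_type` is broadcast over the batch and returned as `task_type_ids` alongside `input_ids`, matching the `task_type_ids` argument that `BertModel.forward` now accepts.

```python
from tokenizer import JinaTokenizerFast  # the module added in this PR

tok = JinaTokenizerFast.from_pretrained('roberta-base')  # placeholder checkpoint
enc = tok(
    ['a query', 'another query'],
    task_type=1,
    padding=True,
    return_tensors='pt',
)
print(enc['task_type_ids'])  # same shape as input_ids, filled with the task id
```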