Upload modeling_esm_plusplus.py with huggingface_hub
modeling_esm_plusplus.py  CHANGED  (+219 -101)
@@ -16,15 +16,16 @@ import torch.nn.functional as F
 from dataclasses import dataclass
 from functools import cache, partial
 from pathlib import Path
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple, Union, List, Callable, Dict
 from einops import rearrange, repeat
 from huggingface_hub import snapshot_download
 from tokenizers import Tokenizer
 from tokenizers.models import BPE
 from tokenizers.processors import TemplateProcessing
-from torch.utils.data import Dataset
+from torch.utils.data import Dataset as TorchDataset
+from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-from transformers import PreTrainedModel, PreTrainedTokenizerFast, PretrainedConfig
+from transformers import PreTrainedModel, PreTrainedTokenizerFast, PreTrainedTokenizerBase, PretrainedConfig
 from transformers.modeling_outputs import ModelOutput


@@ -501,8 +502,90 @@ class TransformerStack(nn.Module):
         )


-###
-class ProteinDataset(Dataset):
+### Support for embedding datasets with low code
+class Pooler:
+    def __init__(self, pooling_types: List[str]):
+        self.pooling_types = pooling_types
+        self.pooling_options = {
+            'mean': self.mean_pooling,
+            'max': self.max_pooling,
+            'min': self.min_pooling,
+            'norm': self.norm_pooling,
+            'prod': self.prod_pooling,
+            'median': self.median_pooling,
+            'std': self.std_pooling,
+            'var': self.var_pooling,
+            'cls': self.cls_pooling,
+        }
+
+    def mean_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        if attention_mask is None:
+            return emb.mean(dim=1)
+        else:
+            attention_mask = attention_mask.unsqueeze(-1)
+            return (emb * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)
+
+    def max_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        if attention_mask is None:
+            return emb.max(dim=1).values
+        else:
+            attention_mask = attention_mask.unsqueeze(-1)
+            return (emb * attention_mask).max(dim=1).values
+
+    def min_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        if attention_mask is None:
+            return emb.min(dim=1).values
+        else:
+            attention_mask = attention_mask.unsqueeze(-1)
+            return (emb * attention_mask).min(dim=1).values
+
+    def norm_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        if attention_mask is None:
+            return emb.norm(dim=1, p=2)
+        else:
+            attention_mask = attention_mask.unsqueeze(-1)
+            return (emb * attention_mask).norm(dim=1, p=2)
+
+    def prod_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        length = emb.shape[1]
+        if attention_mask is None:
+            return emb.prod(dim=1) / length
+        else:
+            attention_mask = attention_mask.unsqueeze(-1)
+            return ((emb * attention_mask).prod(dim=1) / attention_mask.sum(dim=1)) / length
+
+    def median_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        if attention_mask is None:
+            return emb.median(dim=1).values
+        else:
+            attention_mask = attention_mask.unsqueeze(-1)
+            return (emb * attention_mask).median(dim=1).values
+
+    def std_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        if attention_mask is None:
+            return emb.std(dim=1)
+        else:
+            attention_mask = attention_mask.unsqueeze(-1)
+            return (emb * attention_mask).std(dim=1)
+
+    def var_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        if attention_mask is None:
+            return emb.var(dim=1)
+        else:
+            attention_mask = attention_mask.unsqueeze(-1)
+            return (emb * attention_mask).var(dim=1)
+
+    def cls_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        return emb[:, 0, :]
+
+    def __call__(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # [mean, max]
+        final_emb = []
+        for pooling_type in self.pooling_types:
+            final_emb.append(self.pooling_options[pooling_type](emb, attention_mask)) # (b, d)
+        return torch.cat(final_emb, dim=-1) # (b, n_pooling_types * d)
+
+
+class ProteinDataset(TorchDataset):
     """Simple dataset for protein sequences."""
     def __init__(self, sequences: list[str]):
         self.sequences = sequences
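For reference, a minimal sketch of how the new Pooler combines several pooling types into one vector; the tensor shapes and the import path below are illustrative assumptions, not part of this commit.

    import torch
    from modeling_esm_plusplus import Pooler  # assumes this file is importable as a module

    # toy batch: 2 sequences, length 5, hidden size 8
    emb = torch.randn(2, 5, 8)
    attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                                   [1, 1, 1, 1, 1]])

    pooler = Pooler(['mean', 'cls'])      # each pooling type maps (b, L, d) -> (b, d)
    pooled = pooler(emb, attention_mask)  # results are concatenated on the last dimension
    print(pooled.shape)                   # torch.Size([2, 16]) == (b, n_pooling_types * d)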
@@ -514,68 +597,22 @@ class ProteinDataset(Dataset):
         return self.sequences[idx]


-class PreTrainedESMplusplusModel(PreTrainedModel):
-    """
-    init weights for ESM++ models
-    """
-    config_class = ESMplusplusConfig
-    base_model_prefix = "esm++"
-    supports_gradient_checkpointing = True
+def build_collator(tokenizer) -> Callable[[list[str]], tuple[torch.Tensor, torch.Tensor]]:
+    def _collate_fn(sequences: list[str]) -> tuple[torch.Tensor, torch.Tensor]:
+        """Collate function for batching sequences."""
+        return tokenizer(sequences, return_tensors="pt", padding='longest', pad_to_multiple_of=8)
+    return _collate_fn

-    def _init_weights(self, module):
-        """Initialize the weights"""
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-        elif isinstance(module, nn.LayerNorm):
-            if module.bias is not None:
-                module.bias.data.zero_()
-            module.weight.data.fill_(1.0)

-    @classmethod
-    def from_pretrained_esm(cls, model_name: str):
-        """Load a pretrained ESM++ model."""
-        if '300' in model_name:
-            return ESMplusplus_300M()
-        elif '600' in model_name:
-            return ESMplusplus_600M()
-        else:
-            raise ValueError(f"Invalid model name: {model_name}")
+class EmbeddingMixin:
+    def _embed(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        raise NotImplementedError

     @property
     def device(self) -> torch.device:
         """Get the device of the model."""
         return next(self.parameters()).device

-    def mean_pooling(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-        """Apply mean pooling to sequence outputs."""
-        if attention_mask is None:
-            return x.mean(dim=1)
-        else:
-            attention_mask = attention_mask.unsqueeze(-1)
-            return (x * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)
-
-    def max_pooling(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-        """Apply max pooling to sequence outputs."""
-        if attention_mask is None:
-            return x.max(dim=1).values
-        else:
-            attention_mask = attention_mask.unsqueeze(-1)
-            return (x * attention_mask).max(dim=1).values
-
-    def cls_pooling(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-        """Apply cls pooling to sequence outputs."""
-        return x[:, 0, :]
-
-    def _collate_fn(self, sequences: list[str]) -> tuple[torch.Tensor, torch.Tensor]:
-        """Collate function for batching sequences."""
-        return self.tokenizer(sequences, return_tensors="pt", padding='longest', pad_to_multiple_of=8)
-
     def _read_sequences_from_db(self, db_path: str) -> set[str]:
         """Read sequences from SQLite database."""
         import sqlite3
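To make the contract of the new EmbeddingMixin concrete, here is a toy sketch: the ToyEmbedder class below is hypothetical and only shows that a subclass has to supply `_embed` returning (batch, length, hidden) states, while `device` and `embed_dataset` come from the mixin.

    import torch
    import torch.nn as nn
    from modeling_esm_plusplus import EmbeddingMixin  # assumes this file is importable as a module

    class ToyEmbedder(EmbeddingMixin, nn.Module):  # hypothetical example, not part of the commit
        def __init__(self, vocab_size: int = 33, hidden_size: int = 16):
            super().__init__()
            self.embed = nn.Embedding(vocab_size, hidden_size)

        def _embed(self, input_ids, attention_mask=None):
            # real ESM++ classes run the transformer stack here; the mixin only needs (b, L, d) back
            return self.embed(input_ids)

    model = ToyEmbedder()
    ids = torch.randint(0, 33, (2, 7))
    print(model._embed(ids).shape)  # torch.Size([2, 7, 16])
    print(model.device)             # provided by the mixin via next(self.parameters()).device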
@@ -592,15 +629,18 @@ class PreTrainedESMplusplusModel(PreTrainedModel):

     def embed_dataset(
         self,
-        sequences:
+        sequences: List[str],
+        tokenizer: PreTrainedTokenizerBase,
         batch_size: int = 2,
         max_len: int = 512,
         full_embeddings: bool = False,
-
-
+        embed_dtype: torch.dtype = torch.float32,
+        pooling_types: List[str] = ['mean'],
         num_workers: int = 0,
         sql: bool = False,
+        save: bool = True,
         sql_db_path: str = 'embeddings.db',
+        save_path: str = 'embeddings.pth',
     ) -> Optional[dict[str, torch.Tensor]]:
         """Embed a dataset of protein sequences.

@@ -609,7 +649,6 @@ class PreTrainedESMplusplusModel(PreTrainedModel):
             batch_size: Batch size for processing
             max_len: Maximum sequence length
             full_embeddings: Whether to return full residue-wise (True) embeddings or pooled (False)
-            full_precision: Whether to cast to full precision (float32) before storage - relevant for dict storage
             pooling_type: Type of pooling ('mean' or 'cls')
             num_workers: Number of workers for data loading, 0 for the main process
             sql: Whether to store embeddings in SQLite database - will be stored in float32
@@ -617,23 +656,46 @@ class PreTrainedESMplusplusModel(PreTrainedModel):

         Returns:
             Dictionary mapping sequences to embeddings, or None if sql=True
+
+        Note:
+            - If sql=True, embeddings can only be stored in float32
+            - sql is ideal if you need to stream a very large dataset for training in real-time
+            - save=True is ideal if you can store the entire embedding dictionary in RAM
+            - sql will be used if it is True and save is True or False
+            - If your sql database or .pth file is already present, they will be scanned first for already embedded sequences
+            - Sequences will be truncated to max_len and sorted by length in descending order for faster processing
+
+        Example:
+            >>> embedder = EmbeddingMixin()
+            >>> embedding_dict = embedder.embed_dataset(
+                sequences=[
+                    'MALWMRLLPLLALLALWGPDPAAA', ... # list of protein sequences
+                ],
+                batch_size=2, # adjust for your GPU memory
+                max_len=512, # adjust for your needs
+                full_embeddings=False, # if True, no pooling is performed
+                embed_dtype=torch.float32, # cast to what dtype you want
+                pooling_type=['mean', 'cls'], # more than one pooling type will be concatenated together
+                num_workers=0, # if you have many cpu cores, we find that num_workers = 4 is fast for large datasets
+                sql=False, # if True, embeddings will be stored in SQLite database
+                sql_db_path='embeddings.db',
+                save=True, # if True, embeddings will be saved as a .pth file
+                save_path='embeddings.pth',
+            )
+            >>> # embedding_dict is a dictionary mapping sequences to their embeddings as tensors for .pth or numpy arrays for sql
         """
         sequences = list(set([seq[:max_len] for seq in sequences]))
+        sequences = sorted(sequences, key=len, reverse=True)
+        collate_fn = build_collator(tokenizer)
         device = self.device
+        pooler = Pooler(pooling_types) if not full_embeddings else None

         def get_embeddings(residue_embeddings: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-            if full_embeddings:
+            if full_embeddings or residue_embeddings.ndim == 2: # if already pooled or want residue-wise embeddings
                 return residue_embeddings
-            elif pooling_type == 'mean':
-                return self.mean_pooling(residue_embeddings, attention_mask)
-            elif pooling_type == 'max':
-                return self.max_pooling(residue_embeddings, attention_mask)
-            elif pooling_type == 'cls':
-                return self.cls_pooling(residue_embeddings, attention_mask)
             else:
-
+                return pooler(residue_embeddings, attention_mask)

-        sequences = list(set([seq[:max_len] for seq in sequences]))
         if sql:
             import sqlite3
             conn = sqlite3.connect(sql_db_path)
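Since embed_dataset now takes the tokenizer explicitly, a call against a loaded checkpoint looks roughly like the sketch below; the checkpoint id, its auto_map to the ESM++ classes, and the `model.tokenizer` attribute are assumptions used only for illustration.

    from transformers import AutoModelForMaskedLM

    model = AutoModelForMaskedLM.from_pretrained('Synthyra/ESMplusplus_small', trust_remote_code=True)

    embedding_dict = model.embed_dataset(
        sequences=['MALWMRLLPLLALLALWGPDPAAA', 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ'],
        tokenizer=model.tokenizer,   # assumption: the checkpoint attaches its tokenizer to the model
        batch_size=2,
        max_len=512,
        full_embeddings=False,
        pooling_types=['mean'],
        sql=False,
        save=False,                  # keep the dictionary in memory instead of writing embeddings.pth
    )
    print(len(embedding_dict))       # one pooled tensor per unique (truncated) sequence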
@@ -644,17 +706,14 @@ class PreTrainedESMplusplusModel(PreTrainedModel):
             print(f"Found {len(already_embedded)} already embedded sequences in {sql_db_path}")
             print(f"Embedding {len(to_embed)} new sequences")
             if len(to_embed) > 0:
-                to_embed = sorted(to_embed, key=len, reverse=True)
                 dataset = ProteinDataset(to_embed)
-                dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, collate_fn=
+                dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, collate_fn=collate_fn, shuffle=False)
                 with torch.no_grad():
                     for i, batch in tqdm(enumerate(dataloader), total=len(dataloader), desc='Embedding batches'):
                         seqs = to_embed[i * batch_size:(i + 1) * batch_size]
                         input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
-
-
-                        embeddings = get_embeddings(residue_embeddings, attention_mask)
-
+                        residue_embeddings = self._embed(input_ids, attention_mask).float() # sql requires float32
+                        embeddings = get_embeddings(residue_embeddings, attention_mask).cpu()
                         for seq, emb, mask in zip(seqs, embeddings, attention_mask):
                             if full_embeddings:
                                 emb = emb[mask.bool()]
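The batching plumbing used above (ProteinDataset, build_collator, DataLoader) can be exercised on its own; a small sketch, assuming the module is importable and an ESM-style tokenizer is available (the tokenizer id below is just one example):

    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer
    from modeling_esm_plusplus import ProteinDataset, build_collator

    tokenizer = AutoTokenizer.from_pretrained('facebook/esm2_t6_8M_UR50D')

    sequences = ['MKTAYIAKQR', 'MALWMRLLPL', 'MEEPQSDPSV']
    dataloader = DataLoader(
        ProteinDataset(sequences),
        batch_size=2,
        collate_fn=build_collator(tokenizer),
        shuffle=False,
    )
    for batch in dataloader:
        # padding='longest' with pad_to_multiple_of=8 pads each batch to a multiple of 8 tokens
        print(batch['input_ids'].shape, batch['attention_mask'].shape)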
@@ -669,32 +728,75 @@ class PreTrainedESMplusplusModel(PreTrainedModel):
             return None

         embeddings_dict = {}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if os.path.exists(save_path):
+            embeddings_dict = torch.load(save_path, map_location='cpu', weights_only=True)
+            to_embed = [seq for seq in sequences if seq not in embeddings_dict]
+            print(f"Found {len(embeddings_dict)} already embedded sequences in {save_path}")
+            print(f"Embedding {len(to_embed)} new sequences")
+        else:
+            to_embed = sequences
+            print(f"Embedding {len(to_embed)} new sequences")
+
+        if len(to_embed) > 0:
+            dataset = ProteinDataset(to_embed)
+            dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, collate_fn=collate_fn, shuffle=False)
+            with torch.no_grad():
+                for i, batch in tqdm(enumerate(dataloader), total=len(dataloader), desc='Embedding batches'):
+                    seqs = to_embed[i * batch_size:(i + 1) * batch_size]
+                    input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
+                    residue_embeddings = self._embed(input_ids, attention_mask)
+                    embeddings = get_embeddings(residue_embeddings, attention_mask).to(embed_dtype).cpu()
+                    for seq, emb in zip(seqs, embeddings):
+                        embeddings_dict[seq] = emb
+
+        if save:
+            torch.save(embeddings_dict, save_path)
+
         return embeddings_dict


+class PreTrainedESMplusplusModel(PreTrainedModel):
+    """
+    init weights for ESM++ models
+    """
+    config_class = ESMplusplusConfig
+    base_model_prefix = "esm++"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            if module.bias is not None:
+                module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+    @classmethod
+    def from_pretrained_esm(cls, model_name: str):
+        """Load a pretrained ESM++ model."""
+        if '300' in model_name:
+            return ESMplusplus_300M()
+        elif '600' in model_name:
+            return ESMplusplus_600M()
+        else:
+            raise ValueError(f"Invalid model name: {model_name}")
+
+
 ### ESM++ Models
-class ESMplusplusModel(PreTrainedESMplusplusModel):
+class ESMplusplusModel(PreTrainedESMplusplusModel, EmbeddingMixin):
     """
     ESM++ model. transformer model with no heads
     """
     config_class = ESMplusplusConfig
     def __init__(self, config: ESMplusplusConfig, **kwargs):
-        super().__init__(config, **kwargs)
+        super(PreTrainedESMplusplusModel, self).__init__(config, **kwargs)
         self.config = config
         self.vocab_size = config.vocab_size
         self.embed = nn.Embedding(self.vocab_size, config.hidden_size)
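The resume logic added above only re-embeds sequences missing from an existing .pth file; the same check can be reproduced outside the model, assuming a previous embed_dataset run already wrote embeddings.pth:

    import torch

    embeddings_dict = torch.load('embeddings.pth', map_location='cpu', weights_only=True)
    sequences = ['MKTAYIAKQR', 'MALWMRLLPL']
    to_embed = [seq for seq in sequences if seq not in embeddings_dict]
    print(f"{len(embeddings_dict)} cached, {len(to_embed)} still to embed")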
@@ -708,6 +810,10 @@ class ESMplusplusModel(PreTrainedESMplusplusModel):
     def set_input_embeddings(self, value):
         self.embed = value

+    def _embed(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        x = self.embed(input_ids)
+        return self.transformer(x, attention_mask, output_hidden_states=False, output_attentions=False).last_hidden_state
+
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
@@ -736,14 +842,14 @@ class ESMplusplusModel(PreTrainedESMplusplusModel):
         return self.transformer(x, attention_mask, output_hidden_states, output_attentions)


-class ESMplusplusForMaskedLM(PreTrainedESMplusplusModel):
+class ESMplusplusForMaskedLM(PreTrainedESMplusplusModel, EmbeddingMixin):
     """
     ESM++ model for masked language modeling.
     Implements the base ESM++ architecture with a masked language modeling head.
     """
     config_class = ESMplusplusConfig
     def __init__(self, config: ESMplusplusConfig, **kwargs):
-        super().__init__(config, **kwargs)
+        super(PreTrainedESMplusplusModel, self).__init__(config, **kwargs)
         self.config = config
         self.vocab_size = config.vocab_size
         self.embed = nn.Embedding(self.vocab_size, config.hidden_size)
@@ -765,6 +871,10 @@ class ESMplusplusForMaskedLM(PreTrainedESMplusplusModel):
     def set_output_embeddings(self, new_embeddings):
         self.sequence_head[-1] = new_embeddings

+    def _embed(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        x = self.embed(input_ids)
+        return self.transformer(x, attention_mask, output_hidden_states=False, output_attentions=False).last_hidden_state
+
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
@@ -807,13 +917,13 @@ class ESMplusplusForMaskedLM(PreTrainedESMplusplusModel):
         )


-class ESMplusplusForSequenceClassification(ESMplusplusForMaskedLM):
+class ESMplusplusForSequenceClassification(ESMplusplusForMaskedLM, EmbeddingMixin):
     """
     ESM++ model for sequence classification.
     Extends the base ESM++ model with a classification head.
     """
     def __init__(self, config: ESMplusplusConfig, **kwargs):
-        super().__init__(config, **kwargs)
+        super(ESMplusplusForMaskedLM, self).__init__(config, **kwargs)
         self.config = config
         self.num_labels = config.num_labels
         self.classifier = RegressionHead(config.hidden_size * 2, config.num_labels, config.hidden_size * 4)
@@ -823,6 +933,10 @@ class ESMplusplusForSequenceClassification(ESMplusplusForMaskedLM):
         self.bce = nn.BCEWithLogitsLoss()
         self.init_weights()

+    def _embed(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        x = self.embed(input_ids)
+        return self.transformer(x, attention_mask, output_hidden_states=False, output_attentions=False).last_hidden_state
+
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
@@ -888,13 +1002,13 @@ class ESMplusplusForSequenceClassification(ESMplusplusForMaskedLM):
         )


-class ESMplusplusForTokenClassification(ESMplusplusForMaskedLM):
+class ESMplusplusForTokenClassification(ESMplusplusForMaskedLM, EmbeddingMixin):
     """
     ESM++ model for token classification.
     Extends the base ESM++ model with a token classification head.
     """
     def __init__(self, config: ESMplusplusConfig):
-        super().__init__(config)
+        super(ESMplusplusForMaskedLM, self).__init__(config)
         self.config = config
         self.num_labels = config.num_labels
         self.classifier = RegressionHead(config.hidden_size, config.num_labels, config.hidden_size * 4)
@@ -902,6 +1016,10 @@ class ESMplusplusForTokenClassification(ESMplusplusForMaskedLM):
         self.loss_fct = nn.CrossEntropyLoss()
         self.init_weights()

+    def _embed(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        x = self.embed(input_ids)
+        return self.transformer(x, attention_mask, output_hidden_states=False, output_attentions=False).last_hidden_state
+
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
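Every task head now implements the same `_embed` hook, so residue-level states come from one code path whether the model carries an LM, sequence-classification, or token-classification head. A short sketch; the checkpoint id, `num_labels`, and the `model.tokenizer` attribute are assumptions for illustration:

    import torch
    from transformers import AutoModelForSequenceClassification

    model = AutoModelForSequenceClassification.from_pretrained(
        'Synthyra/ESMplusplus_small', num_labels=2, trust_remote_code=True
    )
    batch = model.tokenizer(['MKTAYIAKQR'], return_tensors='pt')

    with torch.no_grad():
        residue_states = model._embed(batch['input_ids'], batch['attention_mask'])
    print(residue_states.shape)  # (batch, seq_len, hidden_size) == transformer(...).last_hidden_state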