Upload modeling_esm_plusplus.py with huggingface_hub
modeling_esm_plusplus.py  CHANGED  (+219 -101)
@@ -16,15 +16,16 @@ import torch.nn.functional as F
 from dataclasses import dataclass
 from functools import cache, partial
 from pathlib import Path
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple, Union, List, Callable, Dict
 from einops import rearrange, repeat
 from huggingface_hub import snapshot_download
 from tokenizers import Tokenizer
 from tokenizers.models import BPE
 from tokenizers.processors import TemplateProcessing
-from torch.utils.data import Dataset
+from torch.utils.data import Dataset as TorchDataset
+from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-from transformers import PreTrainedModel, PreTrainedTokenizerFast, PretrainedConfig
+from transformers import PreTrainedModel, PreTrainedTokenizerFast, PreTrainedTokenizerBase, PretrainedConfig
 from transformers.modeling_outputs import ModelOutput


@@ -501,8 +502,90 @@ class TransformerStack(nn.Module):
         )


-###
-class ProteinDataset(Dataset):
+### Support for embedding datasets with low code
+class Pooler:
+    def __init__(self, pooling_types: List[str]):
+        self.pooling_types = pooling_types
+        self.pooling_options = {
+            'mean': self.mean_pooling,
+            'max': self.max_pooling,
+            'min': self.min_pooling,
+            'norm': self.norm_pooling,
+            'prod': self.prod_pooling,
+            'median': self.median_pooling,
+            'std': self.std_pooling,
+            'var': self.var_pooling,
+            'cls': self.cls_pooling,
+        }
+
+    def mean_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        if attention_mask is None:
+            return emb.mean(dim=1)
+        else:
+            attention_mask = attention_mask.unsqueeze(-1)
+            return (emb * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)
+
+    def max_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        if attention_mask is None:
+            return emb.max(dim=1).values
+        else:
+            attention_mask = attention_mask.unsqueeze(-1)
+            return (emb * attention_mask).max(dim=1).values
+
+    def min_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        if attention_mask is None:
+            return emb.min(dim=1).values
+        else:
+            attention_mask = attention_mask.unsqueeze(-1)
+            return (emb * attention_mask).min(dim=1).values
+
+    def norm_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        if attention_mask is None:
+            return emb.norm(dim=1, p=2)
+        else:
+            attention_mask = attention_mask.unsqueeze(-1)
+            return (emb * attention_mask).norm(dim=1, p=2)
+
+    def prod_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        length = emb.shape[1]
+        if attention_mask is None:
+            return emb.prod(dim=1) / length
+        else:
+            attention_mask = attention_mask.unsqueeze(-1)
+            return ((emb * attention_mask).prod(dim=1) / attention_mask.sum(dim=1)) / length
+
+    def median_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        if attention_mask is None:
+            return emb.median(dim=1).values
+        else:
+            attention_mask = attention_mask.unsqueeze(-1)
+            return (emb * attention_mask).median(dim=1).values
+
+    def std_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        if attention_mask is None:
+            return emb.std(dim=1)
+        else:
+            attention_mask = attention_mask.unsqueeze(-1)
+            return (emb * attention_mask).std(dim=1)
+
+    def var_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        if attention_mask is None:
+            return emb.var(dim=1)
+        else:
+            attention_mask = attention_mask.unsqueeze(-1)
+            return (emb * attention_mask).var(dim=1)
+
+    def cls_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
+        return emb[:, 0, :]
+
+    def __call__(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # [mean, max]
+        final_emb = []
+        for pooling_type in self.pooling_types:
+            final_emb.append(self.pooling_options[pooling_type](emb, attention_mask)) # (b, d)
+        return torch.cat(final_emb, dim=-1) # (b, n_pooling_types * d)
+
+
+class ProteinDataset(TorchDataset):
     """Simple dataset for protein sequences."""
     def __init__(self, sequences: list[str]):
         self.sequences = sequences
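For reference, a minimal sketch of how the new Pooler combines several pooling types into one vector; the tensor shapes and the import path below are illustrative assumptions, not part of this commit.

    import torch
    from modeling_esm_plusplus import Pooler  # assumes this file is importable as a module

    # toy batch: 2 sequences, length 5, hidden size 8
    emb = torch.randn(2, 5, 8)
    attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                                   [1, 1, 1, 1, 1]])

    pooler = Pooler(['mean', 'cls'])      # each pooling type maps (b, L, d) -> (b, d)
    pooled = pooler(emb, attention_mask)  # results are concatenated on the last dimension
    print(pooled.shape)                   # torch.Size([2, 16]) == (b, n_pooling_types * d)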
@@ -514,68 +597,22 @@ class ProteinDataset(Dataset):
         return self.sequences[idx]


-class PreTrainedESMplusplusModel(PreTrainedModel):
-    """
-    init weights for ESM++ models
-    """
-    config_class = ESMplusplusConfig
-    base_model_prefix = "esm++"
-    supports_gradient_checkpointing = True
+def build_collator(tokenizer) -> Callable[[list[str]], tuple[torch.Tensor, torch.Tensor]]:
+    def _collate_fn(sequences: list[str]) -> tuple[torch.Tensor, torch.Tensor]:
+        """Collate function for batching sequences."""
+        return tokenizer(sequences, return_tensors="pt", padding='longest', pad_to_multiple_of=8)
+    return _collate_fn

-    def _init_weights(self, module):
-        """Initialize the weights"""
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-        elif isinstance(module, nn.LayerNorm):
-            if module.bias is not None:
-                module.bias.data.zero_()
-            module.weight.data.fill_(1.0)

-    @classmethod
-    def from_pretrained_esm(cls, model_name: str):
-        """Load a pretrained ESM++ model."""
-        if '300' in model_name:
-            return ESMplusplus_300M()
-        elif '600' in model_name:
-            return ESMplusplus_600M()
-        else:
-            raise ValueError(f"Invalid model name: {model_name}")
+class EmbeddingMixin:
+    def _embed(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        raise NotImplementedError

     @property
     def device(self) -> torch.device:
         """Get the device of the model."""
         return next(self.parameters()).device

-    def mean_pooling(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-        """Apply mean pooling to sequence outputs."""
-        if attention_mask is None:
-            return x.mean(dim=1)
-        else:
-            attention_mask = attention_mask.unsqueeze(-1)
-            return (x * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)
-
-    def max_pooling(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-        """Apply max pooling to sequence outputs."""
-        if attention_mask is None:
-            return x.max(dim=1).values
-        else:
-            attention_mask = attention_mask.unsqueeze(-1)
-            return (x * attention_mask).max(dim=1).values
-
-    def cls_pooling(self, x: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-        """Apply cls pooling to sequence outputs."""
-        return x[:, 0, :]
-
-    def _collate_fn(self, sequences: list[str]) -> tuple[torch.Tensor, torch.Tensor]:
-        """Collate function for batching sequences."""
-        return self.tokenizer(sequences, return_tensors="pt", padding='longest', pad_to_multiple_of=8)
-
     def _read_sequences_from_db(self, db_path: str) -> set[str]:
         """Read sequences from SQLite database."""
         import sqlite3
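To make the contract of the new EmbeddingMixin concrete, here is a toy sketch: the ToyEmbedder class below is hypothetical and only shows that a subclass has to supply `_embed` returning (batch, length, hidden) states, while `device` and `embed_dataset` come from the mixin.

    import torch
    import torch.nn as nn
    from modeling_esm_plusplus import EmbeddingMixin  # assumes this file is importable as a module

    class ToyEmbedder(EmbeddingMixin, nn.Module):  # hypothetical example, not part of the commit
        def __init__(self, vocab_size: int = 33, hidden_size: int = 16):
            super().__init__()
            self.embed = nn.Embedding(vocab_size, hidden_size)

        def _embed(self, input_ids, attention_mask=None):
            # real ESM++ classes run the transformer stack here; the mixin only needs (b, L, d) back
            return self.embed(input_ids)

    model = ToyEmbedder()
    ids = torch.randint(0, 33, (2, 7))
    print(model._embed(ids).shape)  # torch.Size([2, 7, 16])
    print(model.device)             # provided by the mixin via next(self.parameters()).device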
@@ -592,15 +629,18 @@ class PreTrainedESMplusplusModel(PreTrainedModel):

     def embed_dataset(
         self,
-        sequences:
+        sequences: List[str],
+        tokenizer: PreTrainedTokenizerBase,
         batch_size: int = 2,
         max_len: int = 512,
         full_embeddings: bool = False,
-
-
+        embed_dtype: torch.dtype = torch.float32,
+        pooling_types: List[str] = ['mean'],
         num_workers: int = 0,
         sql: bool = False,
+        save: bool = True,
         sql_db_path: str = 'embeddings.db',
+        save_path: str = 'embeddings.pth',
     ) -> Optional[dict[str, torch.Tensor]]:
         """Embed a dataset of protein sequences.

@@ -609,7 +649,6 @@ class PreTrainedESMplusplusModel(PreTrainedModel):
             batch_size: Batch size for processing
             max_len: Maximum sequence length
             full_embeddings: Whether to return full residue-wise (True) embeddings or pooled (False)
-            full_precision: Whether to cast to full precision (float32) before storage - relevant for dict storage
             pooling_type: Type of pooling ('mean' or 'cls')
             num_workers: Number of workers for data loading, 0 for the main process
             sql: Whether to store embeddings in SQLite database - will be stored in float32
@@ -617,23 +656,46 @@ class PreTrainedESMplusplusModel(PreTrainedModel):

         Returns:
             Dictionary mapping sequences to embeddings, or None if sql=True
+
+        Note:
+            - If sql=True, embeddings can only be stored in float32
+            - sql is ideal if you need to stream a very large dataset for training in real-time
+            - save=True is ideal if you can store the entire embedding dictionary in RAM
+            - sql will be used if it is True and save is True or False
+            - If your sql database or .pth file is already present, they will be scanned first for already embedded sequences
+            - Sequences will be truncated to max_len and sorted by length in descending order for faster processing
+
+        Example:
+            >>> embedder = EmbeddingMixin()
+            >>> embedding_dict = embedder.embed_dataset(
+                sequences=[
+                    'MALWMRLLPLLALLALWGPDPAAA', ... # list of protein sequences
+                ],
+                batch_size=2, # adjust for your GPU memory
+                max_len=512, # adjust for your needs
+                full_embeddings=False, # if True, no pooling is performed
+                embed_dtype=torch.float32, # cast to what dtype you want
+                pooling_type=['mean', 'cls'], # more than one pooling type will be concatenated together
+                num_workers=0, # if you have many cpu cores, we find that num_workers = 4 is fast for large datasets
+                sql=False, # if True, embeddings will be stored in SQLite database
+                sql_db_path='embeddings.db',
+                save=True, # if True, embeddings will be saved as a .pth file
+                save_path='embeddings.pth',
+            )
+            >>> # embedding_dict is a dictionary mapping sequences to their embeddings as tensors for .pth or numpy arrays for sql
         """
         sequences = list(set([seq[:max_len] for seq in sequences]))
+        sequences = sorted(sequences, key=len, reverse=True)
+        collate_fn = build_collator(tokenizer)
         device = self.device
+        pooler = Pooler(pooling_types) if not full_embeddings else None

         def get_embeddings(residue_embeddings: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-            if full_embeddings:
+            if full_embeddings or residue_embeddings.ndim == 2: # if already pooled or want residue-wise embeddings
                 return residue_embeddings
-            elif pooling_type == 'mean':
-                return self.mean_pooling(residue_embeddings, attention_mask)
-            elif pooling_type == 'max':
-                return self.max_pooling(residue_embeddings, attention_mask)
-            elif pooling_type == 'cls':
-                return self.cls_pooling(residue_embeddings, attention_mask)
             else:
-
+                return pooler(residue_embeddings, attention_mask)

-        sequences = list(set([seq[:max_len] for seq in sequences]))
         if sql:
             import sqlite3
             conn = sqlite3.connect(sql_db_path)
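Since embed_dataset now takes the tokenizer explicitly, a call against a loaded checkpoint looks roughly like the sketch below; the checkpoint id, its auto_map to the ESM++ classes, and the `model.tokenizer` attribute are assumptions used only for illustration.

    from transformers import AutoModelForMaskedLM

    model = AutoModelForMaskedLM.from_pretrained('Synthyra/ESMplusplus_small', trust_remote_code=True)

    embedding_dict = model.embed_dataset(
        sequences=['MALWMRLLPLLALLALWGPDPAAA', 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ'],
        tokenizer=model.tokenizer,   # assumption: the checkpoint attaches its tokenizer to the model
        batch_size=2,
        max_len=512,
        full_embeddings=False,
        pooling_types=['mean'],
        sql=False,
        save=False,                  # keep the dictionary in memory instead of writing embeddings.pth
    )
    print(len(embedding_dict))       # one pooled tensor per unique (truncated) sequence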
@@ -644,17 +706,14 @@ class PreTrainedESMplusplusModel(PreTrainedModel):
             print(f"Found {len(already_embedded)} already embedded sequences in {sql_db_path}")
             print(f"Embedding {len(to_embed)} new sequences")
             if len(to_embed) > 0:
-                to_embed = sorted(to_embed, key=len, reverse=True)
                 dataset = ProteinDataset(to_embed)
-                dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, collate_fn=
+                dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, collate_fn=collate_fn, shuffle=False)
                 with torch.no_grad():
                     for i, batch in tqdm(enumerate(dataloader), total=len(dataloader), desc='Embedding batches'):
                         seqs = to_embed[i * batch_size:(i + 1) * batch_size]
                         input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
-
-
-                        embeddings = get_embeddings(residue_embeddings, attention_mask)
-
+                        residue_embeddings = self._embed(input_ids, attention_mask).float() # sql requires float32
+                        embeddings = get_embeddings(residue_embeddings, attention_mask).cpu()
                         for seq, emb, mask in zip(seqs, embeddings, attention_mask):
                             if full_embeddings:
                                 emb = emb[mask.bool()]
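The batching plumbing used above (ProteinDataset, build_collator, DataLoader) can be exercised on its own; a small sketch, assuming the module is importable and an ESM-style tokenizer is available (the tokenizer id below is just one example):

    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer
    from modeling_esm_plusplus import ProteinDataset, build_collator

    tokenizer = AutoTokenizer.from_pretrained('facebook/esm2_t6_8M_UR50D')

    sequences = ['MKTAYIAKQR', 'MALWMRLLPL', 'MEEPQSDPSV']
    dataloader = DataLoader(
        ProteinDataset(sequences),
        batch_size=2,
        collate_fn=build_collator(tokenizer),
        shuffle=False,
    )
    for batch in dataloader:
        # padding='longest' with pad_to_multiple_of=8 pads each batch to a multiple of 8 tokens
        print(batch['input_ids'].shape, batch['attention_mask'].shape)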
@@ -669,32 +728,75 @@ class PreTrainedESMplusplusModel(PreTrainedModel):
             return None

         embeddings_dict = {}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if os.path.exists(save_path):
+            embeddings_dict = torch.load(save_path, map_location='cpu', weights_only=True)
+            to_embed = [seq for seq in sequences if seq not in embeddings_dict]
+            print(f"Found {len(embeddings_dict)} already embedded sequences in {save_path}")
+            print(f"Embedding {len(to_embed)} new sequences")
+        else:
+            to_embed = sequences
+            print(f"Embedding {len(to_embed)} new sequences")
+
+        if len(to_embed) > 0:
+            dataset = ProteinDataset(to_embed)
+            dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, collate_fn=collate_fn, shuffle=False)
+            with torch.no_grad():
+                for i, batch in tqdm(enumerate(dataloader), total=len(dataloader), desc='Embedding batches'):
+                    seqs = to_embed[i * batch_size:(i + 1) * batch_size]
+                    input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
+                    residue_embeddings = self._embed(input_ids, attention_mask)
+                    embeddings = get_embeddings(residue_embeddings, attention_mask).to(embed_dtype).cpu()
+                    for seq, emb in zip(seqs, embeddings):
+                        embeddings_dict[seq] = emb
+
+        if save:
+            torch.save(embeddings_dict, save_path)
+
         return embeddings_dict


+class PreTrainedESMplusplusModel(PreTrainedModel):
+    """
+    init weights for ESM++ models
+    """
+    config_class = ESMplusplusConfig
+    base_model_prefix = "esm++"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            if module.bias is not None:
+                module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+    @classmethod
+    def from_pretrained_esm(cls, model_name: str):
+        """Load a pretrained ESM++ model."""
+        if '300' in model_name:
+            return ESMplusplus_300M()
+        elif '600' in model_name:
+            return ESMplusplus_600M()
+        else:
+            raise ValueError(f"Invalid model name: {model_name}")
+
+
 ### ESM++ Models
-class ESMplusplusModel(PreTrainedESMplusplusModel):
+class ESMplusplusModel(PreTrainedESMplusplusModel, EmbeddingMixin):
     """
     ESM++ model. transformer model with no heads
     """
     config_class = ESMplusplusConfig
     def __init__(self, config: ESMplusplusConfig, **kwargs):
-        super().__init__(config, **kwargs)
+        super(PreTrainedESMplusplusModel, self).__init__(config, **kwargs)
         self.config = config
         self.vocab_size = config.vocab_size
         self.embed = nn.Embedding(self.vocab_size, config.hidden_size)
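The resume logic added above only re-embeds sequences missing from an existing .pth file; the same check can be reproduced outside the model, assuming a previous embed_dataset run already wrote embeddings.pth:

    import torch

    embeddings_dict = torch.load('embeddings.pth', map_location='cpu', weights_only=True)
    sequences = ['MKTAYIAKQR', 'MALWMRLLPL']
    to_embed = [seq for seq in sequences if seq not in embeddings_dict]
    print(f"{len(embeddings_dict)} cached, {len(to_embed)} still to embed")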
@@ -708,6 +810,10 @@ class ESMplusplusModel(PreTrainedESMplusplusModel):
     def set_input_embeddings(self, value):
         self.embed = value

+    def _embed(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        x = self.embed(input_ids)
+        return self.transformer(x, attention_mask, output_hidden_states=False, output_attentions=False).last_hidden_state
+
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
@@ -736,14 +842,14 @@ class ESMplusplusModel(PreTrainedESMplusplusModel):
         return self.transformer(x, attention_mask, output_hidden_states, output_attentions)


-class ESMplusplusForMaskedLM(PreTrainedESMplusplusModel):
+class ESMplusplusForMaskedLM(PreTrainedESMplusplusModel, EmbeddingMixin):
     """
     ESM++ model for masked language modeling.
     Implements the base ESM++ architecture with a masked language modeling head.
     """
     config_class = ESMplusplusConfig
     def __init__(self, config: ESMplusplusConfig, **kwargs):
-        super().__init__(config, **kwargs)
+        super(PreTrainedESMplusplusModel, self).__init__(config, **kwargs)
         self.config = config
         self.vocab_size = config.vocab_size
         self.embed = nn.Embedding(self.vocab_size, config.hidden_size)
@@ -765,6 +871,10 @@ class ESMplusplusForMaskedLM(PreTrainedESMplusplusModel):
     def set_output_embeddings(self, new_embeddings):
         self.sequence_head[-1] = new_embeddings

+    def _embed(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        x = self.embed(input_ids)
+        return self.transformer(x, attention_mask, output_hidden_states=False, output_attentions=False).last_hidden_state
+
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
@@ -807,13 +917,13 @@ class ESMplusplusForMaskedLM(PreTrainedESMplusplusModel):
         )


-class ESMplusplusForSequenceClassification(ESMplusplusForMaskedLM):
+class ESMplusplusForSequenceClassification(ESMplusplusForMaskedLM, EmbeddingMixin):
     """
     ESM++ model for sequence classification.
     Extends the base ESM++ model with a classification head.
     """
     def __init__(self, config: ESMplusplusConfig, **kwargs):
-        super().__init__(config, **kwargs)
+        super(ESMplusplusForMaskedLM, self).__init__(config, **kwargs)
         self.config = config
         self.num_labels = config.num_labels
         self.classifier = RegressionHead(config.hidden_size * 2, config.num_labels, config.hidden_size * 4)
@@ -823,6 +933,10 @@ class ESMplusplusForSequenceClassification(ESMplusplusForMaskedLM):
         self.bce = nn.BCEWithLogitsLoss()
         self.init_weights()

+    def _embed(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        x = self.embed(input_ids)
+        return self.transformer(x, attention_mask, output_hidden_states=False, output_attentions=False).last_hidden_state
+
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
@@ -888,13 +1002,13 @@ class ESMplusplusForSequenceClassification(ESMplusplusForMaskedLM):
         )


-class ESMplusplusForTokenClassification(ESMplusplusForMaskedLM):
+class ESMplusplusForTokenClassification(ESMplusplusForMaskedLM, EmbeddingMixin):
     """
     ESM++ model for token classification.
     Extends the base ESM++ model with a token classification head.
     """
     def __init__(self, config: ESMplusplusConfig):
-        super().__init__(config)
+        super(ESMplusplusForMaskedLM, self).__init__(config)
         self.config = config
         self.num_labels = config.num_labels
         self.classifier = RegressionHead(config.hidden_size, config.num_labels, config.hidden_size * 4)
@@ -902,6 +1016,10 @@ class ESMplusplusForTokenClassification(ESMplusplusForMaskedLM):
         self.loss_fct = nn.CrossEntropyLoss()
         self.init_weights()

+    def _embed(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        x = self.embed(input_ids)
+        return self.transformer(x, attention_mask, output_hidden_states=False, output_attentions=False).last_hidden_state
+
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
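Every task head now implements the same `_embed` hook, so residue-level states come from one code path whether the model carries an LM, sequence-classification, or token-classification head. A short sketch; the checkpoint id, `num_labels`, and the `model.tokenizer` attribute are assumptions for illustration:

    import torch
    from transformers import AutoModelForSequenceClassification

    model = AutoModelForSequenceClassification.from_pretrained(
        'Synthyra/ESMplusplus_small', num_labels=2, trust_remote_code=True
    )
    batch = model.tokenizer(['MKTAYIAKQR'], return_tensors='pt')

    with torch.no_grad():
        residue_states = model._embed(batch['input_ids'], batch['attention_mask'])
    print(residue_states.shape)  # (batch, seq_len, hidden_size) == transformer(...).last_hidden_state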