File size: 7,403 Bytes
555e708 3b48f52 555e708 b72e236 c911855 59de2c5 4a15c97 59de2c5 4a15c97 59de2c5 4a15c97 59de2c5 e96ac65 59de2c5 e66b0e5 59de2c5 4a15c97 59de2c5 e66b0e5 59de2c5 3aead7c 921086f 59de2c5 3aead7c 59de2c5 921086f 59de2c5 91a0e26 59de2c5 4a15c97 59de2c5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
## Abstract
Current protein language models (PLMs) learn protein representations mainly based on their sequences, thereby well capturing co-evolutionary information, but they are unable to explicitly acquire protein functions, which is the end goal of protein representation learning. Fortunately, for many proteins, their textual property descriptions are available, where their various functions are also described. Motivated by this fact, we first build the ProtDescribe dataset to augment protein sequences with text descriptions of their functions and other important properties. Based on this dataset, we propose the [ProtST framework](https://arxiv.org/abs/2301.12040) to enhance Protein Sequence pre-training and understanding by biomedical Texts. During pre-training, we design three types of tasks, i.e., unimodal mask prediction, multimodal representation alignment and multimodal mask prediction, to enhance a PLM with protein property information with different granularities and, at the same time, preserve the PLM’s original representation power. On downstream tasks, ProtST enables both supervised learning and zero-shot prediction. We verify the superiority of ProtST-induced PLMs over previous ones on diverse representation learning benchmarks. Under the zero-shot setting, we show the effectiveness of ProtST on zero-shot protein classification, and ProtST also enables functional protein retrieval from a large-scale database without any function annotation. Source code and model weights are available at [https://github.com/DeepGraphLearning/ProtST](https://github.com/DeepGraphLearning/ProtST).
![image/png](https://cdn-uploads.huggingface.co/production/uploads/62f0a673f0d40f6aae296b4a/o4F5-Cm-gGdHPpX5rPVKx.png)
## Example
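The following script shows how to run ProtST on Gaudi (HPU) for a zero-shot classification task. Lines marked `+` are the changes needed relative to the CPU version; lines marked `-` are the CPU lines they replace.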
```diff
import logging
import functools
from tqdm import tqdm
import torch
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer, AutoConfig
+ import habana_frameworks.torch
logger = logging.getLogger(__name__)
def tokenize_protein(example, protein_tokenizer=None, padding=None):
protein_seqs = example["prot_seq"]
- protein_inputs = protein_tokenizer(protein_seqs, padding=padding, add_special_tokens=True)
+ protein_inputs = protein_tokenizer(protein_seqs, padding="max_length", truncation=True, add_special_tokens=True, max_length=1024)
example["protein_input_ids"] = protein_inputs.input_ids
example["protein_attention_mask"] = protein_inputs.attention_mask
return example
def label_embedding(labels, text_tokenizer, text_model, device):
# embed label descriptions
label_feature = []
with torch.inference_mode():
for label in labels:
label_input_ids = text_tokenizer.encode(label, max_length=128,
- truncation=True, add_special_tokens=False)
+ truncation=True, add_special_tokens=False, padding="max_length")
label_input_ids = [text_tokenizer.cls_token_id] + label_input_ids
label_input_ids = torch.tensor(label_input_ids, dtype=torch.long, device=device).unsqueeze(0)
attention_mask = label_input_ids != text_tokenizer.pad_token_id
attention_mask = attention_mask.to(device)
text_outputs = text_model(label_input_ids, attention_mask=attention_mask)
- label_feature.append(text_outputs["text_feature"])
+ label_feature.append(text_outputs["text_feature"].clone())
label_feature = torch.cat(label_feature, dim=0)
label_feature = label_feature / label_feature.norm(dim=-1, keepdim=True)
return label_feature
def zero_shot_eval(logger, device,
test_dataset, target_field, protein_model, logit_scale, label_feature):
# get prediction and target
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)
preds, targets = [], []
with torch.inference_mode():
for data in tqdm(test_dataloader):
target = data[target_field]
targets.append(target)
protein_input_ids = torch.tensor(data["protein_input_ids"], dtype=torch.long, device=device).unsqueeze(0)
attention_mask = torch.tensor(data["protein_attention_mask"], dtype=torch.long, device=device).unsqueeze(0)
protein_outputs = protein_model(protein_input_ids, attention_mask=attention_mask)
protein_feature = protein_outputs["protein_feature"]
protein_feature = protein_feature / protein_feature.norm(dim=-1, keepdim=True)
pred = logit_scale * protein_feature @ label_feature.t()
preds.append(pred)
preds = torch.cat(preds, dim=0)
targets = torch.tensor(targets, dtype=torch.long, device=device)
accuracy = (preds.argmax(dim=-1) == targets).float().mean().item()
logger.warning("Zero-shot accuracy: %.6f" % accuracy)
if __name__ == "__main__":
# get datasets
raw_datasets = load_dataset("mila-intel/ProtST-SubcellularLocalization", cache_dir="~/.cache/huggingface/datasets", split='test') # cache_dir defaults to "~/.cache/huggingface/datasets"
- device = torch.device("cpu")
+ device = torch.device("hpu")
protst_model = AutoModel.from_pretrained("mila-intel/ProtST-esm1b", trust_remote_code=True, torch_dtype=torch.bfloat16).to(device)
protein_model = protst_model.protein_model
text_model = protst_model.text_model
logit_scale = protst_model.logit_scale
+ from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+ protein_model = wrap_in_hpu_graph(protein_model)
+ text_model = wrap_in_hpu_graph(text_model)
logit_scale.requires_grad = False
logit_scale = logit_scale.to(device)
logit_scale = logit_scale.exp()
protein_tokenizer = AutoTokenizer.from_pretrained("facebook/esm1b_t33_650M_UR50S")
text_tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")
func_tokenize_protein = functools.partial(tokenize_protein, protein_tokenizer=protein_tokenizer, padding=False)
test_dataset = raw_datasets.map(
func_tokenize_protein, batched=False,
remove_columns=["prot_seq"],
desc="Running tokenize_proteins on dataset",
)
labels = load_dataset("mila-intel/subloc_template", cache_dir="~/.cache/huggingface/datasets")["train"]["name"]
text_tokenizer.encode(labels[0], max_length=128, truncation=True, add_special_tokens=False)
label_feature = label_embedding(labels, text_tokenizer, text_model, device)
zero_shot_eval(logger, device, test_dataset, "localization",
protein_model, logit_scale, label_feature)
```
To run ProtST on CPU with [optimum-intel](https://github.com/huggingface/optimum-intel) optimization instead, add the lines marked `+` to the script above:
```diff
...
protst_model = AutoModel.from_pretrained("mila-intel/ProtST-esm1b", trust_remote_code=True, torch_dtype=torch.bfloat16).to(device)
protein_model = protst_model.protein_model
+ import intel_extension_for_pytorch as ipex
+ from optimum.intel.generation.modeling import jit_trace
+ protein_model = ipex.optimize(protein_model, dtype=torch.bfloat16, inplace=True)
+ protein_model = jit_trace(protein_model, "sequence-classification")
...
```