Add PROT BERT
- hexviz/attention.py +14 -3
- hexviz/models.py +9 -2
- tests/test_attention.py +7 -0
hexviz/attention.py
CHANGED
@@ -6,11 +6,11 @@ from urllib import request
 import streamlit as st
 import torch
 from Bio.PDB import PDBParser, Polypeptide, Structure
-
-
+
+from hexviz.models import (ModelType, get_prot_bert, get_protgpt2, get_protT5,
+                           get_tape_bert, get_zymctrl)
 
 
-@st.cache
 def get_structure(pdb_code: str) -> Structure:
     """
     Get structure from PDB
@@ -83,6 +83,17 @@ def get_attention(
         # ([n_heads, n_res, n_res]*n_layers) -> [n_layers, n_heads, n_res, n_res]
         attention_stacked = torch.stack([attention for attention in attention_squeezed])
         attentions = attention_stacked
+        # TODO extend attentions to be per token, not per word piece
+        # simplest way to draw attention for multi residue token models for now
+    elif model_type == ModelType.PROT_BERT:
+        tokenizer, model = get_prot_bert()
+        token_idxs = tokenizer.encode(sequence)
+        inputs = torch.tensor(token_idxs).unsqueeze(0)
+        with torch.no_grad():
+            attentions = model(inputs)[-1]
+        # Remove attention from <CLS> (first) and <SEP> (last) token
+        attentions = [attention[:, :, 1:-1, 1:-1] for attention in attentions]
+        attentions = torch.stack([attention.squeeze(0) for attention in attentions])
 
     elif model_type == ModelType.PROT_T5:
         # Introduce white-space between all amino acids
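For orientation, a minimal standalone sketch (not part of the commit) of what the new PROT_BERT branch computes, using Hugging Face transformers directly. Unlike the branch above, it requests output_attentions=True when loading the model and feeds the space-separated sequence the Rostlab/prot_bert tokenizer is documented to expect; the model name and tensor shapes are real, everything else is illustrative.

import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
model = BertModel.from_pretrained("Rostlab/prot_bert", output_attentions=True)

sequence = "G G G"  # ProtBert expects residues separated by whitespace
inputs = tokenizer(sequence, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# outputs.attentions is a tuple of n_layers tensors, each [1, n_heads, n_tokens, n_tokens].
# Drop the [CLS] (first) and [SEP] (last) positions, then stack to [n_layers, n_heads, n_res, n_res].
attentions = torch.stack([a[0, :, 1:-1, 1:-1] for a in outputs.attentions])
print(attentions.shape)  # torch.Size([30, 16, 3, 3]) -- ProtBert has 30 layers and 16 heads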
hexviz/models.py
CHANGED
@@ -4,8 +4,8 @@ from typing import Tuple
 import streamlit as st
 import torch
 from tape import ProteinBertModel, TAPETokenizer
-from transformers import (AutoTokenizer, GPT2LMHeadModel, T5EncoderModel,
-                          T5Tokenizer)
+from transformers import (AutoTokenizer, BertForMaskedLM, BertTokenizer,
+                          GPT2LMHeadModel, T5EncoderModel, T5Tokenizer)
 
 
 class ModelType(str, Enum):
@@ -13,6 +13,7 @@ class ModelType(str, Enum):
     PROT_T5 = "prot_t5_xl_half_uniref50-enc"
     ZymCTRL = "ZymCTRL"
     ProtGPT2 = "ProtGPT2"
+    PROT_BERT = "ProtBert"
 
 
 class Model:
@@ -42,6 +43,12 @@ def get_tape_bert() -> Tuple[TAPETokenizer, ProteinBertModel]:
     model = ProteinBertModel.from_pretrained('bert-base', output_attentions=True)
     return tokenizer, model
 
+@st.cache
+def get_prot_bert() -> Tuple[BertTokenizer, BertForMaskedLM]:
+    tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
+    model = BertForMaskedLM.from_pretrained("Rostlab/prot_bert")
+    return tokenizer, model
+
 @st.cache
 def get_zymctrl() -> Tuple[AutoTokenizer, GPT2LMHeadModel]:
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
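The new loader follows the same pattern as the existing ones: it is wrapped in @st.cache so the ProtBert weights are downloaded and loaded only once per Streamlit session. Below is a hedged sketch of how get_prot_bert might be consumed from a Streamlit page; the selectbox wiring is illustrative, and only ModelType, get_prot_bert, and the BertConfig fields come from the code above or the published ProtBert configuration.

import streamlit as st

from hexviz.models import ModelType, get_prot_bert

model_name = st.selectbox("Model", [m.value for m in ModelType])
if model_name == ModelType.PROT_BERT.value:
    # Cached by @st.cache: the checkpoint is only loaded on the first call
    tokenizer, model = get_prot_bert()
    st.write(f"Loaded ProtBert: {model.config.num_hidden_layers} layers, "
             f"{model.config.num_attention_heads} attention heads")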
tests/test_attention.py
CHANGED
@@ -51,6 +51,13 @@ def test_get_attention_tape():
     assert result is not None
     assert result.shape == torch.Size([12,12,13,13])
 
+def test_get_attention_prot_bert():
+
+    result = get_attention("GGG", model_type=ModelType.PROT_BERT)
+
+    assert result is not None
+    assert result.shape == torch.Size([30, 16, 3, 3])
+
 def test_get_unidirection_sum_filtered():
     # 1 head, 1 layer, 4 residues long attention tensor
     attention= torch.tensor([[[[1, 2, 3, 4],
|