Remove Prot-T5 and ProtGPT2
The attention visualization was not implemented for these models because they don't tokenize single residues, and the models are too large to be loaded with the current infrastructure. They could be added back in the future if needed.
- hexviz/attention.py +1 -29
- hexviz/models.py +4 -27
- tests/test_models.py +2 -14
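The tokenization issue the commit message refers to can be seen directly from the tokenizers. Below is a minimal illustrative sketch (not part of this commit), assuming the Hugging Face model ids referenced in hexviz/models.py: ProtGPT2's BPE vocabulary merges several residues into one token, while ProtBert emits one token per whitespace-separated residue, which is what lets attention weights be mapped onto individual residues in the structure.

# Illustrative sketch only - not part of the commit. Assumes the model ids
# used in hexviz/models.py and an environment where they can be downloaded.
from transformers import AutoTokenizer

sequence = "MKTAYIAKQR"  # arbitrary example sequence

# ProtGPT2 uses a BPE vocabulary, so one token can span several residues;
# its attention maps therefore have no 1:1 token-to-residue correspondence.
protgpt2_tokenizer = AutoTokenizer.from_pretrained("nferruz/ProtGPT2")
print(protgpt2_tokenizer.tokenize(sequence))

# ProtBert tokenizes whitespace-separated residues one at a time,
# so each attention weight maps directly onto a residue pair.
protbert_tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
print(protbert_tokenizer.tokenize(" ".join(sequence)))

Supporting multi-residue tokens (the "per token, not per word piece" TODO in the removed code) would presumably mean redistributing each token's attention across the residues it covers, which is why the branch was dropped rather than kept.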
hexviz/attention.py
CHANGED
@@ -7,8 +7,7 @@ import streamlit as st
 import torch
 from Bio.PDB import PDBParser, Polypeptide, Structure
 
-from hexviz.models import (ModelType, get_prot_bert, get_protT5, get_protgpt2,
-                           get_tape_bert, get_zymctrl)
+from hexviz.models import ModelType, get_prot_bert, get_tape_bert, get_zymctrl
 
 
 def get_structure(pdb_code: str) -> Structure:
@@ -71,20 +70,6 @@ def get_attention(
         attention_stacked = torch.stack([attention for attention in attention_squeezed])
         attentions = attention_stacked
 
-    elif model_type == ModelType.ProtGPT2:
-        tokenizer, model = get_protgpt2()
-        input_ids = tokenizer.encode(input, return_tensors='pt').to(device)
-        with torch.no_grad():
-            outputs = model(inputs, attention_mask=attention_mask, output_attentions=True)
-            attentions = outputs.attentions
-
-        # torch.Size([1, n_heads, n_res, n_res]) -> torch.Size([n_heads, n_res, n_res])
-        attention_squeezed = [torch.squeeze(attention) for attention in attentions]
-        # ([n_heads, n_res, n_res]*n_layers) -> [n_layers, n_heads, n_res, n_res]
-        attention_stacked = torch.stack([attention for attention in attention_squeezed])
-        attentions = attention_stacked
-        # TODO extend attentions to be per token, not per word piece
-        # simplest way to draw attention for multi residue token models for now
     elif model_type == ModelType.PROT_BERT:
         tokenizer, model = get_prot_bert()
         token_idxs = tokenizer.encode(sequence)
@@ -95,19 +80,6 @@ def get_attention(
         attentions = [attention[:, :, 1:-1, 1:-1] for attention in attentions]
         attentions = torch.stack([attention.squeeze(0) for attention in attentions])
 
-    elif model_type == ModelType.PROT_T5:
-        # Introduce white-space between all amino acids
-        sequence = " ".join(sequence)
-        # tokenize sequences and pad up to the longest sequence in the batch
-        ids = tokenizer.encode_plus(sequence, add_special_tokens=True, padding="longest")
-
-        input_ids = torch.tensor(ids['input_ids']).to(device)
-        attention_mask = torch.tensor(ids['attention_mask']).to(device)
-
-        with torch.no_grad():
-            attns = model(input_ids=input_ids,attention_mask=attention_mask)[-1]
-
-        tokenizer, model = get_protT5()
     else:
         raise ValueError(f"Model {model_type} not supported")
 
hexviz/models.py
CHANGED
@@ -5,14 +5,12 @@ import streamlit as st
 import torch
 from tape import ProteinBertModel, TAPETokenizer
 from transformers import (AutoTokenizer, BertForMaskedLM, BertTokenizer,
-                          GPT2LMHeadModel, T5EncoderModel, T5Tokenizer)
+                          GPT2LMHeadModel)
 
 
 class ModelType(str, Enum):
     TAPE_BERT = "TAPE-BERT"
-    PROT_T5 = "prot_t5_xl_half_uniref50-enc"
     ZymCTRL = "ZymCTRL"
-    ProtGPT2 = "ProtGPT2"
     PROT_BERT = "ProtBert"
 
 
@@ -22,20 +20,6 @@ class Model:
         self.layers: int = layers
         self.heads: int = heads
 
-@st.cache
-def get_protT5() -> Tuple[T5Tokenizer, T5EncoderModel]:
-    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    tokenizer = T5Tokenizer.from_pretrained(
-        "Rostlab/prot_t5_xl_half_uniref50-enc", do_lower_case=False
-    )
-
-    model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc").to(
-        device
-    )
-
-    model.full() if device == "cpu" else model.half()
-
-    return tokenizer, model
 
 @st.cache
 def get_tape_bert() -> Tuple[TAPETokenizer, ProteinBertModel]:
@@ -43,12 +27,6 @@ def get_tape_bert() -> Tuple[TAPETokenizer, ProteinBertModel]:
     model = ProteinBertModel.from_pretrained('bert-base', output_attentions=True)
     return tokenizer, model
 
-@st.cache
-def get_prot_bert() -> Tuple[BertTokenizer, BertForMaskedLM]:
-    tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )
-    model = BertForMaskedLM.from_pretrained("Rostlab/prot_bert")
-    return tokenizer, model
-
 @st.cache
 def get_zymctrl() -> Tuple[AutoTokenizer, GPT2LMHeadModel]:
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -57,8 +35,7 @@ def get_zymctrl() -> Tuple[AutoTokenizer, GPT2LMHeadModel]:
     return tokenizer, model
 
 @st.cache
-def get_protgpt2() -> Tuple[AutoTokenizer, GPT2LMHeadModel]:
-    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    tokenizer = AutoTokenizer.from_pretrained('nferruz/ProtGPT2')
-    model = GPT2LMHeadModel.from_pretrained('nferruz/ProtGPT2').to(device)
+def get_prot_bert() -> Tuple[BertTokenizer, BertForMaskedLM]:
+    tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )
+    model = BertForMaskedLM.from_pretrained("Rostlab/prot_bert")
     return tokenizer, model
tests/test_models.py
CHANGED
@@ -1,21 +1,9 @@
 
-from transformers import (GPT2LMHeadModel, GPT2TokenizerFast, T5EncoderModel,
-                          T5Tokenizer)
+from transformers import GPT2LMHeadModel, GPT2TokenizerFast
 
-from hexviz.models import get_protT5, get_zymctrl
+from hexviz.models import get_zymctrl
 
 
-def test_get_protT5():
-    result = get_protT5()
-
-    assert result is not None
-    assert isinstance(result, tuple)
-
-    tokenizer, model = result
-
-    assert isinstance(tokenizer, T5Tokenizer)
-    assert isinstance(model, T5EncoderModel)
-
 def test_get_zymctrl():
     result = get_zymctrl()
 