Add ZymCTRL model

Needs some tweaking: massive attention to the same residue makes weird disks in the attention bars, and it is noticeably slow on a CPU-only instance.
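The disks come from residues attending strongly to themselves, i.e. large values on the diagonal of each attention map. A minimal sketch of one possible tweak (not part of this commit; the function name is hypothetical) that zeroes self-attention in the [n_layers, n_heads, n_res, n_res] tensor returned by get_attention before the bars are drawn:

```python
import torch

def mask_self_attention(attentions: torch.Tensor) -> torch.Tensor:
    """Zero out each residue's attention to itself (the matrix diagonal).

    Expects the [n_layers, n_heads, n_res, n_res] tensor that get_attention
    returns; the mask broadcasts over the layer and head dimensions.
    """
    n_res = attentions.shape[-1]
    keep = ~torch.eye(n_res, dtype=torch.bool)  # False only on the diagonal
    return attentions * keep
```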
- hexviz/app.py +1 -0
- hexviz/attention.py +49 -6
- tests/test_attention.py +35 -4
hexviz/app.py CHANGED

@@ -9,6 +9,7 @@ st.title("pLM Attention Visualization")
 
 # Define list of model types
 models = [
+    Model(name=ModelType.ZymCTRL, layers=36, heads=16),
     Model(name=ModelType.TAPE_BERT, layers=12, heads=12),
     # Model(name=ModelType.PROT_T5, layers=24, heads=32),
 ]
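The layer and head counts for ZymCTRL are hardcoded here. As a quick sanity check (not part of the commit, and assuming the checkpoint's config exposes GPT-2-style n_layer/n_head fields), the same numbers can be read from the published config:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("nferruz/ZymCTRL")
assert config.n_layer == 36  # -> layers=36 in the Model entry
assert config.n_head == 16   # -> heads=16 in the Model entry
```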
hexviz/attention.py CHANGED

@@ -7,12 +7,14 @@ import streamlit as st
 import torch
 from Bio.PDB import PDBParser, Polypeptide, Structure
 from tape import ProteinBertModel, TAPETokenizer
-from transformers import T5EncoderModel, T5Tokenizer
+from transformers import (AutoTokenizer, GPT2LMHeadModel, T5EncoderModel,
+                          T5Tokenizer)
 
 
 class ModelType(str, Enum):
     TAPE_BERT = "bert-base"
     PROT_T5 = "prot_t5_xl_half_uniref50-enc"
+    ZymCTRL = "zymctrl"
 
 
 class Model:

@@ -45,7 +47,7 @@ def get_sequences(structure: Structure) -> List[str]:
         # TODO ask if using protein_letters_3to1_extended makes sense
         residues_single_letter = map(lambda x: Polypeptide.protein_letters_3to1.get(x, "X"), residues)
 
-        sequences.append(list(residues_single_letter))
+        sequences.append("".join(list(residues_single_letter)))
     return sequences
 
 @st.cache

@@ -69,18 +71,59 @@ def get_tape_bert() -> Tuple[TAPETokenizer, ProteinBertModel]:
     return tokenizer, model
 
 @st.cache
+def get_zymctrl() -> Tuple[AutoTokenizer, GPT2LMHeadModel]:
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    tokenizer = AutoTokenizer.from_pretrained('nferruz/ZymCTRL')
+    model = GPT2LMHeadModel.from_pretrained('nferruz/ZymCTRL').to(device)
+    return tokenizer, model
+
 def get_attention(
     sequence: str, model_type: ModelType = ModelType.TAPE_BERT
 ):
+    """
+    Returns a tensor of shape [n_layers, n_heads, n_res, n_res] with attention weights
+    """
     if model_type == ModelType.TAPE_BERT:
         tokenizer, model = get_tape_bert()
         token_idxs = tokenizer.encode(sequence).tolist()
         inputs = torch.tensor(token_idxs).unsqueeze(0)
         with torch.no_grad():
+            attentions = model(inputs)[-1]
             # Remove attention from <CLS> (first) and <SEP> (last) token
+            attentions = [attention[:, :, 1:-1, 1:-1] for attention in attentions]
+            attentions = torch.stack([attention.squeeze(0) for attention in attentions])
+
+    elif model_type == ModelType.ZymCTRL:
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        tokenizer, model = get_zymctrl()
+        inputs = tokenizer(sequence, return_tensors='pt').input_ids.to(device)
+        attention_mask = tokenizer(sequence, return_tensors='pt').attention_mask.to(device)
+
+        with torch.no_grad():
+            outputs = model(inputs, attention_mask=attention_mask, output_attentions=True)
+            attentions = outputs.attentions
+            if attentions[0].shape[-1] == attentions[0].shape[-2] == 1:
+                reshaped = [attention.view(attention.shape[1], attention.shape[0]) for attention in attentions]
+                n_residues = reshaped[0].shape[-1]
+                n_heads = 16
+                i,j = torch.triu_indices(n_residues, n_residues)
+
+                attentions_symmetric = []
+                # Make symmetric attention matrix
+                for attention in reshaped:
+                    x = torch.zeros(n_heads, n_residues, n_residues)
+                    x[:,i,j] = attention
+                    x[:,j,i] = attention
+                    attentions_symmetric.append(x)
+                attentions = torch.stack([attention for attention in attentions_symmetric])
+            else:
+                # torch.Size([1, n_heads, n_res, n_res]) -> torch.Size([n_heads, n_res, n_res])
+                attention_squeezed = [torch.squeeze(attention) for attention in attentions]
+
+                # ([n_heads, n_res, n_res]*n_layers) -> [n_layers, n_heads, n_res, n_res]
+                attention_stacked = torch.stack([attention for attention in attention_squeezed])
+                attentions = attention_stacked
+
     elif model_type == ModelType.PROT_T5:
         device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
         # Introduce white-space between all amino acids

@@ -98,7 +141,7 @@ def get_attention(
     else:
         raise ValueError(f"Model {model_type} not supported")
 
-    return
+    return attentions
 
 def unidirectional_sum_filtered(attention, layer, head, threshold):
     num_layers, num_heads, seq_len, _ = attention.shape
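When the model hands back attentions whose trailing dimensions are 1, the new ZymCTRL branch rebuilds full residue-by-residue maps by scattering one value per residue pair into both halves of a symmetric matrix. A self-contained toy illustration of that torch.triu_indices trick (shapes and values below are made up, not taken from the model):

```python
import torch

# Toy version of the symmetrization used in the ZymCTRL branch: one value per
# (head, upper-triangular residue pair) is written to both (i, j) and (j, i).
n_heads, n_res = 2, 4
i, j = torch.triu_indices(n_res, n_res)   # upper-triangle indices, diagonal included
values = torch.rand(n_heads, i.shape[0])  # stand-in for one layer's attention values

x = torch.zeros(n_heads, n_res, n_res)
x[:, i, j] = values
x[:, j, i] = values

assert torch.equal(x, x.transpose(-1, -2))  # symmetric by construction
```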
tests/test_attention.py CHANGED

@@ -1,9 +1,10 @@
 import torch
 from Bio.PDB.Structure import Structure
-from transformers import T5EncoderModel, T5Tokenizer
+from transformers import (GPT2LMHeadModel, GPT2TokenizerFast, T5EncoderModel,
+                          T5Tokenizer)
 
 from hexviz.attention import (ModelType, get_attention, get_protT5,
-                              get_sequences, get_structure,
+                              get_sequences, get_structure, get_zymctrl,
                               unidirectional_sum_filtered)
 
 

@@ -37,12 +38,42 @@ def test_get_protT5():
     assert isinstance(tokenizer, T5Tokenizer)
     assert isinstance(model, T5EncoderModel)
 
+def test_get_zymctrl():
+    result = get_zymctrl()
+
+    assert result is not None
+    assert isinstance(result, tuple)
+
+    tokenizer, model = result
+
+    assert isinstance(tokenizer, GPT2TokenizerFast)
+    assert isinstance(model, GPT2LMHeadModel)
+
+def test_get_attention_zymctrl():
+
+    result = get_attention("GGG", model_type=ModelType.ZymCTRL)
+
+    assert result is not None
+    assert result.shape == torch.Size([36,16,3,3])
+
+def test_get_attention_zymctrl_long_chain():
+    structure = get_structure(pdb_code="6A5J") # 13 residues long
+
+    sequences = get_sequences(structure)
+
+    result = get_attention(sequences[0], model_type=ModelType.ZymCTRL)
+
+    assert result is not None
+    assert result.shape == torch.Size([36,16,13,13])
+
 def test_get_attention_tape():
+    structure = get_structure(pdb_code="6A5J") # 13 residues long
+    sequences = get_sequences(structure)
 
+    result = get_attention(sequences[0], model_type=ModelType.TAPE_BERT)
 
     assert result is not None
+    assert result.shape == torch.Size([12,12,13,13])
 
 def test_get_unidirection_sum_filtered():
     # 1 head, 1 layer, 4 residues long attention tensor