Visualize attention pairs on structure
Files changed:
- protention/attention.py (+42 -14)
- protention/streamlit/Attention_On_Structure.py (+4 -2)
- tests/test_attention.py (+23 -2)
protention/attention.py
CHANGED
@@ -66,21 +66,11 @@ def get_tape_bert() -> tuple[TAPETokenizer, ProteinBertModel]:
     model = ProteinBertModel.from_pretrained('bert-base', output_attentions=True)
     return tokenizer, model
 
-@st.
+@st.cache_data
 def get_attention(
-
+    sequence: list[str], model_type: ModelType = ModelType.TAPE_BERT
 ):
-    """
-    Get attention from T5
-    """
-    # fetch structure
-    structure = get_structure(pdb_code)
-    # Get list of sequences
-    sequences = get_sequences(structure)
-    # TODO handle multiple sequences
-    sequence = sequences[0]
-
-    match model.name:
+    match model_type:
         case ModelType.TAPE_BERT:
             tokenizer, model = get_tape_bert()
             token_idxs = tokenizer.encode(sequence).tolist()
@@ -91,9 +81,47 @@ def get_attention(
             attns = [attn[:, :, 1:-1, 1:-1] for attn in attns]
             attns = torch.stack([attn.squeeze(0) for attn in attns])
         case ModelType.PROT_T5:
+            attns = None
            # Space separate sequences
            sequences = [" ".join(sequence) for sequence in sequences]
            tokenizer, model = get_protT5()
-
+        case _:
+            raise ValueError(f"Model {model_type} not supported")
     return attns
 
+def unidirectional_sum_filtered(attention, layer, head, threshold):
+    num_layers, num_heads, seq_len, _ = attention.shape
+    attention_head = attention[layer, head]
+    unidirectional_sum_for_head = []
+    for i in range(seq_len):
+        for j in range(i, seq_len):
+            # Attention matrices for BERT models are asymmetric.
+            # Bidirectional attention is reduced to one value by adding the
+            # attention values
+            # TODO think... does this operation make sense?
+            sum = attention_head[i, j].item() + attention_head[j, i].item()
+            if sum >= threshold:
+                unidirectional_sum_for_head.append((sum, i, j))
+    return unidirectional_sum_for_head
+
+@st.cache_data
+def get_attention_pairs(pdb_code: str, layer: int, head: int, threshold: int = 0.2, model_type: ModelType = ModelType.TAPE_BERT):
+    # fetch structure
+    structure = get_structure(pdb_code=pdb_code)
+    # Get list of sequences
+    sequences = get_sequences(structure)
+
+    attention_pairs = []
+    for i, sequence in enumerate(sequences):
+        attention = get_attention(sequence=sequence, model_type=model_type)
+        attention_unidirectional = unidirectional_sum_filtered(attention, layer, head, threshold)
+        chain = list(structure.get_chains())[i]
+        for attn_value, res_1, res_2 in attention_unidirectional:
+            try:
+                coord_1 = chain[res_1]["CA"].coord.tolist()
+                coord_2 = chain[res_2]["CA"].coord.tolist()
+            except KeyError:
+                continue
+            attention_pairs.append((attn_value, coord_1, coord_2))
+
+    return attention_pairs
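For context, a rough usage sketch of the new get_attention_pairs helper outside Streamlit (a sketch only: the layer/head values are illustrative, "1AKE" is just an example PDB code, and it assumes the protention package and the TAPE model weights are available locally):

    from protention.attention import ModelType, get_attention_pairs

    # Each element is (summed attention between residues i and j, CA coord of i, CA coord of j),
    # keeping only pairs whose summed attention clears the default 0.2 threshold.
    pairs = get_attention_pairs("1AKE", layer=3, head=5, model_type=ModelType.TAPE_BERT)
    for attn_sum, coord_1, coord_2 in pairs[:5]:
        print(f"{attn_sum:.3f}: {coord_1} -> {coord_2}")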
protention/streamlit/Attention_On_Structure.py
CHANGED
@@ -3,7 +3,7 @@ import stmol
 import streamlit as st
 from stmol import showmol
 
-from protention.attention import Model, ModelType,
+from protention.attention import Model, ModelType, get_attention_pairs
 
 st.sidebar.title("pLM Attention Visualization")
 
@@ -27,12 +27,14 @@ with right:
 
 min_attn = st.slider("Minimum attention", min_value=0.0, max_value=0.4, value=0.15)
 
-
+attention_pairs = get_attention_pairs(pdb_id, layer, head, min_attn, model_type=selected_model.name)
 
 def get_3dview(pdb):
     xyzview = py3Dmol.view(query=f"pdb:{pdb}")
     xyzview.setStyle({"cartoon": {"color": "spectrum"}})
     stmol.add_hover(xyzview, backgroundColor="black", fontColor="white")
+    for att_weight, first, second in attention_pairs:
+        stmol.add_cylinder(xyzview, start=first, end=second, cylradius=att_weight*3, cylColor='red', dashed=False)
     return xyzview
 
 
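The cylinders added to get_3dview scale their radius with the summed attention weight (cylradius=att_weight*3), so strongly attending residue pairs show up as thicker red links. A minimal standalone sketch of that overlay, assuming py3Dmol and stmol are installed and using made-up coordinates and weight:

    import py3Dmol
    import stmol
    from stmol import showmol

    xyzview = py3Dmol.view(query="pdb:1AKE")
    xyzview.setStyle({"cartoon": {"color": "spectrum"}})

    att_weight = 0.25              # summed attention for one residue pair (example value)
    first = [26.3, 24.1, 8.5]      # CA coordinate of residue i (example values)
    second = [30.7, 19.8, 12.2]    # CA coordinate of residue j (example values)
    stmol.add_cylinder(xyzview, start=first, end=second,
                       cylradius=att_weight * 3, cylColor='red', dashed=False)

    # In the Streamlit app the view is rendered with showmol:
    showmol(xyzview, height=500, width=500)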
tests/test_attention.py
CHANGED
@@ -3,7 +3,8 @@ from Bio.PDB.Structure import Structure
 from transformers import T5EncoderModel, T5Tokenizer
 
 from protention.attention import (ModelType, get_attention, get_protT5,
-                                  get_sequences, get_structure
+                                  get_sequences, get_structure,
+                                  unidirectional_sum_filtered)
 
 
 def test_get_structure():
@@ -38,7 +39,27 @@ def test_get_protT5():
 
 def test_get_attention_tape():
 
-    result = get_attention("1AKE", model=ModelType.
+    result = get_attention("1AKE", model=ModelType.TAPE_BERT)
 
     assert result is not None
     assert result.shape == torch.Size([12,12,456,456])
+
+def test_get_unidirection_sum_filtered():
+    # 1 head, 1 layer, 4 residues long attention tensor
+    attention= torch.tensor([[[[1, 2, 3, 4],
+                               [2, 5, 6, 7],
+                               [3, 6, 8, 9],
+                               [4, 7, 9, 11]]]], dtype=torch.float32)
+
+    result = unidirectional_sum_filtered(attention, 0, 0, 0)
+
+    assert result is not None
+    assert len(result) == 10
+
+    attention= torch.tensor([[[[1, 2, 3],
+                               [2, 5, 6],
+                               [4, 7, 91]]]], dtype=torch.float32)
+
+    result = unidirectional_sum_filtered(attention, 0, 0, 0)
+
+    assert len(result) == 6
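The expected lengths in test_get_unidirection_sum_filtered follow from the loop bounds in unidirectional_sum_filtered: with a zero threshold and strictly positive attention values, every (i, j) pair with j >= i is kept, i.e. the triangular number n*(n+1)/2. A quick sanity check (expected_pairs is just an illustrative helper, not part of the codebase):

    def expected_pairs(n: int) -> int:
        # Number of residue pairs (i, j) with j >= i, diagonal included.
        return n * (n + 1) // 2

    assert expected_pairs(4) == 10   # 4-residue test tensor
    assert expected_pairs(3) == 6    # 3-residue test tensor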