aksell committed
Commit b7ab123
1 Parent(s): 87c0dbc

Add get_sequence

Files changed (2)
  1. protention/attention.py +18 -1
  2. tests/test_attention.py +12 -1
protention/attention.py CHANGED
@@ -2,7 +2,7 @@ from io import StringIO
 from urllib import request
 
 import torch
-from Bio.PDB import PDBParser, Structure
+from Bio.PDB import PDBParser, Polypeptide, Structure
 from transformers import T5EncoderModel, T5Tokenizer
 
 
@@ -17,6 +17,20 @@ def get_structure(pdb_code: str) -> Structure:
     structure = parser.get_structure(pdb_code, file)
     return structure
 
+def get_sequences(structure: Structure) -> list[list[str]]:
+    """
+    Get a list of sequences, one per chain, with residues in single-letter format.
+
+    Residues not in the standard 20 amino acids are replaced with X.
+    """
+    sequences = []
+    for chain in structure.get_chains():
+        residues = [residue.get_resname() for residue in chain.get_residues()]
+        # TODO ask if using protein_letters_3to1_extended makes sense
+        residues_single_letter = map(lambda x: Polypeptide.protein_letters_3to1.get(x, "X"), residues)
+
+        sequences.append(list(residues_single_letter))
+    return sequences
 
 def get_protT5() -> tuple[T5Tokenizer, T5EncoderModel]:
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -41,11 +55,14 @@ def get_attention(
     """
     # fetch structure
     structure = get_structure(pdb_code)
+    # get list of sequences, one per chain
+    sequences = get_sequences(structure)
 
     # get model
     tokenizer, model = get_protT5()
 
     # call model
+    ## Get sequence
 
     # get attention
 
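For orientation, a minimal sketch of how the new helper might be used, assuming Biopython, torch, and transformers are installed and that get_structure's PDB fetch has network access ("1AKE" is the entry exercised in the test below):

    from protention.attention import get_sequences, get_structure

    structure = get_structure("1AKE")      # fetch and parse the PDB entry
    sequences = get_sequences(structure)   # one list of single-letter residue codes per chain

    # non-standard residues come back as "X"; join each chain's codes for display
    for chain, seq in zip(structure.get_chains(), sequences):
        print(chain.id, "".join(seq))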
 
tests/test_attention.py CHANGED
@@ -1,7 +1,7 @@
 from Bio.PDB.Structure import Structure
 from transformers import T5EncoderModel, T5Tokenizer
 
-from protention.attention import get_protT5, get_structure
+from protention.attention import get_protT5, get_sequences, get_structure
 
 
 def test_get_structure():
@@ -11,6 +11,17 @@ def test_get_structure():
     assert structure is not None
     assert isinstance(structure, Structure)
 
+def test_get_sequences():
+    pdb_id = "1AKE"
+    structure = get_structure(pdb_id)
+
+    sequences = get_sequences(structure)
+
+    assert sequences is not None
+    assert len(sequences) == 2
+
+    A, B = sequences
+    assert A[:3] == ["M", "R", "I"]
 
 def test_get_protT5():
     result = get_protT5()
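The new test pins the chain count of 1AKE (two chains, A and B) and the first three single-letter codes of the first chain. A small illustration of the lookup get_sequences relies on, assuming Bio.PDB.Polypeptide.protein_letters_3to1 uses upper-case three-letter keys (as the committed code and test imply) and that MSE falls outside the standard table:

    from Bio.PDB import Polypeptide

    # standard residues map to their one-letter code; anything missing falls back to "X"
    three_letter = ["MET", "ARG", "ILE", "MSE"]  # MSE (selenomethionine) assumed non-standard here
    single = [Polypeptide.protein_letters_3to1.get(res, "X") for res in three_letter]
    print(single)  # expected: ['M', 'R', 'I', 'X'] -- the first three match the test's prefix for chain A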