Fix code example
README.md
CHANGED
@@ -1,5 +1,4 @@
 ---
-language: protein
 tags:
 - protein language model
 datasets:
@@ -38,29 +37,29 @@ We have also noticed that for feature extraction, its better to use the feature
 Here is how to use this model to extract the features of a given protein sequence in PyTorch:
 
 ```python
-sequences_Example = [re.sub(r"[UZOB]", "X", sequence) for sequence in sequences_Example]
-ids = tokenizer.batch_encode_plus(sequences_Example, add_special_tokens=True, padding=True)
-encoder_embedding = embedding[2].cpu().numpy()
-decoder_embedding = embedding[0].cpu().numpy()
+sequence_examples = ["PRTEINO", "SEQWENCE"]
+# this will replace all rare/ambiguous amino acids by X and introduce white-space between all amino acids
+sequence_examples = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in sequence_examples]
+
+# tokenize sequences and pad up to the longest sequence in the batch
+ids = tokenizer.batch_encode_plus(sequence_examples, add_special_tokens=True, padding="longest")
+input_ids = torch.tensor(ids['input_ids']).to(device)
+attention_mask = torch.tensor(ids['attention_mask']).to(device)
+
+# generate embeddings
+with torch.no_grad():
+    embedding_repr = model(input_ids=input_ids, attention_mask=attention_mask)
+
+# extract embeddings for the first ([0,:]) sequence in the batch while removing padded & special tokens ([0,:7])
+emb_0 = embedding_repr.last_hidden_state[0,:7] # shape (7 x 1024)
+print(f"Shape of per-residue embedding of first sequences: {emb_0.shape}")
+# do the same for the second ([1,:]) sequence in the batch while taking into account different sequence lengths ([1,:8])
+emb_1 = embedding_repr.last_hidden_state[1,:8] # shape (8 x 1024)
+
+# if you want to derive a single representation (per-protein embedding) for the whole protein
+emb_0_per_protein = emb_0.mean(dim=0) # shape (1024)
+
+print(f"Shape of per-protein embedding of first sequences: {emb_0_per_protein.shape}")
 ```
 
 ## Training data
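The fixed snippet starts from `sequence_examples` and assumes that `re`, `torch`, `tokenizer`, `model`, and `device` are already defined earlier in the README. For context, a minimal setup sketch consistent with the new code is given below; it is not part of this commit, the checkpoint name is a placeholder for this repository's model ID, and the encoder-only `T5EncoderModel` is used because the added lines read `last_hidden_state` directly instead of indexing encoder/decoder outputs as the removed lines did.

```python
# Setup sketch (assumption, not part of the diff): the fixed example expects
# `re`, `torch`, `tokenizer`, `model`, and `device` to already be in scope.
import re
import torch
from transformers import T5EncoderModel, T5Tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# "Rostlab/prot_t5_xl_uniref50" is a placeholder; substitute this repository's model ID.
tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50", do_lower_case=False)
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_uniref50").to(device)
model.eval()  # inference only; the embedding extraction above runs under torch.no_grad()
```

With a setup like this, the tokenized batch in the diff yields per-residue embeddings of dimension 1024, matching the shapes noted in the added comments.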