AmelieSchreiber committed on
Commit
0a52d93
1 Parent(s): 19e5543

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +74 -0
README.md CHANGED
@@ -1,3 +1,77 @@
1
  ---
2
  license: mit
 
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: mit
3
+ language:
4
+ - en
5
+ library_name: transformers
6
+ tags:
7
+ - esm
8
+ - esm2
9
+ - biology
10
+ - protein
11
+ - protein language model
12
+ - cafa 5
13
+ - protein function prediction
14
  ---
15
+
16
+
17
+ ## Using the model
18
+ First, download the file `go-basic.obo` [from here](https://huggingface.co/datasets/AmelieSchreiber/cafa_5)
19
+ and store the file locally, then provide the local path in the code below:
20
+
21
+ ```python
22
+ import torch
23
+ from transformers import AutoTokenizer, EsmForSequenceClassification
24
+ from sklearn.metrics import precision_recall_fscore_support
25
+
26
+ # 1. Parsing the go-basic.obo file
27
def parse_obo_file(file_path):
    """Parse the ``[Term]`` stanzas of an OBO ontology file.

    Args:
        file_path: Path to an OBO file such as ``go-basic.obo``.

    Returns:
        A list with one dict per ``[Term]`` stanza, in file order. Each dict
        holds whichever of the keys ``"id"``, ``"name"``, ``"namespace"`` and
        ``"definition"`` could be read from that stanza.
    """
    # Read as UTF-8 rather than relying on the platform default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        stanzas = f.read().split("[Term]")

    terms = []
    # stanzas[0] is the file header that precedes the first [Term].
    for stanza in stanzas[1:]:
        term = {}
        for line in stanza.strip().split("\n"):
            if line.startswith("["):
                # A different stanza type (e.g. [Typedef]) follows this term
                # in the same chunk; its id:/name: tags must not overwrite
                # the values already collected for the term.
                break
            # Slice off the tag prefix instead of splitting on it, so a value
            # that happens to contain the tag text is not truncated.
            if line.startswith("id:"):
                term["id"] = line[len("id:"):].strip()
            elif line.startswith("name:"):
                term["name"] = line[len("name:"):].strip()
            elif line.startswith("namespace:"):
                term["namespace"] = line[len("namespace:"):].strip()
            elif line.startswith("def:"):
                # The definition text is the first double-quoted span; skip
                # the field instead of crashing when no quote is present.
                pieces = line[len("def:"):].split('"')
                if len(pieces) > 1:
                    term["definition"] = pieces[1]
        terms.append(term)
    return terms
46
+
47
+ parsed_terms = parse_obo_file("go-basic.obo") # Replace `go-basic.obo` with your path
48
+
49
+ # 2. Load the saved model and tokenizer
50
+ model_path = "AmelieSchreiber/cafa_5_protein_function_prediction"
51
+ loaded_model = EsmForSequenceClassification.from_pretrained(model_path)
52
+ loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
53
+
54
+ # 3. The predict_protein_function function
55
def predict_protein_function(sequence, model, tokenizer, go_terms, *,
                             label_terms=None, threshold=0.05):
    """Predict the names of the GO terms associated with a protein sequence.

    Args:
        sequence: Protein sequence handed to ``tokenizer``.
        model: Callable classification model; it is put in eval mode, called
            with the tokenizer output as keyword arguments, and must return
            an object exposing ``logits``.
        tokenizer: Callable that turns ``sequence`` into the keyword inputs
            of ``model``.
        go_terms: Iterable of term dicts with ``"id"`` and ``"name"`` keys,
            such as the list returned by ``parse_obo_file``.
        label_terms: Sequence mapping each logit column index to a GO term
            id. When omitted, the module-level ``unique_terms`` list (the
            label order used by the training script) is used.
        threshold: A term is reported when its sigmoid score is strictly
            greater than this value. Defaults to 0.05.

    Returns:
        The names of the predicted terms, in the order their indices are
        produced by ``torch.where``. Predicted ids that have no entry with
        both ``"id"`` and ``"name"`` in ``go_terms`` are skipped.

    Raises:
        NameError: If ``label_terms`` is omitted and no module-level
            ``unique_terms`` is defined. Raised before the model is run.
    """
    if label_terms is None:
        # Fall back to the label list defined by the training script.
        label_terms = unique_terms

    # Index the ontology once instead of scanning it for every prediction.
    # setdefault keeps the first entry for a duplicated id, which matches a
    # front-to-back search of go_terms.
    names_by_id = {}
    for term in go_terms:
        if "id" in term and "name" in term:
            names_by_id.setdefault(term["id"], term["name"])

    inputs = tokenizer(sequence, return_tensors="pt", padding=True,
                       truncation=True, max_length=1022)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Multi-label head: each logit gets an independent sigmoid score.
    predictions = torch.sigmoid(outputs.logits)
    # Element [1] of the torch.where result holds the label (column) indices.
    predicted_indices = torch.where(predictions > threshold)[1].tolist()

    functions = []
    for idx in predicted_indices:
        term_id = label_terms[idx]
        if term_id in names_by_id:
            functions.append(names_by_id[term_id])
    return functions
72
+
73
+ # 4. Predicting protein function for an example sequence
74
+ example_sequence = "MAYLGSLVQRRLELASGDRLEASLGVGSELDVRGDRVKAVGSLDLEEGRLEQAGVSMA" # Replace with your protein sequence
75
+ predicted_functions = predict_protein_function(example_sequence, loaded_model, loaded_tokenizer, parsed_terms)
76
+ print(predicted_functions)
77
+ ```