Spaces:
Running
Running
Erva Ulusoy
committed on
Commit
·
988f84d
1
Parent(s):
42e2260
Included protein and GO term names in the result DataFrame
Browse files
- data/name_info.json.gz +3 -0
- run_prothgt_app.py +21 -5
data/name_info.json.gz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:89f020fe2ea8529d19f1fbe14f79b1fe111024ed3f26ae03b5b7f98e769d7eb5
|
3 |
+
size 3080084
|
run_prothgt_app.py
CHANGED
@@ -7,6 +7,8 @@ import os
|
|
7 |
from datasets import load_dataset
|
8 |
import gdown
|
9 |
import copy
|
|
|
|
|
10 |
|
11 |
class ProtHGT(torch.nn.Module):
|
12 |
def __init__(self, data,hidden_channels, num_heads, num_layers, mlp_hidden_layers, mlp_dropout):
|
@@ -64,9 +66,10 @@ def _load_data(heterodata, protein_ids, go_category):
|
|
64 |
|
65 |
return heterodata
|
66 |
|
67 |
-
def get_available_proteins(
|
68 |
-
with open(
|
69 |
-
|
|
|
70 |
|
71 |
def _generate_predictions(heterodata, model, target_type):
|
72 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
@@ -89,12 +92,18 @@ def _create_prediction_df(predictions, heterodata, protein_ids, go_category):
|
|
89 |
'GO_term_C': 'Cellular Component'
|
90 |
}
|
91 |
|
|
|
|
|
|
|
|
|
92 |
# Get number of GO terms for this category
|
93 |
n_go_terms = len(heterodata[go_category]['id_mapping'])
|
94 |
|
95 |
# Create lists to store the data
|
96 |
all_proteins = []
|
|
|
97 |
all_go_terms = []
|
|
|
98 |
all_categories = []
|
99 |
all_probabilities = []
|
100 |
|
@@ -108,16 +117,23 @@ def _create_prediction_df(predictions, heterodata, protein_ids, go_category):
|
|
108 |
end_idx = (i + 1) * n_go_terms
|
109 |
protein_predictions = predictions[start_idx:end_idx]
|
110 |
|
|
|
|
|
|
|
111 |
# Extend the lists
|
112 |
all_proteins.extend([protein_id] * n_go_terms)
|
|
|
113 |
all_go_terms.extend(go_terms)
|
|
|
114 |
all_categories.extend([go_category_dict[go_category]] * n_go_terms)
|
115 |
all_probabilities.extend(protein_predictions.tolist())
|
116 |
|
117 |
# Create DataFrame
|
118 |
prediction_df = pd.DataFrame({
|
119 |
-
'
|
120 |
-
'
|
|
|
|
|
121 |
'GO_category': all_categories,
|
122 |
'Probability': all_probabilities
|
123 |
})
|
|
|
7 |
from datasets import load_dataset
|
8 |
import gdown
|
9 |
import copy
|
10 |
+
import json
|
11 |
+
import gzip
|
12 |
|
13 |
class ProtHGT(torch.nn.Module):
|
14 |
def __init__(self, data,hidden_channels, num_heads, num_layers, mlp_hidden_layers, mlp_dropout):
|
|
|
66 |
|
67 |
return heterodata
|
68 |
|
69 |
+
def get_available_proteins(name_file='data/name_info.json.gz'):
|
70 |
+
with gzip.open(name_file, 'rt', encoding='utf-8') as file:
|
71 |
+
name_info = json.load(file)
|
72 |
+
return list(name_info['Protein'].keys())
|
73 |
|
74 |
def _generate_predictions(heterodata, model, target_type):
|
75 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
|
|
92 |
'GO_term_C': 'Cellular Component'
|
93 |
}
|
94 |
|
95 |
+
# Load name information from gzipped file
|
96 |
+
with gzip.open('data/name_info.json.gz', 'rt', encoding='utf-8') as file:
|
97 |
+
name_info = json.load(file)
|
98 |
+
|
99 |
# Get number of GO terms for this category
|
100 |
n_go_terms = len(heterodata[go_category]['id_mapping'])
|
101 |
|
102 |
# Create lists to store the data
|
103 |
all_proteins = []
|
104 |
+
all_protein_names = []
|
105 |
all_go_terms = []
|
106 |
+
all_go_term_names = []
|
107 |
all_categories = []
|
108 |
all_probabilities = []
|
109 |
|
|
|
117 |
end_idx = (i + 1) * n_go_terms
|
118 |
protein_predictions = predictions[start_idx:end_idx]
|
119 |
|
120 |
+
# Get protein name
|
121 |
+
protein_name = name_info['Protein'].get(protein_id, protein_id)
|
122 |
+
|
123 |
# Extend the lists
|
124 |
all_proteins.extend([protein_id] * n_go_terms)
|
125 |
+
all_protein_names.extend([protein_name] * n_go_terms)
|
126 |
all_go_terms.extend(go_terms)
|
127 |
+
all_go_term_names.extend([name_info['GO_term'].get(term_id, term_id) for term_id in go_terms])
|
128 |
all_categories.extend([go_category_dict[go_category]] * n_go_terms)
|
129 |
all_probabilities.extend(protein_predictions.tolist())
|
130 |
|
131 |
# Create DataFrame
|
132 |
prediction_df = pd.DataFrame({
|
133 |
+
'UniProt_ID': all_proteins,
|
134 |
+
'Protein': all_protein_names,
|
135 |
+
'GO_ID': all_go_terms,
|
136 |
+
'GO_term': all_go_term_names,
|
137 |
'GO_category': all_categories,
|
138 |
'Probability': all_probabilities
|
139 |
})
|