Erva Ulusoy committed on
Commit
988f84d
·
1 Parent(s): 42e2260

Included protein and GO term names in the result DataFrame

Browse files
Files changed (2) hide show
  1. data/name_info.json.gz +3 -0
  2. run_prothgt_app.py +21 -5
data/name_info.json.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89f020fe2ea8529d19f1fbe14f79b1fe111024ed3f26ae03b5b7f98e769d7eb5
3
+ size 3080084
run_prothgt_app.py CHANGED
@@ -7,6 +7,8 @@ import os
7
  from datasets import load_dataset
8
  import gdown
9
  import copy
 
 
10
 
11
  class ProtHGT(torch.nn.Module):
12
  def __init__(self, data,hidden_channels, num_heads, num_layers, mlp_hidden_layers, mlp_dropout):
@@ -64,9 +66,10 @@ def _load_data(heterodata, protein_ids, go_category):
64
 
65
  return heterodata
66
 
67
- def get_available_proteins(protein_list_file='data/available_proteins.txt'):
68
- with open(protein_list_file, 'r') as file:
69
- return [line.strip() for line in file.readlines()]
 
70
 
71
  def _generate_predictions(heterodata, model, target_type):
72
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -89,12 +92,18 @@ def _create_prediction_df(predictions, heterodata, protein_ids, go_category):
89
  'GO_term_C': 'Cellular Component'
90
  }
91
 
 
 
 
 
92
  # Get number of GO terms for this category
93
  n_go_terms = len(heterodata[go_category]['id_mapping'])
94
 
95
  # Create lists to store the data
96
  all_proteins = []
 
97
  all_go_terms = []
 
98
  all_categories = []
99
  all_probabilities = []
100
 
@@ -108,16 +117,23 @@ def _create_prediction_df(predictions, heterodata, protein_ids, go_category):
108
  end_idx = (i + 1) * n_go_terms
109
  protein_predictions = predictions[start_idx:end_idx]
110
 
 
 
 
111
  # Extend the lists
112
  all_proteins.extend([protein_id] * n_go_terms)
 
113
  all_go_terms.extend(go_terms)
 
114
  all_categories.extend([go_category_dict[go_category]] * n_go_terms)
115
  all_probabilities.extend(protein_predictions.tolist())
116
 
117
  # Create DataFrame
118
  prediction_df = pd.DataFrame({
119
- 'Protein': all_proteins,
120
- 'GO_term': all_go_terms,
 
 
121
  'GO_category': all_categories,
122
  'Probability': all_probabilities
123
  })
 
7
  from datasets import load_dataset
8
  import gdown
9
  import copy
10
+ import json
11
+ import gzip
12
 
13
  class ProtHGT(torch.nn.Module):
14
  def __init__(self, data,hidden_channels, num_heads, num_layers, mlp_hidden_layers, mlp_dropout):
 
66
 
67
  return heterodata
68
 
69
+ def get_available_proteins(name_file='data/name_info.json.gz'):
70
+ with gzip.open(name_file, 'rt', encoding='utf-8') as file:
71
+ name_info = json.load(file)
72
+ return list(name_info['Protein'].keys())
73
 
74
  def _generate_predictions(heterodata, model, target_type):
75
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
92
  'GO_term_C': 'Cellular Component'
93
  }
94
 
95
+ # Load name information from gzipped file
96
+ with gzip.open('data/name_info.json.gz', 'rt', encoding='utf-8') as file:
97
+ name_info = json.load(file)
98
+
99
  # Get number of GO terms for this category
100
  n_go_terms = len(heterodata[go_category]['id_mapping'])
101
 
102
  # Create lists to store the data
103
  all_proteins = []
104
+ all_protein_names = []
105
  all_go_terms = []
106
+ all_go_term_names = []
107
  all_categories = []
108
  all_probabilities = []
109
 
 
117
  end_idx = (i + 1) * n_go_terms
118
  protein_predictions = predictions[start_idx:end_idx]
119
 
120
+ # Get protein name
121
+ protein_name = name_info['Protein'].get(protein_id, protein_id)
122
+
123
  # Extend the lists
124
  all_proteins.extend([protein_id] * n_go_terms)
125
+ all_protein_names.extend([protein_name] * n_go_terms)
126
  all_go_terms.extend(go_terms)
127
+ all_go_term_names.extend([name_info['GO_term'].get(term_id, term_id) for term_id in go_terms])
128
  all_categories.extend([go_category_dict[go_category]] * n_go_terms)
129
  all_probabilities.extend(protein_predictions.tolist())
130
 
131
  # Create DataFrame
132
  prediction_df = pd.DataFrame({
133
+ 'UniProt_ID': all_proteins,
134
+ 'Protein': all_protein_names,
135
+ 'GO_ID': all_go_terms,
136
+ 'GO_term': all_go_term_names,
137
  'GO_category': all_categories,
138
  'Probability': all_probabilities
139
  })