Spaces:

HUBioDataLab
/

ProtHGT

Running

App Files Files Community

Erva Ulusoy committed on Feb 13

Commit

988f84d

1 Parent(s): 42e2260

included protein and go term names in the result df

Browse files

Files changed (2) hide show

data/name_info.json.gz +3 -0
run_prothgt_app.py +21 -5

data/name_info.json.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89f020fe2ea8529d19f1fbe14f79b1fe111024ed3f26ae03b5b7f98e769d7eb5
+size 3080084

run_prothgt_app.py CHANGED Viewed

@@ -7,6 +7,8 @@ import os
 from datasets import load_dataset
 import gdown
 import copy
 class ProtHGT(torch.nn.Module):
     def __init__(self, data,hidden_channels, num_heads, num_layers, mlp_hidden_layers, mlp_dropout):
@@ -64,9 +66,10 @@ def _load_data(heterodata, protein_ids, go_category):
     return heterodata
-def get_available_proteins(protein_list_file='data/available_proteins.txt'):
-    with open(protein_list_file, 'r') as file:
-        return [line.strip() for line in file.readlines()]
 def _generate_predictions(heterodata, model, target_type):
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -89,12 +92,18 @@ def _create_prediction_df(predictions, heterodata, protein_ids, go_category):
         'GO_term_C': 'Cellular Component'
     }
     # Get number of GO terms for this category
     n_go_terms = len(heterodata[go_category]['id_mapping'])
     # Create lists to store the data
     all_proteins = []
     all_go_terms = []
     all_categories = []
     all_probabilities = []
@@ -108,16 +117,23 @@ def _create_prediction_df(predictions, heterodata, protein_ids, go_category):
         end_idx = (i + 1) * n_go_terms
         protein_predictions = predictions[start_idx:end_idx]
         # Extend the lists
         all_proteins.extend([protein_id] * n_go_terms)
         all_go_terms.extend(go_terms)
         all_categories.extend([go_category_dict[go_category]] * n_go_terms)
         all_probabilities.extend(protein_predictions.tolist())
     # Create DataFrame
     prediction_df = pd.DataFrame({
-        'Protein': all_proteins,
-        'GO_term': all_go_terms,
         'GO_category': all_categories,
         'Probability': all_probabilities
     })

 from datasets import load_dataset
 import gdown
 import copy
+import json
+import gzip
 class ProtHGT(torch.nn.Module):
     def __init__(self, data,hidden_channels, num_heads, num_layers, mlp_hidden_layers, mlp_dropout):
     return heterodata
+def get_available_proteins(name_file='data/name_info.json.gz'):
+    with gzip.open(name_file, 'rt', encoding='utf-8') as file:
+        name_info = json.load(file)
+    return list(name_info['Protein'].keys())
 def _generate_predictions(heterodata, model, target_type):
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         'GO_term_C': 'Cellular Component'
     }
+    # Load name information from gzipped file
+    with gzip.open('data/name_info.json.gz', 'rt', encoding='utf-8') as file:
+        name_info = json.load(file)
     # Get number of GO terms for this category
     n_go_terms = len(heterodata[go_category]['id_mapping'])
     # Create lists to store the data
     all_proteins = []
+    all_protein_names = []
     all_go_terms = []
+    all_go_term_names = []
     all_categories = []
     all_probabilities = []
         end_idx = (i + 1) * n_go_terms
         protein_predictions = predictions[start_idx:end_idx]
+        # Get protein name
+        protein_name = name_info['Protein'].get(protein_id, protein_id)
         # Extend the lists
         all_proteins.extend([protein_id] * n_go_terms)
+        all_protein_names.extend([protein_name] * n_go_terms)
         all_go_terms.extend(go_terms)
+        all_go_term_names.extend([name_info['GO_term'].get(term_id, term_id) for term_id in go_terms])
         all_categories.extend([go_category_dict[go_category]] * n_go_terms)
         all_probabilities.extend(protein_predictions.tolist())
     # Create DataFrame
     prediction_df = pd.DataFrame({
+        'UniProt_ID': all_proteins,
+        'Protein': all_protein_names,
+        'GO_ID': all_go_terms,
+        'GO_term': all_go_term_names,
         'GO_category': all_categories,
         'Probability': all_probabilities
     })