Erva Ulusoy committed on
Commit
c86e7b2
·
1 Parent(s): 8f00c3f

updated _create_prediction_df function

Browse files
Files changed (1) hide show
  1. run_prothgt_app.py +29 -18
run_prothgt_app.py CHANGED
@@ -88,30 +88,41 @@ def _create_prediction_df(predictions, heterodata, protein_ids, go_category):
88
  'GO_term_P': 'Biological Process',
89
  'GO_term_C': 'Cellular Component'
90
  }
91
- # Create a list to store individual protein predictions
92
- all_predictions = []
93
-
94
- # Number of GO terms for this category
95
  n_go_terms = len(heterodata[go_category]['id_mapping'])
96
 
 
 
 
 
 
 
 
 
 
97
  # Process predictions for each protein
98
  for i, protein_id in enumerate(protein_ids):
99
- # Get the slice of predictions for this protein
100
- protein_predictions = predictions[i * n_go_terms:(i + 1) * n_go_terms]
 
 
101
 
102
- prediction_df = pd.DataFrame({
103
- 'Protein': protein_id,
104
- 'GO_category': go_category_dict[go_category],
105
- 'GO_term': list(heterodata[go_category]['id_mapping'].keys()),
106
- 'Probability': protein_predictions.tolist()
107
- })
108
- all_predictions.append(prediction_df)
109
 
110
- # Combine all predictions
111
- combined_df = pd.concat(all_predictions, ignore_index=True)
112
- combined_df.sort_values(by=['Protein', 'Probability'], ascending=[True, False], inplace=True)
113
- combined_df.reset_index(drop=True, inplace=True)
114
- return combined_df
 
 
 
 
115
 
116
  def generate_prediction_df(protein_ids, model_paths, model_config_paths, go_category):
117
  all_predictions = []
 
88
  'GO_term_P': 'Biological Process',
89
  'GO_term_C': 'Cellular Component'
90
  }
91
+
92
+ # Get number of GO terms for this category
 
 
93
  n_go_terms = len(heterodata[go_category]['id_mapping'])
94
 
95
+ # Create lists to store the data
96
+ all_proteins = []
97
+ all_go_terms = []
98
+ all_categories = []
99
+ all_probabilities = []
100
+
101
+ # Get list of GO terms once
102
+ go_terms = list(heterodata[go_category]['id_mapping'].keys())
103
+
104
  # Process predictions for each protein
105
  for i, protein_id in enumerate(protein_ids):
106
+ # Get predictions for this protein
107
+ start_idx = i * n_go_terms
108
+ end_idx = (i + 1) * n_go_terms
109
+ protein_predictions = predictions[start_idx:end_idx]
110
 
111
+ # Extend the lists
112
+ all_proteins.extend([protein_id] * n_go_terms)
113
+ all_go_terms.extend(go_terms)
114
+ all_categories.extend([go_category_dict[go_category]] * n_go_terms)
115
+ all_probabilities.extend(protein_predictions.tolist())
 
 
116
 
117
+ # Create DataFrame
118
+ prediction_df = pd.DataFrame({
119
+ 'Protein': all_proteins,
120
+ 'GO_term': all_go_terms,
121
+ 'GO_category': all_categories,
122
+ 'Probability': all_probabilities
123
+ })
124
+
125
+ return prediction_df
126
 
127
  def generate_prediction_df(protein_ids, model_paths, model_config_paths, go_category):
128
  all_predictions = []