Spaces:

HUBioDataLab
/

ProtHGT

Running

App Files Community

Erva Ulusoy committed 8 days ago

Commit

8f4b741

1 Parent(s): dbed3d3

added kg visualization feature

Browse files

Files changed (4) hide show

ProtHGT_app.py +288 -171
requirements.txt +2 -1
run_prothgt_app.py +3 -4
visualize_kg.py +242 -0

ProtHGT_app.py CHANGED Viewed

@@ -25,8 +25,8 @@ import random
 # # ❌ Remove the info message after initialization is complete
 # loading_placeholder.empty()
 from run_prothgt_app import *
 def convert_df(df):
    return df.to_csv(index=False).encode('utf-8')
@@ -34,19 +34,31 @@ def convert_df(df):
 # Initialize session state variables
 if 'predictions_df' not in st.session_state:
     st.session_state.predictions_df = None
 if 'submitted' not in st.session_state:
     st.session_state.submitted = False
 if 'previous_inputs' not in st.session_state:
     st.session_state.previous_inputs = None
-# Initialize session state variables
 if 'generating_predictions' not in st.session_state:
     st.session_state.generating_predictions = False
 def reset_prediction_state():
     st.session_state.generating_predictions = False
     st.session_state.submitted = False
     st.session_state.predictions_df = None
     st.session_state.previous_inputs = None
 def set_generating_predictions():
     st.session_state.generating_predictions = True
@@ -130,7 +142,6 @@ with st.sidebar:
         )
     elif selection_method == "Search proteins":
         # User enters search term
         search_query = st.text_input(
             "1\\. Start typing a protein ID (at least 3 characters) and press Enter to see search results in the dropdown menu below (2)",
@@ -138,6 +149,10 @@ with st.sidebar:
             disabled=disabled
         )
         # Apply fuzzy search only if query length is >= 3
         filtered_proteins = []
         if len(search_query) >= 3:
@@ -150,14 +165,22 @@ with st.sidebar:
             filtered_proteins = [match[0] for match in matches]  # Show top 50 matches
         with st.container():
             selected_proteins = st.multiselect(
                 "2\\. Select proteins from search results",
-                options=filtered_proteins,
                 placeholder="Start typing a protein ID above (1) to see search results...",
                 max_selections=100,
                 disabled=disabled,
                 key="protein_selector"
             )
             # Apply custom CSS to make container scrollable
             st.markdown("""
                 <style>
@@ -167,7 +190,7 @@ with st.sidebar:
                 }
                 </style>
                 """, unsafe_allow_html=True)
     else:  # Upload file option
         uploaded_file = st.file_uploader(
             "Upload a text file with UniProt IDs (one per line, max 100)*",
@@ -328,193 +351,287 @@ if st.session_state.submitted:
                 go_categories = ['GO_term_F', 'GO_term_P', 'GO_term_C']
             # Generate predictions
-            predictions_df = generate_prediction_df(
                 protein_ids=selected_proteins,
                 model_paths=model_paths,
                 model_config_paths=model_config_paths,
                 go_category=go_categories
             )
             st.session_state.predictions_df = predictions_df
             # Reset only the generating_predictions flag to release the sidebar
             st.session_state.generating_predictions = False
             st.rerun()
         # Display and filter predictions
         st.success("Predictions generated successfully!")
-        st.markdown("### Filter and View Predictions")
-        # Create filters
-        col1, col2, col3, col4 = st.columns(4)
-        with col1:
-            # Extract UniProt IDs from URLs for the selectbox
-            uniprot_ids = st.session_state.predictions_df['UniProt_ID'].apply(
-                lambda x: x.split('/')[-2]  # Gets the ID part from the URL
-            ).unique().tolist()
-            # Protein filter
-            selected_protein = st.selectbox(
-                "Filter by Protein",
-                options=['All'] + sorted(uniprot_ids)
-            )
-        with col2:
-            # GO category filter
-            selected_category = st.selectbox(
-                "Filter by GO Category",
-                options=['All'] + sorted(st.session_state.predictions_df['GO_category'].unique().tolist())
-            )
-        with col3:
-            # GO term filter
-            go_term_filter = st.text_input(
-                "Filter by GO Term ID",
-                placeholder="e.g., GO:0003674",
-                help="Enter a GO term ID to filter results"
-            ).strip()
-        with col4:
-            # Probability threshold
-            min_probability_threshold = st.slider(
-                "Minimum Probability",
-                min_value=0.0,
-                max_value=1.0,
-                value=0.5,
-                step=0.05
-            )
-            max_probability_threshold = st.slider(
-                "Maximum Probability",
-                min_value=0.0,
-                max_value=1.0,
-                value=1.0,
-                step=0.05
-            )
-        # Filter the dataframe using session state data
-        filtered_df = st.session_state.predictions_df.copy()
-        if selected_protein != 'All':
-            filtered_df = filtered_df[filtered_df['UniProt_ID'].str.contains(selected_protein)]
-        if selected_category != 'All':
-            filtered_df = filtered_df[filtered_df['GO_category'] == selected_category]
-        if go_term_filter:
-            filtered_df = filtered_df[filtered_df['GO_ID'].str.contains(go_term_filter, case=False, na=False)]
-        filtered_df = filtered_df[(filtered_df['Probability'] >= min_probability_threshold) &
-                                (filtered_df['Probability'] <= max_probability_threshold)]
-        # Custom CSS to increase table width and improve layout
-        st.markdown("""
-            <style>
-                .stDataFrame {
-                    width: 100%;
-                }
-                .stDataFrame > div {
-                    width: 100%;
-                }
-                .stDataFrame [data-testid="stDataFrameResizable"] {
-                    width: 100%;
-                    min-width: 100%;
                 }
-                .pagination-info {
-                    font-size: 14px;
-                    color: #666;
-                    padding: 10px 0;
-                }
-                .page-controls {
-                    display: flex;
-                    align-items: center;
-                    justify-content: center;
-                    gap: 20px;
-                    padding: 10px 0;
-                }
-            </style>
-        """, unsafe_allow_html=True)
-        # Add pagination controls
-        col1, col2, col3 = st.columns([2, 1, 2])
-        with col2:
-            rows_per_page = st.selectbox("Rows per page", [50, 100, 200, 500], index=1)
-        total_rows = len(filtered_df)
-        total_pages = (total_rows + rows_per_page - 1) // rows_per_page
-        # Initialize page number in session state
-        if "page_number" not in st.session_state:
-            st.session_state.page_number = 0
-        # Calculate start and end indices for current page
-        start_idx = st.session_state.page_number * rows_per_page
-        end_idx = min(start_idx + rows_per_page, total_rows)
-        st.dataframe(
-            filtered_df.iloc[start_idx:end_idx],
-            hide_index=True,
-            use_container_width=True,
-            column_config={
-                "UniProt_ID": st.column_config.LinkColumn(
-                    "UniProt ID",
-                    help="Click to view protein in UniProt",
-                    validate="^https://www\\.uniprot\\.org/uniprotkb/[A-Z0-9]+/entry$",
-                    display_text="^https://www\\.uniprot\\.org/uniprotkb/([A-Z0-9]+)/entry$"
-                ),
-                "GO_ID": st.column_config.LinkColumn(
-                    "GO ID",
-                    help="Click to view GO term in QuickGO",
-                    validate="^https://www\\.ebi\\.ac\\.uk/QuickGO/term/GO:[0-9]+$",
-                    display_text="^https://www\\.ebi\\.ac\\.uk/QuickGO/term/(GO:[0-9]+)$"
-                ),
-                "Probability": st.column_config.ProgressColumn(
-                    "Probability",
-                    format="%.2f",
-                    min_value=0,
-                    max_value=1,
-                ),
-                "Protein": st.column_config.TextColumn(
-                    "Protein",
-                    help="Protein Name",
-                ),
-                "GO_category": st.column_config.TextColumn(
-                    "GO Category",
-                    help="Gene Ontology Category",
-                ),
-                "GO_term": st.column_config.TextColumn(
-                    "GO Term",
-                    help="Gene Ontology Term Name",
-                ),
-            }
-        )
-        # Pagination controls with better layout
-        col1, col2, col3 = st.columns([1, 3, 1])
-        with col1:
-            if st.button("Previous", disabled=st.session_state.page_number == 0):
-                st.session_state.page_number -= 1
-                st.rerun()
-        with col2:
-            st.markdown(f"""
-                <div class="pagination-info" style="text-align: center">
-                    Page {st.session_state.page_number + 1} of {total_pages}<br>
-                    Showing rows {start_idx + 1} to {end_idx} of {total_rows}
-                </div>
-            """, unsafe_allow_html=True)
-        with col3:
-            if st.button("Next", disabled=st.session_state.page_number >= total_pages - 1):
-                st.session_state.page_number += 1
-                st.rerun()
-        # Download filtered results
-        st.download_button(
-            label="Download Filtered Results",
-            data=convert_df(filtered_df),
-            file_name="filtered_predictions.csv",
-            mime="text/csv",
-            key="download_filtered_predictions"
-        )

 # # ❌ Remove the info message after initialization is complete
 # loading_placeholder.empty()
 from run_prothgt_app import *
+from visualize_kg import *
 def convert_df(df):
    return df.to_csv(index=False).encode('utf-8')
 # Initialize session state variables
 if 'predictions_df' not in st.session_state:
     st.session_state.predictions_df = None
+if 'heterodata' not in st.session_state:
+    st.session_state.heterodata = None
 if 'submitted' not in st.session_state:
     st.session_state.submitted = False
 if 'previous_inputs' not in st.session_state:
     st.session_state.previous_inputs = None
 if 'generating_predictions' not in st.session_state:
     st.session_state.generating_predictions = False
+if 'protein_visualizations' not in st.session_state:
+    st.session_state.protein_visualizations = {}
 def reset_prediction_state():
     st.session_state.generating_predictions = False
     st.session_state.submitted = False
     st.session_state.predictions_df = None
     st.session_state.previous_inputs = None
+    # Clean up visualization files
+    if 'protein_visualizations' in st.session_state:
+        for viz_info in st.session_state.protein_visualizations.values():
+            try:
+                os.unlink(viz_info['path'])
+            except:
+                pass
+        st.session_state.protein_visualizations = {}
 def set_generating_predictions():
     st.session_state.generating_predictions = True
         )
     elif selection_method == "Search proteins":
         # User enters search term
         search_query = st.text_input(
             "1\\. Start typing a protein ID (at least 3 characters) and press Enter to see search results in the dropdown menu below (2)",
             disabled=disabled
         )
+        # Initialize selected_proteins in session state if not exists
+        if 'selected_proteins_search' not in st.session_state:
+            st.session_state.selected_proteins_search = []
         # Apply fuzzy search only if query length is >= 3
         filtered_proteins = []
         if len(search_query) >= 3:
             filtered_proteins = [match[0] for match in matches]  # Show top 50 matches
         with st.container():
+            # Include previously selected proteins in options
+            all_options = list(set(filtered_proteins + st.session_state.selected_proteins_search))
             selected_proteins = st.multiselect(
                 "2\\. Select proteins from search results",
+                options=all_options,
+                default=st.session_state.selected_proteins_search,
                 placeholder="Start typing a protein ID above (1) to see search results...",
                 max_selections=100,
                 disabled=disabled,
                 key="protein_selector"
             )
+            # Update session state with current selection
+            st.session_state.selected_proteins_search = selected_proteins
             # Apply custom CSS to make container scrollable
             st.markdown("""
                 <style>
                 }
                 </style>
                 """, unsafe_allow_html=True)
     else:  # Upload file option
         uploaded_file = st.file_uploader(
             "Upload a text file with UniProt IDs (one per line, max 100)*",
                 go_categories = ['GO_term_F', 'GO_term_P', 'GO_term_C']
             # Generate predictions
+            heterodata, predictions_df = generate_prediction_df(
                 protein_ids=selected_proteins,
                 model_paths=model_paths,
                 model_config_paths=model_config_paths,
                 go_category=go_categories
             )
+            st.session_state.heterodata = heterodata
             st.session_state.predictions_df = predictions_df
             # Reset only the generating_predictions flag to release the sidebar
             st.session_state.generating_predictions = False
             st.rerun()
         # Display and filter predictions
         st.success("Predictions generated successfully!")
+        # tabs for predictions and visualizations
+        predictions_tab, kg_viz_tab = st.tabs(["View Predictions", "View Knowledge Graphs"])
+        with predictions_tab:
+            st.markdown("### Filter and View Predictions")
+            # Create filters
+            col1, col2, col3, col4 = st.columns(4)
+            with col1:
+                # Collect the unique UniProt IDs for the selectbox
+                uniprot_ids = st.session_state.predictions_df['UniProt_ID'].unique().tolist()
+                # Protein filter
+                selected_protein = st.selectbox(
+                    "Filter by Protein",
+                    options=['All'] + sorted(uniprot_ids)
+                )
+            with col2:
+                # GO category filter
+                selected_category = st.selectbox(
+                    "Filter by GO Category",
+                    options=['All'] + sorted(st.session_state.predictions_df['GO_category'].unique().tolist())
+                )
+            with col3:
+                # GO term filter
+                go_term_filter = st.text_input(
+                    "Filter by GO Term ID",
+                    placeholder="e.g., GO:0003674",
+                    help="Enter a GO term ID to filter results"
+                ).strip()
+            with col4:
+                # Probability threshold range slider
+                probability_range = st.slider(
+                    "Probability Range",
+                    min_value=0.0,
+                    max_value=1.0,
+                    value=(0.5, 1.0),  # (min, max) default values
+                    step=0.05
+                )
+                min_probability_threshold, max_probability_threshold = probability_range
+            # Filter the dataframe using session state data
+            filtered_df = st.session_state.predictions_df.copy()
+            if selected_protein != 'All':
+                filtered_df = filtered_df[filtered_df['UniProt_ID'].str.contains(selected_protein)]
+            if selected_category != 'All':
+                filtered_df = filtered_df[filtered_df['GO_category'] == selected_category]
+            if go_term_filter:
+                filtered_df = filtered_df[filtered_df['GO_ID'] == go_term_filter]
+            filtered_df = filtered_df[(filtered_df['Probability'] >= min_probability_threshold) &
+                                    (filtered_df['Probability'] <= max_probability_threshold)]
+            filtered_df['UniProt_ID'] = [f"https://www.uniprot.org/uniprotkb/{pid}/entry" for pid in filtered_df['UniProt_ID']]
+            filtered_df['GO_ID'] = [f"https://www.ebi.ac.uk/QuickGO/term/{go_id}" for go_id in filtered_df['GO_ID']]
+            # Custom CSS to increase table width and improve layout
+            st.markdown("""
+                <style>
+                    .stDataFrame {
+                        width: 100%;
+                    }
+                    .stDataFrame > div {
+                        width: 100%;
+                    }
+                    .stDataFrame [data-testid="stDataFrameResizable"] {
+                        width: 100%;
+                        min-width: 100%;
+                    }
+                    .pagination-info {
+                        font-size: 14px;
+                        color: #666;
+                        padding: 10px 0;
+                    }
+                    .page-controls {
+                        display: flex;
+                        align-items: center;
+                        justify-content: center;
+                        gap: 20px;
+                        padding: 10px 0;
+                    }
+                </style>
+            """, unsafe_allow_html=True)
+            # Add pagination controls
+            col1, col2, col3 = st.columns([2, 1, 2])
+            with col2:
+                rows_per_page = st.selectbox("Rows per page", [50, 100, 200, 500], index=1)
+            total_rows = len(filtered_df)
+            total_pages = (total_rows + rows_per_page - 1) // rows_per_page
+            # Initialize page number in session state
+            if "page_number" not in st.session_state:
+                st.session_state.page_number = 0
+            # Calculate start and end indices for current page
+            start_idx = st.session_state.page_number * rows_per_page
+            end_idx = min(start_idx + rows_per_page, total_rows)
+            st.dataframe(
+                filtered_df.iloc[start_idx:end_idx],
+                hide_index=True,
+                use_container_width=True,
+                column_config={
+                    "UniProt_ID": st.column_config.LinkColumn(
+                        "UniProt ID",
+                        help="Click to view protein in UniProt",
+                        validate="^https://www\\.uniprot\\.org/uniprotkb/[A-Z0-9]+/entry$",
+                        display_text="^https://www\\.uniprot\\.org/uniprotkb/([A-Z0-9]+)/entry$"
+                    ),
+                    "GO_ID": st.column_config.LinkColumn(
+                        "GO ID",
+                        help="Click to view GO term in QuickGO",
+                        validate="^https://www\\.ebi\\.ac\\.uk/QuickGO/term/GO:[0-9]+$",
+                        display_text="^https://www\\.ebi\\.ac\\.uk/QuickGO/term/(GO:[0-9]+)$"
+                    ),
+                    "Probability": st.column_config.ProgressColumn(
+                        "Probability",
+                        format="%.2f",
+                        min_value=0,
+                        max_value=1,
+                    ),
+                    "Protein": st.column_config.TextColumn(
+                        "Protein",
+                        help="Protein Name",
+                    ),
+                    "GO_category": st.column_config.TextColumn(
+                        "GO Category",
+                        help="Gene Ontology Category",
+                    ),
+                    "GO_term": st.column_config.TextColumn(
+                        "GO Term",
+                        help="Gene Ontology Term Name",
+                    ),
                 }
+            )
+            # Pagination controls with better layout
+            col1, col2, col3 = st.columns([1, 3, 1])
+            with col1:
+                if st.button("Previous", disabled=st.session_state.page_number == 0):
+                    st.session_state.page_number -= 1
+                    st.rerun()
+            with col2:
+                st.markdown(f"""
+                    <div class="pagination-info" style="text-align: center">
+                        Page {st.session_state.page_number + 1} of {total_pages}<br>
+                        Showing rows {start_idx + 1} to {end_idx} of {total_rows}
+                    </div>
+                """, unsafe_allow_html=True)
+            with col3:
+                if st.button("Next", disabled=st.session_state.page_number >= total_pages - 1):
+                    st.session_state.page_number += 1
+                    st.rerun()
+            downloadable_df = filtered_df.copy()
+            downloadable_df['UniProt_ID'] = downloadable_df['UniProt_ID'].apply(
+                lambda x: x.split('/')[-2]  # Gets the ID part from the URL
+            )
+            downloadable_df['GO_ID'] = downloadable_df['GO_ID'].apply(
+                lambda x: x.split('/')[-1]  # Gets the ID part from the URL
+            )
+            # Download filtered results
+            st.download_button(
+                label="Download Filtered Results",
+                data=convert_df(downloadable_df),
+                file_name="filtered_predictions.csv",
+                mime="text/csv",
+                key="download_filtered_predictions"
+            )
+        with kg_viz_tab:
+            st.markdown("### Knowledge Graph Visualization")
+            if not selected_proteins:
+                st.info("Please select proteins from the sidebar to visualize their knowledge graphs.")
+            elif len(selected_proteins) <= 10:
+                st.text("Visualize the knowledge graph for each protein to understand the biological relationships that contributed to the predictions.")
+                protein_tabs = st.tabs([f"{protein_id}" for protein_id in selected_proteins])
+                # Create visualizations in each tab
+                for idx, protein_id in enumerate(selected_proteins):
+                    with protein_tabs[idx]:
+                        max_node_count = st.slider(
+                            "Maximum neighbors per edge type",
+                            min_value=5,
+                            max_value=50,
+                            value=10,
+                            step=5,
+                            help="Control the maximum number of neighboring nodes shown for each relationship type",
+                            key=f"slider_{protein_id}"
+                        )
+                        # Check if visualization exists for this protein
+                        viz_exists = (protein_id in st.session_state.protein_visualizations and
+                                    os.path.exists(st.session_state.protein_visualizations[protein_id]['path']))
+                        if not viz_exists:
+                            if st.button(f"Generate Visualization", key=f"viz_{protein_id}"):
+                                # Generate visualization with selected max_node_count
+                                html_path, visualized_edges = visualize_protein_subgraph(
+                                    st.session_state.heterodata,
+                                    protein_id,
+                                    st.session_state.predictions_df,
+                                    limit=max_node_count
+                                )
+                                # Store visualization info in session state
+                                st.session_state.protein_visualizations[protein_id] = {
+                                    'path': html_path,
+                                    'edges': visualized_edges
+                                }
+                                st.rerun()
+                        # If visualization exists, display it
+                        if viz_exists:
+                            viz_info = st.session_state.protein_visualizations[protein_id]
+                            # Add download button for edges
+                            formatted_edges = {}
+                            for edge_type, edges in viz_info['edges'].items():
+                                edge_type_str = f"{edge_type[0]}_{edge_type[1]}_{edge_type[2]}"
+                                formatted_edges[edge_type_str] = [
+                                    {"source": edge[0][0], "target": edge[0][1], "probability": edge[1]}
+                                    for edge in edges
+                                ]
+                            kg_viz_button_columns = st.columns([1, 1, 1])
+                            with kg_viz_button_columns[0]:
+                                st.download_button(
+                                    label='Download Visualized Edges',
+                                    data=json.dumps(formatted_edges, indent=2),
+                                    file_name=f'{protein_id}_visualized_edges.json',
+                                    mime='application/json'
+                                )
+                            with kg_viz_button_columns[1]:
+                                if st.button("Regenerate Visualization", key=f"regenerate_{protein_id}"):
+                                    # Clean up old file
+                                    try:
+                                        os.unlink(viz_info['path'])
+                                    except FileNotFoundError:
+                                        pass
+                                    # Remove from session state
+                                    del st.session_state.protein_visualizations[protein_id]
+                                    st.rerun()
+                            with open(viz_info['path'], 'r', encoding='utf-8') as f:
+                                html_content = f.read()
+                            st.components.v1.html(html_content, height=600)
+            else:
+                st.warning("Knowledge graph visualization is only available when 10 or fewer proteins are selected.")

requirements.txt CHANGED Viewed

@@ -7,4 +7,5 @@ torch_sparse==0.6.15
 torch_scatter==2.1.0
 torch_geometric==2.2.0
 gdown
-rapidfuzz

 torch_scatter==2.1.0
 torch_geometric==2.2.0
 gdown
+rapidfuzz
+pyvis

run_prothgt_app.py CHANGED Viewed

@@ -130,9 +130,9 @@ def _create_prediction_df(predictions, heterodata, protein_ids, go_category):
     # Create DataFrame
     prediction_df = pd.DataFrame({
-        'UniProt_ID': [f"https://www.uniprot.org/uniprotkb/{pid}/entry" for pid in all_proteins],
         'Protein': all_protein_names,
-        'GO_ID': [f"https://www.ebi.ac.uk/QuickGO/term/{go_id}" for go_id in all_go_terms],
         'GO_term': all_go_term_names,
         'GO_category': all_categories,
         'Probability': all_probabilities
@@ -204,7 +204,6 @@ def generate_prediction_df(protein_ids, model_paths, model_config_paths, go_cate
         del predictions
         torch.cuda.empty_cache()  # Clear CUDA cache if using GPU
-    del heterodata
     # Combine all predictions
     final_df = pd.concat(all_predictions, ignore_index=True)
@@ -213,4 +212,4 @@ def generate_prediction_df(protein_ids, model_paths, model_config_paths, go_cate
     del all_predictions
     torch.cuda.empty_cache()
-    return final_df

     # Create DataFrame
     prediction_df = pd.DataFrame({
+        'UniProt_ID': all_proteins,
         'Protein': all_protein_names,
+        'GO_ID': all_go_terms,
         'GO_term': all_go_term_names,
         'GO_category': all_categories,
         'Probability': all_probabilities
         del predictions
         torch.cuda.empty_cache()  # Clear CUDA cache if using GPU
     # Combine all predictions
     final_df = pd.concat(all_predictions, ignore_index=True)
     del all_predictions
     torch.cuda.empty_cache()
+    return heterodata, final_df

visualize_kg.py ADDED Viewed

	@@ -0,0 +1,242 @@

+from pyvis.network import Network
+import os
# Hex colour assigned to each knowledge-graph node type.
# visualize_protein_subgraph turns every entry into a vis.js group whose
# background and border share this colour. Insertion order is kept as-is.
NODE_TYPE_COLORS = {
    'Disease': '#079dbb',
    'HPO': '#58d0e8',
    'Drug': '#815ac0',
    'Compound': '#d2b7e5',
    'Domain': '#6bbf59',
    'GO_term_P': '#ff8800',
    'GO_term_F': '#ffaa00',
    'GO_term_C': '#ffc300',
    'Pathway': '#720026',
    'kegg_Pathway': '#720026',  # same colour as 'Pathway'
    'EC_number': '#ce4257',
    'Protein': '#3aa6a4',
}

# GO category label (as it appears in the 'GO_category' column of the
# prediction DataFrame) -> name of the matching GO node type in the graph.
GO_CATEGORY_MAPPING = {
    'Biological Process': 'GO_term_P',
    'Molecular Function': 'GO_term_F',
    'Cellular Component': 'GO_term_C',
}
def _gather_protein_edges(data, protein_id):
    """Collect the forward knowledge-graph edges that touch one protein.

    Args:
        data: Heterogeneous graph object exposing ``node_types``,
            ``edge_types``, ``data[node_type]['id_mapping']`` (original ID ->
            integer index) and ``data[edge_type].edge_index`` (a 2 x E index
            array). Presumably a torch_geometric ``HeteroData`` -- confirm
            against the caller.
        protein_id: Original ID of the protein, as used in
            ``data['Protein']['id_mapping']``.

    Returns:
        dict mapping every edge type whose relation name does not contain
        ``'rev'`` to a set of ``(source_id, target_id)`` tuples expressed in
        original IDs. Edge types with no edge for this protein (or not
        involving ``Protein`` at all) map to an empty set.

    Raises:
        KeyError: If ``protein_id`` is not present in the Protein id mapping.
    """
    protein_mapping = data['Protein']['id_mapping']
    if protein_id not in protein_mapping:
        raise KeyError(f'Protein {protein_id!r} not found in the Protein id mapping')
    protein_idx = protein_mapping[protein_id]

    # index -> original ID tables, built lazily so that only node types that
    # actually share an edge with this protein are inverted.
    reverse_id_mapping = {}

    def _index_to_id(node_type):
        if node_type not in reverse_id_mapping:
            reverse_id_mapping[node_type] = {
                v: k for k, v in data[node_type]['id_mapping'].items()
            }
        return reverse_id_mapping[node_type]

    protein_edges = {}
    print(f'Gathering edges for {protein_id}...')
    for edge_type in data.edge_types:
        source_type, relation_type, target_type = edge_type
        # Relations whose name contains 'rev' are skipped entirely.
        if 'rev' in relation_type:
            continue

        # Always a set, so callers see one consistent container type.
        protein_edges[edge_type] = set()

        # Row 0 of edge_index holds source indices, row 1 target indices.
        # NOTE(review): for Protein -> Protein relations only edges with the
        # protein as *source* are collected (same as before) -- confirm that
        # this is intended.
        if source_type == 'Protein':
            row = 0
        elif target_type == 'Protein':
            row = 1
        else:
            continue

        print(f'Gathering edges for {edge_type}...')
        edge_index = data[edge_type].edge_index
        index_pairs = edge_index[:, edge_index[row] == protein_idx].T.tolist()
        if not index_pairs:
            continue

        source_ids = _index_to_id(source_type)
        target_ids = _index_to_id(target_type)
        protein_edges[edge_type] = {
            (source_ids[src], target_ids[dst]) for src, dst in index_pairs
        }
    return protein_edges
def _filter_edges(protein_id, protein_edges, prediction_df, limit=10):
    """Pick at most ``limit`` edges per edge type for visualization.

    For edge types whose target node type is a GO category present in
    ``prediction_df``, edges are kept only when their target GO term appears
    among this protein's predictions for that category, ranked by descending
    probability. If the protein has no prediction in that category, or the
    edge type is not a predicted GO category, the first ``limit`` edges (in
    sorted order) are kept without a probability.

    Args:
        protein_id: Protein ID, compared against the ``UniProt_ID`` column.
        protein_edges: dict mapping edge type ``(source_type, relation,
            target_type)`` to a collection of ``(source_id, target_id)``
            tuples. Empty or ``None`` collections are skipped.
        prediction_df: DataFrame with ``UniProt_ID``, ``GO_ID``,
            ``GO_category`` and ``Probability`` columns.
        limit: Maximum number of edges kept per edge type.

    Returns:
        dict mapping edge type to a list of ``(edge, probability)`` pairs,
        where ``probability`` is ``None`` for edges without a prediction.

    Raises:
        KeyError: If ``prediction_df`` contains a GO category that is missing
            from ``GO_CATEGORY_MAPPING``.
    """
    go_category_reverse_mapping = {v: k for k, v in GO_CATEGORY_MAPPING.items()}
    prediction_categories = {
        GO_CATEGORY_MAPPING[category]
        for category in prediction_df['GO_category'].unique()
    }
    # Loop-invariant: rows belonging to the protein being visualized.
    protein_mask = prediction_df['UniProt_ID'] == protein_id

    filtered_edges = {}
    for edge_type, edges in protein_edges.items():
        if not edges:
            continue

        # Sets iterate in an arbitrary, per-process order; sort so that the
        # truncated selection is reproducible between runs.
        ordered_edges = sorted(edges, key=lambda edge: (str(edge[0]), str(edge[1])))
        target_type = edge_type[2]

        category_predictions = None
        if target_type in prediction_categories:
            category_mask = protein_mask & (
                prediction_df['GO_category'] == go_category_reverse_mapping[target_type]
            )
            category_predictions = prediction_df[category_mask]

        if category_predictions is None or len(category_predictions) == 0:
            # Non-GO edge type, or no prediction for this protein/category:
            # keep the first edges without probabilities.
            filtered_edges[edge_type] = [(edge, None) for edge in ordered_edges[:limit]]
            continue

        # Index edges by target term once instead of rescanning every edge
        # for every prediction row.
        edges_by_term = {}
        for edge in ordered_edges:
            edges_by_term.setdefault(edge[1], []).append(edge)

        ranked = category_predictions.sort_values(
            by='Probability', ascending=False, kind='mergesort'
        )
        valid_edges = []
        for term, prob in zip(ranked['GO_ID'], ranked['Probability']):
            valid_edges.extend((edge, prob) for edge in edges_by_term.get(term, ()))
            if len(valid_edges) >= limit:
                break
        # Several edges may match the last term; never return more than limit.
        filtered_edges[edge_type] = valid_edges[:limit]
    return filtered_edges
def visualize_protein_subgraph(data, protein_id, prediction_df, limit=10):
    """Render the neighbourhood of one protein as an interactive HTML graph.

    Edges are gathered with ``_gather_protein_edges``, reduced with
    ``_filter_edges`` and drawn with a pyvis ``Network``. The page is saved
    as ``temp_viz/<protein_id>_graph.html`` (the directory is created when
    missing).

    Args:
        data: Graph object passed through to ``_gather_protein_edges``.
        protein_id: ID of the protein at the centre of the graph.
        prediction_df: Prediction DataFrame passed to ``_filter_edges``.
        limit: Maximum number of edges per edge type.

    Returns:
        Tuple ``(file_path, visualized_edges)``: the path of the saved HTML
        file and the filtered edge dictionary that was drawn.
    """
    import json

    visualized_edges = _filter_edges(
        protein_id,
        _gather_protein_edges(data, protein_id),
        prediction_df,
        limit,
    )
    print(f'Edges to be visualized: {visualized_edges}')

    net = Network(height="600px", width="100%", directed=True, notebook=False)

    # One vis.js group per node type, background and border in the same colour.
    groups_json = json.dumps({
        node_type: {"color": {"background": color, "border": color}}
        for node_type, color in NODE_TYPE_COLORS.items()
    })

    # Physics/layout/interaction options; the groups JSON is appended as the
    # last key of the options object.
    options_head = """{
        "physics": {
            "enabled": true,
            "barnesHut": {
                "gravitationalConstant": -1000,
                "springLength": 250,
                "springConstant": 0.001,
                "damping": 0.09,
                "avoidOverlap": 0
            },
            "forceAtlas2Based": {
                "gravitationalConstant": -50,
                "centralGravity": 0.01,
                "springLength": 100,
                "springConstant": 0.08,
                "damping": 0.4,
                "avoidOverlap": 0
            },
            "solver": "barnesHut",
            "stabilization": {
                "enabled": true,
                "iterations": 1000,
                "updateInterval": 25
            }
        },
        "layout": {
            "improvedLayout": true,
            "hierarchical": {
                "enabled": false
            }
        },
        "interaction": {
            "hover": true,
            "navigationButtons": true,
            "multiselect": true
        },
        "configure": {
            "enabled": true,
            "filter": ["physics", "layout", "manipulation"],
            "showButton": true
        },
        "groups": """
    net.set_options(options_head + groups_json + "}")

    # Central protein node: larger, heavier, white fill with a red border.
    net.add_node(
        protein_id,
        label=f"Protein: {protein_id}",
        color={'background': 'white', 'border': '#c1121f'},
        borderWidth=4,
        shape="dot",
        font={'color': '#000000', 'size': 15},
        group='Protein',
        size=30,
        mass=2.5,
    )

    # IDs already present in the network, so no node is added twice.
    seen_nodes = {protein_id}

    def _ensure_node(node_id, node_type):
        # Add a neighbour node the first time its ID is encountered.
        if node_id in seen_nodes:
            return
        net.add_node(
            node_id,
            label=f"{node_id}",
            shape="dot",
            font={'color': '#000000', 'size': 12},
            title=f"{node_type}: {node_id}",
            group=node_type,
            size=15,
            mass=1.5,
        )
        seen_nodes.add(node_id)

    for (source_type, relation_type, target_type), edge_entries in visualized_edges.items():
        for (source, target), probability in edge_entries:
            source_str, target_str = str(source), str(target)
            _ensure_node(source_str, source_type)
            _ensure_node(target_str, target_type)

            # The label is the relation name, plus the probability if known.
            edge_label = f"{relation_type}"
            edge_options = {}
            if probability is not None:
                edge_label += f"(P={probability:.2f})"
                edge_options['label'] = edge_label
            else:
                edge_options['label'] = edge_label
                # Zero-size font hides the label of edges without probability.
                edge_options['font'] = {'size': 0}
            edge_options['color'] = '#666666'
            edge_options['title'] = edge_label
            edge_options['length'] = 200
            edge_options['smooth'] = {'type': 'curvedCW', 'roundness': 0.1}
            net.add_edge(source_str, target_str, **edge_options)

    # Save graph to a protein-specific file in a temporary directory
    output_dir = 'temp_viz'
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, f'{protein_id}_graph.html')
    net.save_graph(file_path)
    return file_path, visualized_edges