Spaces:

ronig
/

protein_binding_search

Running

App Files Files Community

roni committed on Apr 4, 2023

Commit

33eb5d4

1 Parent(s): e1f535f

per gene aggregation

Browse files

Files changed (2) hide show

app.py +50 -22
protein_viz.py +1 -1

app.py CHANGED Viewed

@@ -1,7 +1,10 @@
 import gradio as gr
 from get_index import get_engines
-from protein_viz import get_protein_name, render_html
 index_repo = "ronig/siamese_protein_index"
 model_repo = "ronig/protein_search_engine"
@@ -13,14 +16,17 @@ This application enables a quick protein-peptide binding search based on sequenc
 You can use it to search the full [PDB](https://www.rcsb.org/) database or in a specific organism genome.
 """
 max_results = 1000
-def search_and_display(seq, n_res, index_selection):
-    n_res = int(limit_n_results(n_res))
     engine = engines[index_selection]
-    search_res = engine.search_by_sequence(seq, n=n_res)
-    results_options = update_dropdown_menu(search_res)
-    formatted_search_results = format_search_results(search_res)
     return formatted_search_results, results_options
@@ -28,12 +34,42 @@ def limit_n_results(n):
     return max(min(n, max_results), 1)
-def update_dropdown_menu(search_res):
     choices = []
-    for row in search_res:
-        if "pdb_name" in row and "chain_id" in row:
-            choice = ".".join([row["pdb_name"], row["chain_id"]])
             choices.append(choice)
     if choices:
         update = gr.Dropdown.update(
             choices=choices, interactive=True, value=choices[0], visible=True
@@ -45,14 +81,6 @@ def update_dropdown_menu(search_res):
     return update
-def format_search_results(raw_search_results):
-    formatted_search_results = {}
-    for res in raw_search_results:
-        key, value = parse_pdb_search_result(res)
-        formatted_search_results[key] = value
-    return formatted_search_results
 def parse_pdb_search_result(raw_result):
     prot = raw_result["pdb_name"]
     chain = raw_result["chain_id"]
@@ -71,12 +99,12 @@ def switch_viz(new_choice):
         title_update = gr.Markdown.update(visible=False)
         description_update = gr.Markdown.update(value=None, visible=False)
     else:
-        choice_parts = new_choice.split(".")
-        pdb_id, chain = choice_parts[0], choice_parts[1]
         title_update = gr.Markdown.update(visible=True)
-        protein_name = get_protein_name(pdb_id)
-        new_value = f"""**PDB Title**: {protein_name}"""
         description_update = gr.Markdown.update(value=new_value, visible=True)
         html = render_html(pdb_id=pdb_id, chain=chain)

+import collections
+from typing import Dict, List
 import gradio as gr
 from get_index import get_engines
+from protein_viz import get_pdb_title, render_html
 index_repo = "ronig/siamese_protein_index"
 model_repo = "ronig/protein_search_engine"
 You can use it to search the full [PDB](https://www.rcsb.org/) database or in a specific organism genome.
 """
 max_results = 1000
+choice_sep = " | "
def search_and_display(seq, max_res, index_selection):
    """Run a sequence search and prepare both UI outputs.

    The engine chosen by ``index_selection`` is queried for a fixed, large
    number of raw hits; those hits are then grouped per gene, with the number
    of genes capped at ``max_res`` after it has been passed through
    ``limit_n_results``.

    Args:
        seq: Query sequence forwarded to the engine's ``search_by_sequence``.
        max_res: Requested maximum number of aggregated results.
        index_selection: Key into the module-level ``engines`` mapping.

    Returns:
        A ``(formatted_results, dropdown_update)`` tuple built from the same
        per-gene aggregation.
    """
    # Raw hits requested from the engine before per-gene aggregation.
    n_search_res = 10000
    result_cap = int(limit_n_results(max_res))
    raw_hits = engines[index_selection].search_by_sequence(seq, n=n_search_res)
    hits_by_gene = aggregate_search_results(raw_hits, result_cap)
    return format_search_results(hits_by_gene), update_dropdown_menu(hits_by_gene)
     return max(min(n, max_results), 1)
def aggregate_search_results(
    raw_results: List[dict], max_res: int
) -> Dict[str, List[dict]]:
    """Group raw search hits by gene name, keeping at most ``max_res`` genes.

    Each hit is reduced to its ``pdb_name``, ``chain_id``, ``score`` and
    ``organism`` fields and appended to the bucket of every gene named in the
    hit's whitespace-separated ``genes`` string. Hits whose ``genes`` value is
    ``None`` or blank are skipped. Genes appear in the order they are first
    encountered, and entries within a gene keep the input order.

    Args:
        raw_results: Search hits; each must provide the keys ``pdb_name``,
            ``chain_id``, ``score``, ``organism`` and ``genes``.
        max_res: Maximum number of distinct genes to collect. Aggregation
            stops as soon as this many genes have been seen; a non-positive
            value yields an empty result.

    Returns:
        A plain dict mapping each gene name to its list of reduced hits.

    Raises:
        KeyError: If a hit lacks one of the required keys.
    """
    if max_res <= 0:
        return {}

    entry_keys = ("pdb_name", "chain_id", "score", "organism")
    aggregated_by_gene = collections.defaultdict(list)
    for raw_result in raw_results:
        entry = {key: raw_result[key] for key in entry_keys}
        genes = raw_result["genes"]
        if not genes:
            continue
        # split() without a separator drops empty tokens, so repeated,
        # leading or trailing spaces never create a bogus "" gene bucket.
        for gene in genes.split():
            aggregated_by_gene[gene].append(entry)
            if len(aggregated_by_gene) >= max_res:
                return dict(aggregated_by_gene)
    return dict(aggregated_by_gene)
def select_keys(d: dict, keys: List[str]):
    """Return a new dict containing only the entries of ``d`` named in ``keys``.

    The result follows the order of ``keys``. A key that is absent from ``d``
    raises ``KeyError``.
    """
    subset = {}
    for name in keys:
        subset[name] = d[name]
    return subset
def format_search_results(agg_search_results):
    """Turn per-gene aggregated hits into a ``label -> score`` mapping.

    For every gene, only the first entry in its list is consulted: its
    ``organism`` goes into the label and its ``score`` becomes the value.

    Args:
        agg_search_results: Mapping of gene name to a non-empty list of hit
            dicts that carry ``organism`` and ``score`` keys.

    Returns:
        Dict keyed by ``"Gene: <gene> | Organism: <organism>"`` with the first
        entry's score as the value.
    """
    first_hits = ((gene, hits[0]) for gene, hits in agg_search_results.items())
    return {
        f"Gene: {gene} | Organism: {hit['organism']}": hit["score"]
        for gene, hit in first_hits
    }
+def update_dropdown_menu(agg_search_res):
     choices = []
+    for gene, entries in agg_search_res.items():
+        for entry in entries:
+            choice = choice_sep.join([gene, entry["pdb_name"], entry["chain_id"]])
             choices.append(choice)
     if choices:
         update = gr.Dropdown.update(
             choices=choices, interactive=True, value=choices[0], visible=True
     return update
 def parse_pdb_search_result(raw_result):
     prot = raw_result["pdb_name"]
     chain = raw_result["chain_id"]
         title_update = gr.Markdown.update(visible=False)
         description_update = gr.Markdown.update(value=None, visible=False)
     else:
+        choice_parts = new_choice.split(choice_sep)
+        pdb_id, chain = choice_parts[1:3]
         title_update = gr.Markdown.update(visible=True)
+        pdb_title = get_pdb_title(pdb_id)
+        new_value = f"""**PDB Title**: {pdb_title}"""
         description_update = gr.Markdown.update(value=new_value, visible=True)
         html = render_html(pdb_id=pdb_id, chain=chain)

protein_viz.py CHANGED Viewed

@@ -30,7 +30,7 @@ def render_html(pdb_id, chain):
     return iframe
-def get_protein_name(pdb_id: str):
     url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
     response = requests.get(url, timeout=1)
     if response.ok:

     return iframe
+def get_pdb_title(pdb_id: str):
     url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
     response = requests.get(url, timeout=1)
     if response.ok: