roni committed on
Commit
33eb5d4
1 Parent(s): e1f535f

per gene aggregation

Browse files
Files changed (2) hide show
  1. app.py +50 -22
  2. protein_viz.py +1 -1
app.py CHANGED
@@ -1,7 +1,10 @@
 
 
 
1
  import gradio as gr
2
 
3
  from get_index import get_engines
4
- from protein_viz import get_protein_name, render_html
5
 
6
  index_repo = "ronig/siamese_protein_index"
7
  model_repo = "ronig/protein_search_engine"
@@ -13,14 +16,17 @@ This application enables a quick protein-peptide binding search based on sequenc
13
  You can use it to search the full [PDB](https://www.rcsb.org/) database or in a specific organism genome.
14
  """
15
  max_results = 1000
 
16
 
17
 
18
- def search_and_display(seq, n_res, index_selection):
19
- n_res = int(limit_n_results(n_res))
 
20
  engine = engines[index_selection]
21
- search_res = engine.search_by_sequence(seq, n=n_res)
22
- results_options = update_dropdown_menu(search_res)
23
- formatted_search_results = format_search_results(search_res)
 
24
  return formatted_search_results, results_options
25
 
26
 
@@ -28,12 +34,42 @@ def limit_n_results(n):
28
  return max(min(n, max_results), 1)
29
 
30
 
31
- def update_dropdown_menu(search_res):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  choices = []
33
- for row in search_res:
34
- if "pdb_name" in row and "chain_id" in row:
35
- choice = ".".join([row["pdb_name"], row["chain_id"]])
36
  choices.append(choice)
 
37
  if choices:
38
  update = gr.Dropdown.update(
39
  choices=choices, interactive=True, value=choices[0], visible=True
@@ -45,14 +81,6 @@ def update_dropdown_menu(search_res):
45
  return update
46
 
47
 
48
- def format_search_results(raw_search_results):
49
- formatted_search_results = {}
50
- for res in raw_search_results:
51
- key, value = parse_pdb_search_result(res)
52
- formatted_search_results[key] = value
53
- return formatted_search_results
54
-
55
-
56
  def parse_pdb_search_result(raw_result):
57
  prot = raw_result["pdb_name"]
58
  chain = raw_result["chain_id"]
@@ -71,12 +99,12 @@ def switch_viz(new_choice):
71
  title_update = gr.Markdown.update(visible=False)
72
  description_update = gr.Markdown.update(value=None, visible=False)
73
  else:
74
- choice_parts = new_choice.split(".")
75
- pdb_id, chain = choice_parts[0], choice_parts[1]
76
  title_update = gr.Markdown.update(visible=True)
77
- protein_name = get_protein_name(pdb_id)
78
 
79
- new_value = f"""**PDB Title**: {protein_name}"""
80
 
81
  description_update = gr.Markdown.update(value=new_value, visible=True)
82
  html = render_html(pdb_id=pdb_id, chain=chain)
 
1
+ import collections
2
+ from typing import Dict, List
3
+
4
  import gradio as gr
5
 
6
  from get_index import get_engines
7
+ from protein_viz import get_pdb_title, render_html
8
 
9
  index_repo = "ronig/siamese_protein_index"
10
  model_repo = "ronig/protein_search_engine"
 
16
  You can use it to search the full [PDB](https://www.rcsb.org/) database or in a specific organism genome.
17
  """
18
  max_results = 1000
19
+ choice_sep = " | "
20
 
21
 
22
def search_and_display(seq, max_res, index_selection):
    """Run a sequence search and prepare both outputs for the UI.

    The selected engine is queried for a fixed pool of 10000 raw hits, which
    are then grouped per gene by ``aggregate_search_results`` using the
    clamped ``max_res`` as the limit.

    Args:
        seq: Query sequence handed to the engine's ``search_by_sequence``.
        max_res: Requested result limit; clamped via ``limit_n_results`` and
            converted to ``int`` before use.
        index_selection: Key into the module-level ``engines`` mapping.

    Returns:
        A ``(formatted_results, dropdown_update)`` tuple built from the same
        aggregated hits.
    """
    raw_hit_count = 10000
    max_res = int(limit_n_results(max_res))
    hits = engines[index_selection].search_by_sequence(seq, n=raw_hit_count)
    per_gene = aggregate_search_results(hits, max_res)
    label_scores = format_search_results(per_gene)
    dropdown_update = update_dropdown_menu(per_gene)
    return label_scores, dropdown_update
31
 
32
 
 
34
  return max(min(n, max_results), 1)
35
 
36
 
37
def aggregate_search_results(
    raw_results: List[dict], max_res: int
) -> Dict[str, List[dict]]:
    """Group raw search hits by gene name, keeping at most ``max_res`` genes.

    Each raw result contributes one trimmed entry (``pdb_name``, ``chain_id``,
    ``score``, ``organism``) to every gene named in its whitespace-separated
    ``genes`` field. Results whose ``genes`` field is ``None`` are skipped.
    Collection stops as soon as ``max_res`` distinct genes have been seen, so
    genes appear in the order they are first encountered in ``raw_results``.

    Args:
        raw_results: Search hits; each must provide the keys ``genes``,
            ``pdb_name``, ``chain_id``, ``score`` and ``organism``.
        max_res: Maximum number of distinct genes to return.

    Returns:
        A plain dict mapping each gene name to the list of entries collected
        for it.

    Raises:
        KeyError: If a raw result lacks one of the required keys.
    """
    aggregated_by_gene = collections.defaultdict(list)
    for raw_result in raw_results:
        entry = select_keys(raw_result, ["pdb_name", "chain_id", "score", "organism"])
        genes = raw_result["genes"]
        if genes is None:
            continue
        # split() with no separator drops empty tokens, so an empty or
        # doubly-spaced genes string never creates a "" gene bucket.
        for gene in genes.split():
            aggregated_by_gene[gene].append(entry)
            # Check after every insertion so a hit listing several genes
            # cannot push the result past the requested limit.
            if len(aggregated_by_gene) >= max_res:
                return dict(aggregated_by_gene)
    return dict(aggregated_by_gene)
49
+
50
+
51
def select_keys(d: dict, keys: List[str]):
    """Return a new dict holding only the requested ``keys`` of ``d``.

    Keys are copied in the order given. A ``KeyError`` is raised if any
    requested key is absent from ``d``; ``d`` itself is not modified.
    """
    picked = {}
    for name in keys:
        picked[name] = d[name]
    return picked
53
+
54
+
55
def format_search_results(agg_search_results):
    """Turn per-gene aggregated hits into a ``label -> score`` mapping.

    For every gene, the first entry in its list supplies both the organism
    shown in the label and the score used as the value; the remaining
    entries are ignored here.
    """
    top_hits = (
        (gene, entries[0]) for gene, entries in agg_search_results.items()
    )
    return {
        f"Gene: {gene} | Organism: {hit['organism']}": hit["score"]
        for gene, hit in top_hits
    }
64
+
65
+
66
+ def update_dropdown_menu(agg_search_res):
67
  choices = []
68
+ for gene, entries in agg_search_res.items():
69
+ for entry in entries:
70
+ choice = choice_sep.join([gene, entry["pdb_name"], entry["chain_id"]])
71
  choices.append(choice)
72
+
73
  if choices:
74
  update = gr.Dropdown.update(
75
  choices=choices, interactive=True, value=choices[0], visible=True
 
81
  return update
82
 
83
 
 
 
 
 
 
 
 
 
84
  def parse_pdb_search_result(raw_result):
85
  prot = raw_result["pdb_name"]
86
  chain = raw_result["chain_id"]
 
99
  title_update = gr.Markdown.update(visible=False)
100
  description_update = gr.Markdown.update(value=None, visible=False)
101
  else:
102
+ choice_parts = new_choice.split(choice_sep)
103
+ pdb_id, chain = choice_parts[1:3]
104
  title_update = gr.Markdown.update(visible=True)
105
+ pdb_title = get_pdb_title(pdb_id)
106
 
107
+ new_value = f"""**PDB Title**: {pdb_title}"""
108
 
109
  description_update = gr.Markdown.update(value=new_value, visible=True)
110
  html = render_html(pdb_id=pdb_id, chain=chain)
protein_viz.py CHANGED
@@ -30,7 +30,7 @@ def render_html(pdb_id, chain):
30
  return iframe
31
 
32
 
33
- def get_protein_name(pdb_id: str):
34
  url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
35
  response = requests.get(url, timeout=1)
36
  if response.ok:
 
30
  return iframe
31
 
32
 
33
+ def get_pdb_title(pdb_id: str):
34
  url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
35
  response = requests.get(url, timeout=1)
36
  if response.ok: