roni committed on
Commit
6f068fd
·
1 Parent(s): 9ec67f0

deduplicating on uniprot id

Browse files
Files changed (2) hide show
  1. app.py +21 -15
  2. get_index.py +2 -1
app.py CHANGED
@@ -42,20 +42,19 @@ def limit_n_results(n):
42
 
43
 
44
  def aggregate_search_results(raw_results: List[dict], max_res: int) -> Dict[str, dict]:
45
- aggregated_by_gene = collections.defaultdict(list)
46
  for raw_result in raw_results:
47
  entry = select_keys(
48
  raw_result,
49
- keys=["pdb_name", "chain_id", "score", "organism", "uniprot_id"]
50
  )
51
- genes = raw_result["genes"]
52
- if genes is not None:
53
- gene_names = genes.split(" ")
54
- for gene in gene_names:
55
- aggregated_by_gene[gene].append(entry)
56
- if len(aggregated_by_gene) >= max_res:
57
- return dict(aggregated_by_gene)
58
- return dict(aggregated_by_gene)
59
 
60
 
61
  def select_keys(d: dict, keys: List[str]):
@@ -64,21 +63,28 @@ def select_keys(d: dict, keys: List[str]):
64
 
65
  def format_search_results(agg_search_results):
66
  formatted_search_results = {}
67
- for gene, entries in agg_search_results.items():
68
  entry = entries[0]
69
  organism = entry["organism"]
70
  score = entry["score"]
71
- uniprot_id = entry["uniprot_id"]
72
- key = f"Gene: {gene} | Organism: {organism} | Uniprot ID: {uniprot_id}"
73
  formatted_search_results[key] = score
74
  return formatted_search_results
75
 
76
 
77
  def update_dropdown_menu(agg_search_res):
78
  choices = []
79
- for gene, entries in agg_search_res.items():
80
  for entry in entries:
81
- choice = choice_sep.join([gene, entry["pdb_name"], entry["chain_id"]])
 
 
 
 
 
 
 
82
  choices.append(choice)
83
 
84
  if choices:
 
42
 
43
 
44
  def aggregate_search_results(raw_results: List[dict], max_res: int) -> Dict[str, dict]:
45
+ aggregated_by_uniprot = collections.defaultdict(list)
46
  for raw_result in raw_results:
47
  entry = select_keys(
48
  raw_result,
49
+ keys=["pdb_name", "chain_id", "score", "organism", "uniprot_id", "genes"],
50
  )
51
+ uniprot_id = raw_result["uniprot_id"]
52
+
53
+ if uniprot_id is not None:
54
+ aggregated_by_uniprot[uniprot_id].append(entry)
55
+ if len(aggregated_by_uniprot) >= max_res:
56
+ return dict(aggregated_by_uniprot)
57
+ return dict(aggregated_by_uniprot)
 
58
 
59
 
60
  def select_keys(d: dict, keys: List[str]):
 
63
 
64
  def format_search_results(agg_search_results):
65
  formatted_search_results = {}
66
+ for uniprot_id, entries in agg_search_results.items():
67
  entry = entries[0]
68
  organism = entry["organism"]
69
  score = entry["score"]
70
+ genes = entry["genes"]
71
+ key = f"Uniprot ID: {uniprot_id} | Organism: {organism} | Gene Names: {genes}"
72
  formatted_search_results[key] = score
73
  return formatted_search_results
74
 
75
 
76
  def update_dropdown_menu(agg_search_res):
77
  choices = []
78
+ for uniprot_id, entries in agg_search_res.items():
79
  for entry in entries:
80
+ choice = choice_sep.join(
81
+ [
82
+ uniprot_id,
83
+ entry["pdb_name"],
84
+ entry["chain_id"],
85
+ entry["genes"] or "",
86
+ ]
87
+ )
88
  choices.append(choice)
89
 
90
  if choices:
get_index.py CHANGED
@@ -18,7 +18,8 @@ def get_engines(index_repo: str, model_repo: str):
18
  )
19
  sys.path.append(str(local_arch_path))
20
  from protein_index import ( # pylint: disable=import-error,import-outside-toplevel
21
- ProteinSearchEngine, ProteinIndexError
 
22
  )
23
 
24
  subindex_paths = glob(str(index_path / "*/"))
 
18
  )
19
  sys.path.append(str(local_arch_path))
20
  from protein_index import ( # pylint: disable=import-error,import-outside-toplevel
21
+ ProteinSearchEngine,
22
+ ProteinIndexError,
23
  )
24
 
25
  subindex_paths = glob(str(index_path / "*/"))