matanninio commited on
Commit
71382c0
·
1 Parent(s): 81fb8a8

first attempt on unified test - the actual use case needs to be clearer

Browse files
Files changed (4) hide show
  1. .pre-commit-config.yaml +49 -0
  2. README.md +2 -2
  3. app.py +173 -41
  4. requirements.txt +1 -0
.pre-commit-config.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ exclude: .*\.pdb$
2
+
3
+ repos:
4
+ - repo: https://github.com/pre-commit/pre-commit-hooks
5
+ rev: v4.6.0
6
+ hooks:
7
+ - id: check-case-conflict
8
+ - id: end-of-file-fixer
9
+ - id: mixed-line-ending
10
+ - id: trailing-whitespace
11
+ - repo: https://github.com/psf/black
12
+ rev: 24.8.0
13
+ hooks:
14
+ - id: black
15
+ - repo: https://github.com/PyCQA/flake8
16
+ rev: 5.0.4
17
+ hooks:
18
+ - id: flake8
19
+ args:
20
+ - "--ignore=E203,E266,E501,F405,F403,W503"
21
+ - "--statistics"
22
+
23
+ - repo: https://github.com/astral-sh/ruff-pre-commit
24
+ # Ruff version.
25
+ rev: v0.6.5
26
+ hooks:
27
+ - id: ruff
28
+ args:
29
+ - "--fix"
30
+ - "--select"
31
+ - "UP,PT,I,E"#,F,W,C90,I,N,F405,E402" # Specify the rules to select
32
+ - "--line-length"
33
+ - "88"
34
+ - "--exit-non-zero-on-fix"
35
+ - "--ignore"
36
+ - "F405,F403,E501,E402,PT018,PT015,E722,E741"
37
+ types_or: [ python, pyi] #, jupyter ]
38
+ - repo: https://github.com/pre-commit/mirrors-mypy
39
+ rev: v1.13.0
40
+ hooks:
41
+ - id: mypy
42
+
43
+ - repo: https://github.com/srstevenson/nb-clean
44
+ rev: "2.4.0"
45
+ hooks:
46
+ - id: nb-clean
47
+ args:
48
+ - --remove-empty-cells
49
+ - --preserve-cell-outputs
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Biomed-multi-alignment Protein-Protein-Interaction
3
  emoji: 🐁
4
  colorFrom: gray
5
  colorTo: purple
@@ -8,7 +8,7 @@ sdk_version: 5.4.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
- short_description: Demo for MAMMAL approch Protein-Protein Interaction query
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Biomed-multi-alignment (PPI and DTI)
3
  emoji: 🐁
4
  colorFrom: gray
5
  colorTo: purple
 
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ short_description: Demo for MAMMAL approach Protein-Protein Interaction and Drug-Target Binding Affinity
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,112 +1,244 @@
1
  import gradio as gr
2
-
3
  import torch
4
  from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
5
- from mammal.model import Mammal
6
  from mammal.keys import *
 
 
 
 
 
 
 
7
 
 
 
 
8
 
9
 
10
- model_path="ibm/biomed.omics.bl.sm.ma-ted-458m"
11
- # Load Model
12
- model = Mammal.from_pretrained(model_path)
13
- model.eval()
14
 
15
- # Load Tokenizer
16
- tokenizer_op = ModularTokenizerOp.from_pretrained(model_path)
17
 
18
- #token for positive binding
19
- positive_token_id=tokenizer_op.get_token_id("<1>")
 
 
 
 
 
 
 
 
 
 
20
 
21
  # Default input proteins
22
  protein_calmodulin = "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK"
23
  protein_calcineurin = "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ"
24
 
25
 
26
- def format_prompt(prot1,prot2):
27
  # Formatting prompt to match pre-training syntax
28
  return f"<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0><MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN><SEQUENCE_NATURAL_START>{prot1}<SEQUENCE_NATURAL_END><MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN><SEQUENCE_NATURAL_START>{prot2}<SEQUENCE_NATURAL_END><EOS>"
29
 
 
30
  def run_prompt(prompt):
31
  # Create and load sample
32
  sample_dict = dict()
33
  sample_dict[ENCODER_INPUTS_STR] = prompt
34
 
35
  # Tokenize
36
- sample_dict=tokenizer_op(
37
  sample_dict=sample_dict,
38
  key_in=ENCODER_INPUTS_STR,
39
  key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
40
  key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
41
  )
42
- sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(sample_dict[ENCODER_INPUTS_TOKENS])
43
- sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(sample_dict[ENCODER_INPUTS_ATTENTION_MASK])
44
-
 
 
 
45
 
46
  # Generate Prediction
47
- batch_dict = model.generate(
48
  [sample_dict],
49
  output_scores=True,
50
  return_dict_in_generate=True,
51
  max_new_tokens=5,
52
- )
53
-
54
 
55
  # Get output
56
- generated_output = tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0])
57
- score = batch_dict['model.out.scores'][0][1][positive_token_id].item()
58
-
59
- return generated_output,score
60
-
61
- def create_and_run_prompt(prot1, prot2):
62
- prompt = format_prompt(prot1, prot2)
63
- res=prompt, *run_prompt(prompt=prompt)
 
64
  return res
65
 
66
- def create_application():
 
67
  markup_text = f"""
68
  # Mammal based Protein-Protein Interaction (PPI) demonstration
69
 
70
  Given two protein sequences, estimate if the proteins interact or not.
71
 
72
- ### Using the model from
73
 
74
- ```{model_path} ```
75
  """
76
-
77
- with gr.Blocks() as demo:
78
  gr.Markdown(markup_text)
79
  with gr.Row():
80
  prot1 = gr.Textbox(
81
  label="Protein 1 sequence",
82
  # info="standard",
83
  interactive=True,
84
- lines=1,
85
  value=protein_calmodulin,
86
  )
87
  prot2 = gr.Textbox(
88
  label="Protein 2 sequence",
89
  # info="standard",
90
  interactive=True,
91
- lines=1,
92
  value=protein_calcineurin,
93
  )
94
  with gr.Row():
95
- run_mammal = gr.Button("Run Mammal prompt for Protein-Protein Interaction",variant='primary')
 
 
96
  with gr.Row():
97
- prompt_box = gr.Textbox(label="Mammal prompt",lines=5)
98
-
99
  with gr.Row():
100
  decoded = gr.Textbox(label="Mammal output")
101
  run_mammal.click(
102
  fn=create_and_run_prompt,
103
- inputs=[prot1,prot2],
104
- outputs=[prompt_box,decoded,gr.Number(label='PPI score')]
105
  )
106
  with gr.Row():
107
- gr.Markdown("```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting")
108
-
109
- return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  def main():
112
  demo = create_application()
 
1
  import gradio as gr
 
2
  import torch
3
  from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
4
+ from mammal.examples.dti_bindingdb_kd.task import DtiBindingdbKdTask
5
  from mammal.keys import *
6
+ from mammal.model import Mammal
7
+
8
+ model_paths = dict()
9
+
10
+ # Protein protein interaction:
11
+ ppi = "Protein-Protein Interaction (PPI)"
12
+ model_paths[ppi] = "ibm/biomed.omics.bl.sm.ma-ted-458m"
13
 
14
+ # Drug-target binding affinity (DTI):
15
+ dti = "Drug-Target Binding Affinity"
16
+ model_paths[dti] = "ibm/biomed.omics.bl.sm.ma-ted-458m.dti_bindingdb_pkd"
17
 
18
 
19
+ # load models (should probably be lazy)
 
 
 
20
 
21
+ models = dict()
22
+ tokenizer_op = dict()
23
 
24
+
25
+ for task, model_path in model_paths.items():
26
+ if task not in models:
27
+ models[task] = Mammal.from_pretrained(model_path)
28
+ models[task].eval()
29
+ # Load Tokenizer
30
+ tokenizer_op[task] = ModularTokenizerOp.from_pretrained(model_path)
31
+
32
+
33
+ ### PPI:
34
+ # token for positive binding
35
+ positive_token_id = tokenizer_op[ppi].get_token_id("<1>")
36
 
37
  # Default input proteins
38
  protein_calmodulin = "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK"
39
  protein_calcineurin = "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ"
40
 
41
 
42
+ def format_prompt_ppi(prot1, prot2):
43
  # Formatting prompt to match pre-training syntax
44
  return f"<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0><MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN><SEQUENCE_NATURAL_START>{prot1}<SEQUENCE_NATURAL_END><MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN><SEQUENCE_NATURAL_START>{prot2}<SEQUENCE_NATURAL_END><EOS>"
45
 
46
+
47
  def run_prompt(prompt):
48
  # Create and load sample
49
  sample_dict = dict()
50
  sample_dict[ENCODER_INPUTS_STR] = prompt
51
 
52
  # Tokenize
53
+ sample_dict = tokenizer_op[ppi](
54
  sample_dict=sample_dict,
55
  key_in=ENCODER_INPUTS_STR,
56
  key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
57
  key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
58
  )
59
+ sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(
60
+ sample_dict[ENCODER_INPUTS_TOKENS]
61
+ )
62
+ sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(
63
+ sample_dict[ENCODER_INPUTS_ATTENTION_MASK]
64
+ )
65
 
66
  # Generate Prediction
67
+ batch_dict = models[ppi].generate(
68
  [sample_dict],
69
  output_scores=True,
70
  return_dict_in_generate=True,
71
  max_new_tokens=5,
72
+ )
 
73
 
74
  # Get output
75
+ generated_output = tokenizer_op[ppi]._tokenizer.decode(batch_dict[CLS_PRED][0])
76
+ score = batch_dict["model.out.scores"][0][1][positive_token_id].item()
77
+
78
+ return generated_output, score
79
+
80
+
81
+ def create_and_run_prompt(protein1, protein2):
82
+ prompt = format_prompt_ppi(protein1, protein2)
83
+ res = prompt, *run_prompt(prompt=prompt)
84
  return res
85
 
86
+
87
+ def create_ppi_demo():
88
  markup_text = f"""
89
  # Mammal based Protein-Protein Interaction (PPI) demonstration
90
 
91
  Given two protein sequences, estimate if the proteins interact or not.
92
 
93
+ ### Using the model from
94
 
95
+ ```{model_paths[ppi]} ```
96
  """
97
+ with gr.Group() as ppi_demo:
 
98
  gr.Markdown(markup_text)
99
  with gr.Row():
100
  prot1 = gr.Textbox(
101
  label="Protein 1 sequence",
102
  # info="standard",
103
  interactive=True,
104
+ lines=3,
105
  value=protein_calmodulin,
106
  )
107
  prot2 = gr.Textbox(
108
  label="Protein 2 sequence",
109
  # info="standard",
110
  interactive=True,
111
+ lines=3,
112
  value=protein_calcineurin,
113
  )
114
  with gr.Row():
115
+ run_mammal = gr.Button(
116
+ "Run Mammal prompt for Protein-Protein Interaction", variant="primary"
117
+ )
118
  with gr.Row():
119
+ prompt_box = gr.Textbox(label="Mammal prompt", lines=5)
120
+
121
  with gr.Row():
122
  decoded = gr.Textbox(label="Mammal output")
123
  run_mammal.click(
124
  fn=create_and_run_prompt,
125
+ inputs=[prot1, prot2],
126
+ outputs=[prompt_box, decoded, gr.Number(label="PPI score")],
127
  )
128
  with gr.Row():
129
+ gr.Markdown(
130
+ "```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting"
131
+ )
132
+ ppi_demo.visible = False
133
+ return ppi_demo
134
+
135
+
136
+ ### DTI:
137
+ # Default input target protein sequence and drug (SMILES)
138
+ target_seq = "NLMKRCTRGFRKLGKCTTLEEEKCKTLYPRGQCTCSDSKMNTHSCDCKSC"
139
+ drug_seq = "CC(=O)NCCC1=CNc2c1cc(OC)cc2"
140
+
141
+
142
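+ # token for positive binding (NOTE(review): this rebinds the module-level positive_token_id that run_prompt reads for the PPI score - confirm both tokenizers map "<1>" to the same id)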
143
+ positive_token_id = tokenizer_op[dti].get_token_id("<1>")
144
+
145
+
146
+ def format_prompt_dti(prot, drug):
147
+ sample_dict = {"target_seq": target_seq, "drug_seq": drug_seq}
148
+ sample_dict = DtiBindingdbKdTask.data_preprocessing(
149
+ sample_dict=sample_dict,
150
+ tokenizer_op=tokenizer_op[dti],
151
+ target_sequence_key="target_seq",
152
+ drug_sequence_key="drug_seq",
153
+ norm_y_mean=None,
154
+ norm_y_std=None,
155
+ device=models[dti].device,
156
+ )
157
+ return sample_dict
158
+
159
+
160
+ def create_and_run_prompt_dtb(prot, drug):
161
+ sample_dict = format_prompt_dti(prot, drug)
162
+ # Post-process the model's output
163
+ # batch_dict = model_dti.forward_encoder_only([sample_dict])
164
+ batch_dict = models[dti].forward_encoder_only([sample_dict])
165
+ batch_dict = DtiBindingdbKdTask.process_model_output(
166
+ batch_dict,
167
+ scalars_preds_processed_key="model.out.dti_bindingdb_kd",
168
+ norm_y_mean=5.79384684128215,
169
+ norm_y_std=1.33808027428196,
170
+ )
171
+ ans = [
172
+ "model.out.dti_bindingdb_kd",
173
+ float(batch_dict["model.out.dti_bindingdb_kd"][0]),
174
+ ]
175
+ res = sample_dict["data.query.encoder_input"], *ans
176
+ return res
177
+
178
+
179
+ def create_tdb_demo():
180
+ markup_text = f"""
181
+ # Mammal based Target-Drug binding affinity demonstration
182
+
183
+ Given a protein sequence and a drug (in SMILES), estimate the binding affinity.
184
+
185
+ ### Using the model from
186
+
187
+ ```{model_paths[dti]} ```
188
+ """
189
+ with gr.Group() as tdb_demo:
190
+ gr.Markdown(markup_text)
191
+ with gr.Row():
192
+ prot = gr.Textbox(
193
+ label="Protein sequence",
194
+ # info="standard",
195
+ interactive=True,
196
+ lines=3,
197
+ value=target_seq,
198
+ )
199
+ drug = gr.Textbox(
200
+ label="drug sequence (SMILES)",
201
+ # info="standard",
202
+ interactive=True,
203
+ lines=3,
204
+ value=drug_seq,
205
+ )
206
+ with gr.Row():
207
+ run_mammal = gr.Button(
208
+ "Run Mammal prompt for Target Drug Affinity", variant="primary"
209
+ )
210
+ with gr.Row():
211
+ prompt_box = gr.Textbox(label="Mammal prompt", lines=5)
212
+
213
+ with gr.Row():
214
+ decoded = gr.Textbox(label="Mammal output")
215
+ run_mammal.click(
216
+ fn=create_and_run_prompt_dtb,
217
+ inputs=[prot, drug],
218
+ outputs=[prompt_box, decoded, gr.Number(label="DTI score")],
219
+ )
220
+ tdb_demo.visible = False
221
+ return tdb_demo
222
+
223
+
224
+ def create_application():
225
+
226
+ with gr.Blocks() as demo:
227
+ main_dropdown = gr.Dropdown(choices=["select demo", ppi, dti])
228
+ main_dropdown.interactive = True
229
+ ppi_demo = create_ppi_demo()
230
+ dtb_demo = create_tdb_demo()
231
+
232
+ def set_ppi_vis(main_text):
233
+ return gr.Group(visible=main_text == ppi), gr.Group(
234
+ visible=main_text == dti
235
+ )
236
+
237
+ main_dropdown.change(
238
+ set_ppi_vis, inputs=main_dropdown, outputs=[ppi_demo, dtb_demo]
239
+ )
240
+ return demo
241
+
242
 
243
  def main():
244
  demo = create_application()
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
  # for the mammal demo app
2
  mammal @ git+https://github.com/BiomedSciAI/biomed-multi-alignment.git
 
 
1
  # for the mammal demo app
2
  mammal @ git+https://github.com/BiomedSciAI/biomed-multi-alignment.git
3
+ pytdc