SagiPolaczek committed on
Commit
ff01709
•
1 Parent(s): d61ac75

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +5 -4
  2. app.py +126 -0
  3. requirements.txt +2 -0
README.md CHANGED
@@ -1,13 +1,14 @@
1
  ---
2
- title: Biomed Multi Alignment Protein Protein Interaction
3
- emoji: 🌍
4
- colorFrom: indigo
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.5.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Biomed-multi-alignment Protein-Protein-Interaction
3
+ emoji: 🐁
4
+ colorFrom: gray
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 5.4.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ short_description: Demo for MAMMAL approach Protein-Protein Interaction query
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ import torch
4
+ from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
5
+ from mammal.model import Mammal
6
+ from mammal.keys import *
7
+
8
+
9
# Checkpoint identifier; the same value is handed to both `from_pretrained`
# calls below and is shown in the page header.
model_path = "ibm/biomed.omics.bl.sm.ma-ted-458m"

# Model: loaded once at import time and switched to evaluation mode.
model = Mammal.from_pretrained(model_path)
model.eval()

# Tokenizer: loaded from the same checkpoint identifier as the model.
tokenizer_op = ModularTokenizerOp.from_pretrained(model_path)

# Id of the "<1>" token, i.e. the token that marks positive binding.
positive_token_id = tokenizer_op.get_token_id("<1>")

# Sequences pre-filled in the two input boxes of the demo.
protein_calmodulin = "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK"
protein_calcineurin = "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ"
23
+
24
+
25
def format_prompt(prot1, prot2):
    """Build the encoder prompt for a protein-protein interaction query.

    The prompt follows the model's pre-training syntax: a task header with
    one sentinel slot, followed by the two protein sequences, each wrapped
    in its own entity/sequence markers, and a closing ``<EOS>``.

    Whitespace (spaces, tabs, line breaks) is removed from both sequences
    before they are inserted, so a sequence pasted with line wraps yields
    the same prompt as the contiguous sequence.

    Args:
        prot1: Amino-acid sequence of the first protein.
        prot2: Amino-acid sequence of the second protein.

    Returns:
        The formatted prompt string.
    """
    # Sequences copied into a text box often carry stray blanks or newlines;
    # they are not residues, so drop them instead of sending them on.
    seq1 = "".join(str(prot1).split())
    seq2 = "".join(str(prot2).split())

    return (
        "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
        "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
        f"<SEQUENCE_NATURAL_START>{seq1}<SEQUENCE_NATURAL_END>"
        "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
        f"<SEQUENCE_NATURAL_START>{seq2}<SEQUENCE_NATURAL_END>"
        "<EOS>"
    )
28
+
29
+
30
def run_prompt(prompt):
    """Tokenize *prompt*, run generation on it and decode the result.

    Uses the module-level ``tokenizer_op``, ``model`` and
    ``positive_token_id``.

    Args:
        prompt: Encoder input string in the model's prompt syntax.

    Returns:
        A ``(generated_output, score)`` tuple: the decoded generated tokens
        of the single sample, and the score read for the positive ("<1>")
        token as a Python number.
    """
    # The tokenizer op reads the prompt from the sample dict and writes the
    # token ids and the attention mask back into it.
    sample = {ENCODER_INPUTS_STR: prompt}
    sample = tokenizer_op(
        sample_dict=sample,
        key_in=ENCODER_INPUTS_STR,
        key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
        key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
    )

    # Both tokenizer outputs are converted to tensors before generation.
    for key in (ENCODER_INPUTS_TOKENS, ENCODER_INPUTS_ATTENTION_MASK):
        sample[key] = torch.tensor(sample[key])

    # Batch of one sample; scores are requested so the positive-class score
    # can be read back below.
    generation = model.generate(
        [sample],
        output_scores=True,
        return_dict_in_generate=True,
        max_new_tokens=5,
    )

    predicted_ids = generation[CLS_PRED][0]
    generated_output = tokenizer_op._tokenizer.decode(predicted_ids)

    # [0] selects the only sample in the batch; [1] is presumably the decoding
    # position that holds the class token -- TODO confirm against the model.
    positive_score = generation["model.out.scores"][0][1][positive_token_id]

    return generated_output, positive_score.item()
62
+
63
+
64
def create_and_run_prompt(prot1, prot2):
    """Format the prompt for two proteins and run the model on it.

    Args:
        prot1: Sequence of the first protein.
        prot2: Sequence of the second protein.

    Returns:
        A ``(prompt, generated_output, score)`` tuple, in the order the UI
        output components expect.
    """
    prompt = format_prompt(prot1, prot2)
    generated_output, score = run_prompt(prompt=prompt)
    return prompt, generated_output, score
68
+
69
+
70
def create_application():
    """Assemble the Gradio Blocks UI of the demo.

    Layout, top to bottom: a header, the two sequence inputs, the run
    button, the generated prompt, the model output with its score, and a
    short legend explaining the output classes.

    Returns:
        The ``gr.Blocks`` application (not yet launched).
    """
    header_md = (
        "\n"
        "# Mammal based Protein-Protein Interaction (PPI) demonstration\n"
        "\n"
        "Given two protein sequences, estimate if the proteins interact or not.\n"
        "\n"
        "### Using the model from\n"
        "\n"
        f"```{model_path} ```\n"
    )
    legend_md = "```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting"

    with gr.Blocks() as app:
        gr.Markdown(header_md)

        # Inputs: one single-line box per protein, pre-filled with defaults.
        with gr.Row():
            prot1 = gr.Textbox(
                label="Protein 1 sequence",
                interactive=True,
                lines=1,
                value=protein_calmodulin,
            )
            prot2 = gr.Textbox(
                label="Protein 2 sequence",
                interactive=True,
                lines=1,
                value=protein_calcineurin,
            )

        with gr.Row():
            run_button = gr.Button(
                "Run Mammal prompt for Protein-Protein Interaction", variant="primary"
            )

        with gr.Row():
            prompt_box = gr.Textbox(label="Mammal prompt", lines=5)

        # Outputs: decoded text and numeric score share one row. The score
        # component is created right after the text box, inside this row.
        with gr.Row():
            decoded = gr.Textbox(label="Mammal output")
            score_box = gr.Number(label="PPI score")
            run_button.click(
                fn=create_and_run_prompt,
                inputs=[prot1, prot2],
                outputs=[prompt_box, decoded, score_box],
            )

        with gr.Row():
            gr.Markdown(legend_md)

    return app
118
+
119
+
120
def main():
    """Build the demo application and start the Gradio server."""
    import os

    demo = create_application()

    # A Hugging Face Space is already publicly served, and Gradio does not
    # support share links there (it warns and falls back to no sharing).
    # Ask for a share link only when running outside a Space.
    running_on_spaces = bool(os.environ.get("SPACE_ID"))
    demo.launch(show_error=True, share=not running_on_spaces)


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # for the mammal demo app
2
+ mammal @ git+https://github.com/BiomedSciAI/biomed-multi-alignment.git