biomed-multi-alignment

Running

App Files Files Community

matanninio commited on Nov 26, 2024

Commit

71382c0

1 Parent(s): 81fb8a8

first attempt on unified test - the actual use case needs to be clearer

Browse files

Files changed (4) hide show

.pre-commit-config.yaml +49 -0
README.md +2 -2
app.py +173 -41
requirements.txt +1 -0

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,49 @@

+exclude: .*\.pdb$
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: check-case-conflict
+      - id: end-of-file-fixer
+      - id: mixed-line-ending
+      - id: trailing-whitespace
+  - repo: https://github.com/psf/black
+    rev: 24.8.0
+    hooks:
+      - id: black
+  - repo: https://github.com/PyCQA/flake8
+    rev: 5.0.4
+    hooks:
+      - id: flake8
+        args:
+          -  "--ignore=E203,E266,E501,F405,F403,W503"
+          -  "--statistics"
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.6.5
+    hooks:
+      - id: ruff
+        args:
+        - "--fix"
+        - "--select"
+        - "UP,PT,I,E" #,F,W,C90,I,N,F405,E402" # Specify the rules to select
+        - "--line-length"
+        - "88"
+        - "--exit-non-zero-on-fix"
+        - "--ignore"
+        - "F405,F403,E501,E402,PT018,PT015,E722,E741"
+        types_or: [ python, pyi] #, jupyter ]
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.13.0
+    hooks:
+      - id: mypy
+  - repo: https://github.com/srstevenson/nb-clean
+    rev: "2.4.0"
+    hooks:
+      - id: nb-clean
+        args:
+          - --remove-empty-cells
+          - --preserve-cell-outputs

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: Biomed-multi-alignment Protein-Protein-Interaction
 emoji: 🐁
 colorFrom: gray
 colorTo: purple
@@ -8,7 +8,7 @@ sdk_version: 5.4.0
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: Demo for MAMMAL approch Protein-Protein Interaction query
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Biomed-multi-alignment (PPI and DTI)
 emoji: 🐁
 colorFrom: gray
 colorTo: purple
 app_file: app.py
 pinned: false
 license: apache-2.0
+short_description: Demo for MAMMAL approach Protein-Protein Interaction and Drug-Target Binding Affinity
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,112 +1,244 @@
 import gradio as gr
 import torch
 from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
-from mammal.model import Mammal
 from mammal.keys import *
-model_path="ibm/biomed.omics.bl.sm.ma-ted-458m"
-# Load Model
-model = Mammal.from_pretrained(model_path)
-model.eval()
-# Load Tokenizer
-tokenizer_op = ModularTokenizerOp.from_pretrained(model_path)
-#token for positive binding
-positive_token_id=tokenizer_op.get_token_id("<1>")
 # Default input proteins
 protein_calmodulin = "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK"
 protein_calcineurin = "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ"
-def format_prompt(prot1,prot2):
     # Formatting prompt to match pre-training syntax
     return f"<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0><MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN><SEQUENCE_NATURAL_START>{prot1}<SEQUENCE_NATURAL_END><MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN><SEQUENCE_NATURAL_START>{prot2}<SEQUENCE_NATURAL_END><EOS>"
 def run_prompt(prompt):
     # Create and load sample
     sample_dict = dict()
     sample_dict[ENCODER_INPUTS_STR] = prompt
     # Tokenize
-    sample_dict=tokenizer_op(
         sample_dict=sample_dict,
         key_in=ENCODER_INPUTS_STR,
         key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
         key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
     )
-    sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(sample_dict[ENCODER_INPUTS_TOKENS])
-    sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(sample_dict[ENCODER_INPUTS_ATTENTION_MASK])
     # Generate Prediction
-    batch_dict = model.generate(
         [sample_dict],
         output_scores=True,
         return_dict_in_generate=True,
         max_new_tokens=5,
-)
     # Get output
-    generated_output = tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0])
-    score = batch_dict['model.out.scores'][0][1][positive_token_id].item()
-    return generated_output,score
-def create_and_run_prompt(prot1, prot2):
-    prompt = format_prompt(prot1, prot2)
-    res=prompt, *run_prompt(prompt=prompt)
     return res
-def create_application():
     markup_text = f"""
 # Mammal based Protein-Protein Interaction (PPI) demonstration
 Given two protein sequences, estimate if the proteins interact or not.
-### Using the model from
- ```{model_path} ```
 """
-    with gr.Blocks() as demo:
         gr.Markdown(markup_text)
         with gr.Row():
             prot1 = gr.Textbox(
                 label="Protein 1 sequence",
                 # info="standard",
                 interactive=True,
-                lines=1,
                 value=protein_calmodulin,
             )
             prot2 = gr.Textbox(
                 label="Protein 2 sequence",
                 # info="standard",
                 interactive=True,
-                lines=1,
                 value=protein_calcineurin,
             )
         with gr.Row():
-            run_mammal = gr.Button("Run Mammal prompt for Protein-Protein Interaction",variant='primary')
         with gr.Row():
-            prompt_box = gr.Textbox(label="Mammal prompt",lines=5)
         with gr.Row():
             decoded = gr.Textbox(label="Mammal output")
             run_mammal.click(
                 fn=create_and_run_prompt,
-                inputs=[prot1,prot2],
-                outputs=[prompt_box,decoded,gr.Number(label='PPI score')]
             )
         with gr.Row():
-            gr.Markdown("```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting")
-    return demo
 def main():
     demo = create_application()

 import gradio as gr
 import torch
 from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
+from mammal.examples.dti_bindingdb_kd.task import DtiBindingdbKdTask
 from mammal.keys import *
+from mammal.model import Mammal
# Task labels: they key the registries below and are also offered as the
# choices of the demo-selection dropdown.
ppi = "Protein-Protein Interaction (PPI)"
dti = "Drug-Target Binding Affinity"
# Pretrained checkpoint identifier for each task.
model_paths = {
    ppi: "ibm/biomed.omics.bl.sm.ma-ted-458m",
    dti: "ibm/biomed.omics.bl.sm.ma-ted-458m.dti_bindingdb_pkd",
}
# Every model and its tokenizer is loaded eagerly at import time
# (should probably be lazy).
models = {}
tokenizer_op = {}
for task, model_path in model_paths.items():
    if task in models:
        # Already loaded under this label; nothing to do.
        continue
    models[task] = Mammal.from_pretrained(model_path)
    # Put the freshly loaded model into evaluation mode.
    models[task].eval()
    # The tokenizer is fetched from the same checkpoint as the model.
    tokenizer_op[task] = ModularTokenizerOp.from_pretrained(model_path)
### PPI:
# Id of the "<1>" token (positive binding) in the PPI tokenizer.
positive_token_id = tokenizer_op[ppi].get_token_id("<1>")
# Default protein sequences used to pre-fill the PPI form.
protein_calmodulin = "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK"
protein_calcineurin = "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ"
def format_prompt_ppi(prot1, prot2):
    """Return the PPI query prompt for two protein sequences.

    The prompt matches the pre-training syntax: a binding-affinity-class
    header carrying one sentinel token, then each sequence wrapped in
    general-protein entity markers, closed by ``<EOS>``.

    Args:
        prot1: Amino-acid sequence of the first protein.
        prot2: Amino-acid sequence of the second protein.

    Returns:
        The fully formatted prompt string.
    """
    task_header = "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
    entity_open = "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN><SEQUENCE_NATURAL_START>"
    entity_close = "<SEQUENCE_NATURAL_END>"
    first_entity = f"{entity_open}{prot1}{entity_close}"
    second_entity = f"{entity_open}{prot2}{entity_close}"
    return f"{task_header}{first_entity}{second_entity}<EOS>"
 def run_prompt(prompt):
     """Run the PPI model on a formatted prompt.

     Args:
         prompt: Prompt string, e.g. as produced by ``format_prompt_ppi``.

     Returns:
         A ``(generated_output, score)`` tuple. ``generated_output`` is the
         decoded text of the first entry of the model's ``CLS_PRED`` output;
         ``score`` is the value stored for the ``<1>`` token at index
         ``[0][1]`` of ``"model.out.scores"`` (presumably sample 0, second
         generated position -- confirm against the model's output layout).
     """
     ppi_tokenizer = tokenizer_op[ppi]
     # Resolve the "<1>" id from the PPI tokenizer at call time, so the score
     # never depends on whatever a module-level `positive_token_id` happens
     # to hold when the handler runs.
     positive_id = ppi_tokenizer.get_token_id("<1>")

     # Create the sample and tokenize the prompt into ids + attention mask.
     sample_dict = {ENCODER_INPUTS_STR: prompt}
     sample_dict = ppi_tokenizer(
         sample_dict=sample_dict,
         key_in=ENCODER_INPUTS_STR,
         key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
         key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
     )
     # Convert both tokenizer outputs to tensors before generation.
     for key in (ENCODER_INPUTS_TOKENS, ENCODER_INPUTS_ATTENTION_MASK):
         sample_dict[key] = torch.tensor(sample_dict[key])

     # Generate the prediction; scores are requested so the positive-class
     # score can be read back below.
     batch_dict = models[ppi].generate(
         [sample_dict],
         output_scores=True,
         return_dict_in_generate=True,
         max_new_tokens=5,
     )

     # Decode the generated tokens and pick out the "<1>" score.
     generated_output = ppi_tokenizer._tokenizer.decode(batch_dict[CLS_PRED][0])
     score = batch_dict["model.out.scores"][0][1][positive_id].item()
     return generated_output, score
def create_and_run_prompt(protein1, protein2):
    """Format a PPI prompt for two proteins, run it, and return all outputs.

    Args:
        protein1: Sequence of the first protein.
        protein2: Sequence of the second protein.

    Returns:
        A tuple holding the prompt followed by everything ``run_prompt``
        returns for it (the decoded model output and the score).
    """
    ppi_prompt = format_prompt_ppi(protein1, protein2)
    return (ppi_prompt, *run_prompt(prompt=ppi_prompt))
def create_ppi_demo():
    """Build the Protein-Protein Interaction panel.

    The panel holds two sequence inputs pre-filled with the default proteins,
    a run button, and output fields for the prompt, the decoded model output
    and the PPI score. The button is wired to ``create_and_run_prompt``.

    Returns:
        The ``gr.Group`` containing the panel, with ``visible`` set to False.
    """
    markup_text = f"""
# Mammal based Protein-Protein Interaction (PPI) demonstration
Given two protein sequences, estimate if the proteins interact or not.
### Using the model from
 ```{model_paths[ppi]} ```
"""
    sentinel_note = "```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting"
    # Settings shared by both sequence inputs.
    sequence_box_options = {"interactive": True, "lines": 3}

    with gr.Group() as ppi_demo:
        gr.Markdown(markup_text)
        with gr.Row():
            prot1 = gr.Textbox(
                label="Protein 1 sequence",
                value=protein_calmodulin,
                **sequence_box_options,
            )
            prot2 = gr.Textbox(
                label="Protein 2 sequence",
                value=protein_calcineurin,
                **sequence_box_options,
            )
        with gr.Row():
            run_mammal = gr.Button(
                "Run Mammal prompt for Protein-Protein Interaction",
                variant="primary",
            )
        with gr.Row():
            prompt_box = gr.Textbox(label="Mammal prompt", lines=5)
        with gr.Row():
            decoded = gr.Textbox(label="Mammal output")
            # The score field is created in the same row, right after the
            # decoded-output box.
            ppi_score = gr.Number(label="PPI score")
            run_mammal.click(
                fn=create_and_run_prompt,
                inputs=[prot1, prot2],
                outputs=[prompt_box, decoded, ppi_score],
            )
        with gr.Row():
            gr.Markdown(sentinel_note)
        # Start hidden.
        ppi_demo.visible = False
    return ppi_demo
### DTI:
# Default target protein sequence and drug (SMILES) used to pre-fill the DTI form.
target_seq = "NLMKRCTRGFRKLGKCTTLEEEKCKTLYPRGQCTCSDSKMNTHSCDCKSC"
drug_seq = "CC(=O)NCCC1=CNc2c1cc(OC)cc2"
# Id of the "<1>" token in the DTI tokenizer. It is stored under a
# DTI-specific name so that the PPI `positive_token_id` defined earlier in the
# module is not overwritten.
positive_token_id_dti = tokenizer_op[dti].get_token_id("<1>")
def format_prompt_dti(prot, drug):
    """Build the model-ready DTI sample for one protein/drug pair.

    Args:
        prot: Target protein sequence.
        drug: Drug, given as a SMILES string.

    Returns:
        The sample dict returned by
        ``DtiBindingdbKdTask.data_preprocessing`` for this pair.
    """
    # Use the caller's values, not the module-level default sequences, so the
    # text typed into the form is what actually reaches the model.
    sample_dict = {"target_seq": prot, "drug_seq": drug}
    return DtiBindingdbKdTask.data_preprocessing(
        sample_dict=sample_dict,
        tokenizer_op=tokenizer_op[dti],
        target_sequence_key="target_seq",
        drug_sequence_key="drug_seq",
        # No normalisation statistics are supplied at preprocessing time.
        norm_y_mean=None,
        norm_y_std=None,
        device=models[dti].device,
    )
def create_and_run_prompt_dtb(prot, drug):
    """Run the DTI model for one protein/drug pair.

    Args:
        prot: Target protein sequence.
        drug: Drug, given as a SMILES string.

    Returns:
        A 3-tuple: the value stored under ``"data.query.encoder_input"`` in
        the prepared sample, the prediction key
        ``"model.out.dti_bindingdb_kd"``, and the first predicted value under
        that key converted to ``float``.
    """
    prediction_key = "model.out.dti_bindingdb_kd"
    sample_dict = format_prompt_dti(prot, drug)

    # Encoder-only forward pass over a single-sample batch.
    encoder_output = models[dti].forward_encoder_only([sample_dict])
    # Post-process the model's output; the mean/std constants are handed to
    # the task's own post-processing step.
    processed = DtiBindingdbKdTask.process_model_output(
        encoder_output,
        scalars_preds_processed_key=prediction_key,
        norm_y_mean=5.79384684128215,
        norm_y_std=1.33808027428196,
    )
    affinity = float(processed[prediction_key][0])
    return (sample_dict["data.query.encoder_input"], prediction_key, affinity)
def create_tdb_demo():
    """Build the target-drug binding affinity panel.

    The panel holds a protein input and a drug (SMILES) input pre-filled with
    the default sequences, a run button, and output fields for the prompt,
    the model output and the DTI score. The button is wired to
    ``create_and_run_prompt_dtb``.

    Returns:
        The ``gr.Group`` containing the panel, with ``visible`` set to False.
    """
    markup_text = f"""
# Mammal based Target-Drug binding affinity demonstration
Given a protein sequence and a drug (in SMILES), estimate the binding affinity.
### Using the model from
 ```{model_paths[dti]} ```
"""
    # Settings shared by both sequence inputs.
    sequence_box_options = {"interactive": True, "lines": 3}

    with gr.Group() as tdb_demo:
        gr.Markdown(markup_text)
        with gr.Row():
            prot = gr.Textbox(
                label="Protein sequence",
                value=target_seq,
                **sequence_box_options,
            )
            drug = gr.Textbox(
                label="drug sequence (SMILES)",
                value=drug_seq,
                **sequence_box_options,
            )
        with gr.Row():
            run_mammal = gr.Button(
                "Run Mammal prompt for Target Drug Affinity",
                variant="primary",
            )
        with gr.Row():
            prompt_box = gr.Textbox(label="Mammal prompt", lines=5)
        with gr.Row():
            decoded = gr.Textbox(label="Mammal output")
            # The score field is created in the same row, right after the
            # model-output box.
            dti_score = gr.Number(label="DTI score")
            run_mammal.click(
                fn=create_and_run_prompt_dtb,
                inputs=[prot, drug],
                outputs=[prompt_box, decoded, dti_score],
            )
        # Start hidden.
        tdb_demo.visible = False
    return tdb_demo
def create_application():
    """Assemble the combined demo: a task dropdown plus both task panels.

    The dropdown offers "select demo" and the two task labels. Whenever its
    value changes, the PPI panel is made visible only if the PPI label is
    selected, and the DTI panel only if the DTI label is selected.

    Returns:
        The ``gr.Blocks`` application.
    """
    with gr.Blocks() as demo:
        main_dropdown = gr.Dropdown(choices=["select demo", ppi, dti])
        main_dropdown.interactive = True
        ppi_panel = create_ppi_demo()
        dti_panel = create_tdb_demo()

        def set_ppi_vis(main_text):
            # One visibility update per panel, in the order of `outputs`.
            show_ppi = main_text == ppi
            show_dti = main_text == dti
            return gr.Group(visible=show_ppi), gr.Group(visible=show_dti)

        main_dropdown.change(
            set_ppi_vis,
            inputs=main_dropdown,
            outputs=[ppi_panel, dti_panel],
        )
    return demo
 def main():
     demo = create_application()

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
 # for the mammal demo app
 mammal @ git+https://github.com/BiomedSciAI/biomed-multi-alignment.git

 # for the mammal demo app
 mammal @ git+https://github.com/BiomedSciAI/biomed-multi-alignment.git
+pytdc