# NOTE: the original lines here were Hugging Face Spaces page-scrape residue
# (status banner "Spaces: Sleeping", "File size: 5,827 Bytes", a row of commit
# hashes, and a line-number ruler). They were not Python and broke parsing;
# replaced with this comment.
import gradio as gr
import torch
from mammal.keys import (
CLS_PRED,
ENCODER_INPUTS_ATTENTION_MASK,
ENCODER_INPUTS_STR,
ENCODER_INPUTS_TOKENS,
)
from mammal.model import Mammal
from mammal_demo.demo_framework import MammalObjectBroker, MammalTask
class PpiTask(MammalTask):
    """Gradio demo task: Protein-Protein Interaction (PPI) prediction with a MAMMAL model.

    Builds an encoder prompt from two protein sequences, tokenizes it, runs the
    model's `generate`, and decodes the predicted binding class plus the score
    of the positive-binding token.
    """

    def __init__(self, model_dict):
        """
        Args:
            model_dict: mapping of model name -> MammalObjectBroker, consumed by
                the MammalTask base class and looked up in create_and_run_prompt.
        """
        super().__init__(name="Protein-Protein Interaction", model_dict=model_dict)
        self.description = "Protein-Protein Interaction (PPI)"
        # Default example pair shown in the demo UI (calmodulin / calcineurin).
        self.examples = {
            "protein_calmodulin": "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK",
            "protein_calcineurin": "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ",
        }
        self.markup_text = f"""
# Mammal based {self.description} demonstration
Given two protein sequences, estimate if the proteins interact or not."""

    @staticmethod
    def positive_token_id(model_holder: MammalObjectBroker):
        """Return the tokenizer id of the positive-binding class token.

        Args:
            model_holder (MammalObjectBroker): broker holding the tokenizer op.

        Returns:
            int: id of the ``<1>`` (positive binding) token.
        """
        return model_holder.tokenizer_op.get_token_id("<1>")

    def generate_prompt(self, prot1, prot2):
        """Format a prompt matching the model's pre-training syntax.

        Args:
            prot1 (str): sequence of protein number 1
            prot2 (str): sequence of protein number 2

        Returns:
            str: encoder prompt string embedding both sequences.
        """
        prompt = (
            "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
            + "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
            + f"<SEQUENCE_NATURAL_START>{prot1}<SEQUENCE_NATURAL_END>"
            + "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
            + f"<SEQUENCE_NATURAL_START>{prot2}<SEQUENCE_NATURAL_END><EOS>"
        )
        return prompt

    def crate_sample_dict(self, sample_inputs: dict, model_holder: MammalObjectBroker):
        """Create and tokenize a single model input sample.

        NOTE: keeps the framework's (misspelled) hook name ``crate_sample_dict``
        so existing callers/overrides keep working.

        Args:
            sample_inputs (dict): {"prot1": <sequence>, "prot2": <sequence>}.
            model_holder (MammalObjectBroker): broker holding the tokenizer op.

        Returns:
            dict: sample with prompt string, token-id tensor and attention-mask tensor.
        """
        # BUGFIX: the original used ``generate_prompt(*sample_inputs)``, which
        # unpacks the dict *keys* (the literal strings "prot1"/"prot2") instead
        # of the sequences. ``**`` maps keys onto the matching parameter names.
        prompt = self.generate_prompt(**sample_inputs)
        sample_dict = {ENCODER_INPUTS_STR: prompt}
        # Tokenize the prompt string into ids + attention mask.
        sample_dict = model_holder.tokenizer_op(
            sample_dict=sample_dict,
            key_in=ENCODER_INPUTS_STR,
            key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
            key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
        )
        # model.generate expects tensors, not Python lists.
        sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(
            sample_dict[ENCODER_INPUTS_TOKENS]
        )
        sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(
            sample_dict[ENCODER_INPUTS_ATTENTION_MASK]
        )
        return sample_dict

    def run_model(self, sample_dict, model: Mammal):
        """Run generation for one tokenized sample.

        Returns:
            The raw batch dict produced by ``model.generate`` (includes scores).
        """
        batch_dict = model.generate(
            [sample_dict],
            output_scores=True,
            return_dict_in_generate=True,
            max_new_tokens=5,
        )
        return batch_dict

    def decode_output(self, batch_dict, model_holder: MammalObjectBroker):
        """Decode the generated tokens and extract the positive-binding score.

        Returns:
            tuple: (decoded generated output (str), positive-class score (float)).
        """
        # NOTE(review): reaches into the private ``_tokenizer`` attribute —
        # confirm whether tokenizer_op exposes a public decode API.
        generated_output = model_holder.tokenizer_op._tokenizer.decode(
            batch_dict[CLS_PRED][0]
        )
        # Score of the "<1>" token at generation step 1 of the first sample.
        score = batch_dict["model.out.scores"][0][1][
            self.positive_token_id(model_holder)
        ].item()
        return generated_output, score

    def create_and_run_prompt(self, model_name, protein1, protein2):
        """End-to-end demo callback: build prompt, run model, decode.

        Returns:
            tuple: (prompt string, decoded model output, PPI score).
        """
        model_holder = self.model_dict[model_name]
        sample_inputs = {"prot1": protein1, "prot2": protein2}
        sample_dict = self.crate_sample_dict(
            sample_inputs=sample_inputs, model_holder=model_holder
        )
        prompt = sample_dict[ENCODER_INPUTS_STR]
        batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
        res = prompt, *self.decode_output(batch_dict, model_holder=model_holder)
        return res

    def create_demo(self, model_name_widget: gr.component):
        """Build the Gradio UI group for the PPI task (returned hidden)."""
        with gr.Group() as demo:
            gr.Markdown(self.markup_text)
            with gr.Row():
                prot1 = gr.Textbox(
                    label="Protein 1 sequence",
                    interactive=True,
                    lines=3,
                    value=self.examples["protein_calmodulin"],
                )
                prot2 = gr.Textbox(
                    label="Protein 2 sequence",
                    interactive=True,
                    lines=3,
                    value=self.examples["protein_calcineurin"],
                )
            with gr.Row():
                run_mammal: gr.Button = gr.Button(
                    "Run Mammal prompt for Protein-Protein Interaction",
                    variant="primary",
                )
            with gr.Row():
                prompt_box = gr.Textbox(label="Mammal prompt", lines=5)
            with gr.Row():
                decoded = gr.Textbox(label="Mammal output")
                score_box = gr.Number(label="PPI score")
                run_mammal.click(
                    fn=self.create_and_run_prompt,
                    inputs=[model_name_widget, prot1, prot2],
                    outputs=[prompt_box, decoded, score_box],
                )
            with gr.Row():
                gr.Markdown(
                    "```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting"
                )
        # Hidden by default; the demo framework toggles visibility per task.
        demo.visible = False
        return demo