matanninio's picture
spelling, wording, and a bug in the PPI prompt builder
c45ba32
raw
history blame
5.83 kB
import gradio as gr
import torch
from mammal.keys import (
CLS_PRED,
ENCODER_INPUTS_ATTENTION_MASK,
ENCODER_INPUTS_STR,
ENCODER_INPUTS_TOKENS,
)
from mammal.model import Mammal
from mammal_demo.demo_framework import MammalObjectBroker, MammalTask
class PpiTask(MammalTask):
    """Gradio demo task for protein-protein interaction (PPI) prediction.

    The task builds a binding-affinity-class prompt from two amino-acid
    sequences, runs it through a Mammal model and reports the decoded model
    output together with the score assigned to the positive (``<1>``) token.
    """

    def __init__(self, model_dict):
        """Set up the task name, description, example inputs and intro text.

        Args:
            model_dict (dict): mapping from model name to the object holding
                that model; forwarded unchanged to ``MammalTask``.
        """
        super().__init__(name="Protein-Protein Interaction", model_dict=model_dict)
        self.description = "Protein-Protein Interaction (PPI)"
        # Default sequences used to pre-fill the two input text boxes.
        self.examples = {
            "protein_calmodulin": "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK",
            "protein_calcineurin": "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ",
        }
        # Markdown shown at the top of the demo panel.
        self.markup_text = f"""
# Mammal based {self.description} demonstration
Given two protein sequences, estimate if the proteins interact or not."""

    @staticmethod
    def positive_token_id(model_holder: MammalObjectBroker):
        """Return the id of the token that marks a positive (interacting) class.

        Args:
            model_holder (MammalObjectBroker): object holding the tokenizer

        Returns:
            int: id of the ``<1>`` token
        """
        return model_holder.tokenizer_op.get_token_id("<1>")

    def generate_prompt(self, prot1, prot2):
        """Format the prompt to match the pre-training syntax.

        Whitespace (spaces, tabs, line breaks) is removed from both sequences
        first: they come from multi-line text boxes, so pasted or wrapped
        sequences would otherwise carry non-residue characters into the prompt.

        Args:
            prot1 (str): sequence of protein number 1
            prot2 (str): sequence of protein number 2

        Returns:
            str: prompt
        """
        prot1 = "".join(prot1.split())
        prot2 = "".join(prot2.split())
        prompt = (
            "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
            + "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
            + f"<SEQUENCE_NATURAL_START>{prot1}<SEQUENCE_NATURAL_END>"
            + "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
            + f"<SEQUENCE_NATURAL_START>{prot2}<SEQUENCE_NATURAL_END><EOS>"
        )
        return prompt

    def crate_sample_dict(self, sample_inputs: dict, model_holder: MammalObjectBroker):
        """Build the tokenized sample dictionary for one pair of proteins.

        Args:
            sample_inputs (dict): keyword arguments for ``generate_prompt``
                (keys ``prot1`` and ``prot2``)
            model_holder (MammalObjectBroker): object holding the tokenizer

        Returns:
            dict: sample dictionary with the prompt string stored under
                ``ENCODER_INPUTS_STR`` and the token ids and attention mask
                stored as tensors under ``ENCODER_INPUTS_TOKENS`` and
                ``ENCODER_INPUTS_ATTENTION_MASK``
        """
        # Create and load sample
        sample_dict = dict()
        prompt = self.generate_prompt(**sample_inputs)
        sample_dict[ENCODER_INPUTS_STR] = prompt

        # Tokenize
        sample_dict = model_holder.tokenizer_op(
            sample_dict=sample_dict,
            key_in=ENCODER_INPUTS_STR,
            key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
            key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
        )
        # Convert the tokenizer outputs to tensors before handing them to the model.
        sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(
            sample_dict[ENCODER_INPUTS_TOKENS]
        )
        sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(
            sample_dict[ENCODER_INPUTS_ATTENTION_MASK]
        )
        return sample_dict

    def run_model(self, sample_dict, model: Mammal):
        """Run generation on a single sample.

        Args:
            sample_dict (dict): tokenized sample as built by ``crate_sample_dict``
            model (Mammal): model used for generation

        Returns:
            the value returned by ``model.generate`` for a one-sample batch,
            requested with scores and limited to 5 new tokens
        """
        # Generate Prediction
        batch_dict = model.generate(
            [sample_dict],
            output_scores=True,
            return_dict_in_generate=True,
            max_new_tokens=5,
        )
        return batch_dict

    def decode_output(self, batch_dict, model_holder: MammalObjectBroker):
        """Extract the decoded text and the positive-class score.

        Args:
            batch_dict: output of ``run_model``
            model_holder (MammalObjectBroker): object holding the tokenizer

        Returns:
            tuple: ``(generated_output, score)`` - the decoded prediction of
                the first sample and the score of the ``<1>`` token
        """
        # Get output
        generated_output = model_holder.tokenizer_op._tokenizer.decode(
            batch_dict[CLS_PRED][0]
        )
        # Index [0] selects the first sample; index [1] presumably selects the
        # generated position that holds the class token - TODO confirm.
        score = batch_dict["model.out.scores"][0][1][
            self.positive_token_id(model_holder)
        ].item()
        return generated_output, score

    def create_and_run_prompt(self, model_name, protein1, protein2):
        """Build the prompt for two proteins, run the model and decode the result.

        Args:
            model_name: key into ``self.model_dict`` selecting the model holder
            protein1 (str): sequence of protein number 1
            protein2 (str): sequence of protein number 2

        Returns:
            tuple: ``(prompt, generated_output, score)``
        """
        model_holder = self.model_dict[model_name]
        sample_inputs = {"prot1": protein1, "prot2": protein2}
        sample_dict = self.crate_sample_dict(
            sample_inputs=sample_inputs, model_holder=model_holder
        )
        prompt = sample_dict[ENCODER_INPUTS_STR]
        batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
        res = prompt, *self.decode_output(batch_dict, model_holder=model_holder)
        return res

    # NOTE(review): `gr.component` looks like gradio's factory function rather
    # than the component base class - confirm the intended annotation.
    def create_demo(self, model_name_widget: gr.component):
        """Build the Gradio widgets for this task.

        Args:
            model_name_widget: widget whose value is passed as the model name
                to ``create_and_run_prompt``

        Returns:
            gr.Group: the group holding the task widgets, created hidden
        """
        with gr.Group() as demo:
            gr.Markdown(self.markup_text)
            with gr.Row():
                prot1 = gr.Textbox(
                    label="Protein 1 sequence",
                    interactive=True,
                    lines=3,
                    value=self.examples["protein_calmodulin"],
                )
                prot2 = gr.Textbox(
                    label="Protein 2 sequence",
                    interactive=True,
                    lines=3,
                    value=self.examples["protein_calcineurin"],
                )
            with gr.Row():
                run_mammal: gr.Button = gr.Button(
                    "Run Mammal prompt for Protein-Protein Interaction",
                    variant="primary",
                )
            with gr.Row():
                prompt_box = gr.Textbox(label="Mammal prompt", lines=5)

            with gr.Row():
                decoded = gr.Textbox(label="Mammal output")
                score_box = gr.Number(label="PPI score")
                # The three outputs match the tuple returned by
                # create_and_run_prompt: prompt, decoded output, score.
                run_mammal.click(
                    fn=self.create_and_run_prompt,
                    inputs=[model_name_widget, prot1, prot2],
                    outputs=[prompt_box, decoded, score_box],
                )
            with gr.Row():
                gr.Markdown(
                    "```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting"
                )
        # The group starts hidden.
        demo.visible = False
        return demo