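"""Gradio demo task: Mammal based TCRbeta-epitope binding affinity prediction (TCR)."""
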
import gradio as gr
import torch
from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
from mammal.keys import (
CLS_PRED,
ENCODER_INPUTS_ATTENTION_MASK,
ENCODER_INPUTS_STR,
ENCODER_INPUTS_TOKENS,
SCORES,
)
from mammal.model import Mammal
from mammal_demo.demo_framework import MammalObjectBroker, MammalTask


class TcrTask(MammalTask):
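    """Demo task: estimate TCRbeta-epitope binding affinity with a Mammal model."""
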
def __init__(self, model_dict):
super().__init__(
name="Mammal based TCRbeta-epitope binding affinity", model_dict=model_dict
)
self.description = "Mammal based TCRbeta-epitope binding affinity (TCR)"
self.examples = {
"tcr_beta_seq": "NAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLIHYSVGAGITDQGEVPNGYNVSRSTTEDFPLRLLSAAPSQTSVYFCASSYSWDRVLEQYFGPGTRLTVT",
"epitope_seq": "LLQTGIHVRVSQPSL",
}
self.markup_text = """
# Mammal based TCRbeta-epitope binding affinity demonstration
Given a TCR beta chain and epitope amino acid sequences, estimate the binding affinity score.
"""

    def create_prompt(self, tcr_beta_seq, epitope_seq):
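        # Build a modular-tokenizer prompt: a binding-affinity classification header
        # followed by the two amino-acid entities (TCR beta chain, then epitope),
        # each wrapped in natural-sequence start/end markers.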
prompt = (
"<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
+ f"<@TOKENIZER-TYPE=AA><MOLECULAR_ENTITY><MOLECULAR_ENTITY_TCR_BETA_VDJ><SEQUENCE_NATURAL_START>{tcr_beta_seq}<SEQUENCE_NATURAL_END>"
+ f"<@TOKENIZER-TYPE=AA><MOLECULAR_ENTITY><MOLECULAR_ENTITY_EPITOPE><SEQUENCE_NATURAL_START>{epitope_seq}<SEQUENCE_NATURAL_END><EOS>"
)
return prompt

    def crate_sample_dict(self, sample_inputs: dict, model_holder: MammalObjectBroker):
        """Convert sample_inputs into a sample_dict, including building a proper prompt.

        Args:
            sample_inputs (dict): dictionary containing the inputs to the model
            model_holder (MammalObjectBroker): holder of the model and its tokenizer

        Returns:
            dict: sample_dict ready to feed into the model
        """
sample_dict = dict()
sample_dict[ENCODER_INPUTS_STR] = self.create_prompt(**sample_inputs)
tokenizer_op = model_holder.tokenizer_op
model = model_holder.model
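        # Tokenize the prompt string in place: this writes the token ids and the
        # attention mask into sample_dict under the given output keys.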
tokenizer_op(
sample_dict=sample_dict,
key_in=ENCODER_INPUTS_STR,
key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
)
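        # The tokenizer emits plain Python lists; convert them to tensors on the
        # model's device so they can be fed directly into generate().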
sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(
sample_dict[ENCODER_INPUTS_TOKENS], device=model.device
)
sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(
sample_dict[ENCODER_INPUTS_ATTENTION_MASK], device=model.device
)
return sample_dict

    def run_model(self, sample_dict, model: Mammal):
        # Generate the prediction; output_scores=True makes generate() also return
        # per-position token scores, which decode_output() uses for the binding score.
batch_dict = model.generate(
[sample_dict],
output_scores=True,
return_dict_in_generate=True,
max_new_tokens=5,
)
return batch_dict

    @staticmethod
    def positive_token_id(tokenizer_op: ModularTokenizerOp):
        """Token id for positive binding.

        Args:
            tokenizer_op (ModularTokenizerOp): tokenizer operator holding the vocabulary

        Returns:
            int: id of the positive-binding token
        """
return tokenizer_op.get_token_id("<1>")

    @staticmethod
    def negative_token_id(tokenizer_op: ModularTokenizerOp):
        """Token id for negative binding.

        Args:
            tokenizer_op (ModularTokenizerOp): tokenizer operator holding the vocabulary

        Returns:
            int: id of the negative-binding token
        """
return tokenizer_op.get_token_id("<0>")

    def decode_output(self, batch_dict, tokenizer_op: ModularTokenizerOp) -> list:
        """Extract the predicted class and its score from the generation output."""
        negative_token_id = self.negative_token_id(tokenizer_op)
        positive_token_id = self.positive_token_id(tokenizer_op)
label_id_to_int = {
negative_token_id: 0,
positive_token_id: 1,
}
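        # Position 0 of the decoder output is the decoder start token, so the
        # binding-class token (<0> or <1>) is expected at position 1.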
classification_position = 1
decoder_output = batch_dict[CLS_PRED][0]
decoder_output_scores = batch_dict[SCORES][0]
        if decoder_output_scores is not None:
            score = decoder_output_scores[
                classification_position, positive_token_id
            ].item()
        else:
            # No scores were returned, e.g. when generation ran without output_scores.
            score = None
        ans = [
            tokenizer_op._tokenizer.decode(decoder_output),
            label_id_to_int.get(int(decoder_output[classification_position]), -1),
            score,
        ]
return ans

    def create_and_run_prompt(self, model_name, tcr_beta_seq, epitope_seq):
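        """Gradio callback: build the prompt, run the model, and decode its output."""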
model_holder = self.model_dict[model_name]
inputs = {
"tcr_beta_seq": tcr_beta_seq,
"epitope_seq": epitope_seq,
}
sample_dict = self.crate_sample_dict(
sample_inputs=inputs, model_holder=model_holder
)
prompt = sample_dict[ENCODER_INPUTS_STR]
batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
res = prompt, *self.decode_output(
batch_dict, tokenizer_op=model_holder.tokenizer_op
)
return res

    def create_demo(self, model_name_widget):
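        # Build the Gradio UI for this task. The group is created hidden; the
        # surrounding demo framework is expected to toggle its visibility.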
with gr.Group() as demo:
gr.Markdown(self.markup_text)
with gr.Row():
tcr_textbox = gr.Textbox(
label="T-cell receptor beta sequence",
# info="standard",
interactive=True,
lines=3,
value=self.examples["tcr_beta_seq"],
)
epitope_textbox = gr.Textbox(
label="Epitope sequence",
# info="standard",
interactive=True,
lines=3,
value=self.examples["epitope_seq"],
)
with gr.Row():
run_mammal = gr.Button(
"Run Mammal prompt for TCL-Epitope Interaction",
variant="primary",
)
with gr.Row():
prompt_box = gr.Textbox(label="Mammal prompt", lines=5)
with gr.Row():
decoded = gr.Textbox(label="Mammal output")
predicted_class = gr.Textbox(label="Mammal prediction")
binding_score = gr.Number(label="Binding score")
run_mammal.click(
fn=self.create_and_run_prompt,
inputs=[model_name_widget, tcr_textbox, epitope_textbox],
outputs=[prompt_box, decoded, predicted_class, binding_score],
)
demo.visible = False
return demo