biomed-multi-alignment

Sleeping

App Files Files Community

biomed-multi-alignment / mammal_demo /ppi_task.py

matanninio

fixed bad string in PPI demo, moved to IBM theme for the colors

83811e8 24 days ago

raw

history blame

5.83 kB

	import gradio as gr
	import torch
	from mammal.keys import (
	CLS_PRED,
	ENCODER_INPUTS_ATTENTION_MASK,
	ENCODER_INPUTS_STR,
	ENCODER_INPUTS_TOKENS,
	)
	from mammal.model import Mammal

	from mammal_demo.demo_framework import MammalObjectBroker, MammalTask


	class PpiTask(MammalTask):
	    """Gradio demo task for Protein-Protein Interaction (PPI) with Mammal.

	    Builds a prompt from two protein sequences, tokenizes it, runs the
	    selected model's ``generate`` and reports the decoded output together
	    with the score entry of the positive (``<1>``) token.
	    """

	    def __init__(self, model_dict):
	        """Set up the task name, description, example sequences and intro text.

	        Args:
	            model_dict: mapping from model name to its holder object; it is
	                forwarded to the ``MammalTask`` constructor and indexed by
	                model name in ``create_and_run_prompt``.
	        """
	        super().__init__(name="Protein-Protein Interaction", model_dict=model_dict)
	        self.description = "Protein-Protein Interaction (PPI)"
	        # Default sequences pre-filled into the two input text boxes.
	        self.examples = {
	            "protein_calmodulin": "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK",
	            "protein_calcineurin": "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ",
	        }
	        # Markdown shown at the top of the demo panel.
	        self.markup_text = f"""
	# Mammal based {self.description} demonstration

	Given two protein sequences, estimate if the proteins interact or not."""

	    @staticmethod
	    def positive_token_id(model_holder: MammalObjectBroker):
	        """Return the id of the token that marks a positive (interacting) pair.

	        Args:
	            model_holder (MammalObjectBroker): object exposing the tokenizer
	                as ``tokenizer_op``.

	        Returns:
	            the id that the tokenizer assigns to the ``<1>`` token.
	        """
	        return model_holder.tokenizer_op.get_token_id("<1>")

	    def generate_prompt(self, prot1, prot2) -> str:
	        """Format the two sequences into a prompt matching pre-training syntax.

	        Args:
	            prot1 (str): sequence of protein number 1
	            prot2 (str): sequence of protein number 2

	        Returns:
	            str: prompt
	        """
	        prompt = (
	            "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
	            + "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
	            + f"<SEQUENCE_NATURAL_START>{prot1}<SEQUENCE_NATURAL_END>"
	            + "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
	            + f"<SEQUENCE_NATURAL_START>{prot2}<SEQUENCE_NATURAL_END><EOS>"
	        )
	        return prompt

	    def crate_sample_dict(self, sample_inputs: dict, model_holder: MammalObjectBroker):
	        """Build the tokenized sample dictionary for one pair of proteins.

	        The method name (including its spelling) is kept unchanged because it
	        is referenced by name at call sites.

	        Args:
	            sample_inputs (dict): keyword arguments for ``generate_prompt``,
	                i.e. ``{"prot1": <sequence>, "prot2": <sequence>}``.
	            model_holder (MammalObjectBroker): object whose ``tokenizer_op``
	                is called to tokenize the prompt.

	        Returns:
	            the sample dict returned by the tokenizer op, holding the prompt
	            string plus token ids and attention mask converted to tensors.
	        """
	        # Unpack by keyword: iterating a dict with a single ``*`` yields its
	        # keys, which would put the literal words "prot1"/"prot2" into the
	        # prompt instead of the sequences.
	        prompt = self.generate_prompt(**sample_inputs)
	        sample_dict = {ENCODER_INPUTS_STR: prompt}

	        # Tokenize
	        sample_dict = model_holder.tokenizer_op(
	            sample_dict=sample_dict,
	            key_in=ENCODER_INPUTS_STR,
	            key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
	            key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
	        )
	        # Convert the tokenizer outputs to tensors before they reach the model.
	        sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(
	            sample_dict[ENCODER_INPUTS_TOKENS]
	        )
	        sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(
	            sample_dict[ENCODER_INPUTS_ATTENTION_MASK]
	        )
	        return sample_dict

	    def run_model(self, sample_dict, model: Mammal):
	        """Run generation on a single sample.

	        Args:
	            sample_dict: tokenized sample as produced by ``crate_sample_dict``.
	            model (Mammal): model whose ``generate`` method is called.

	        Returns:
	            the batch dictionary returned by ``model.generate``.
	        """
	        # The sample is wrapped in a list, i.e. passed as a batch of one.
	        batch_dict = model.generate(
	            [sample_dict],
	            output_scores=True,
	            return_dict_in_generate=True,
	            max_new_tokens=5,
	        )
	        return batch_dict

	    def decode_output(self, batch_dict, model_holder: MammalObjectBroker):
	        """Extract the decoded text and the positive-token score.

	        Args:
	            batch_dict: output of ``run_model``.
	            model_holder (MammalObjectBroker): object exposing the tokenizer
	                as ``tokenizer_op``.

	        Returns:
	            tuple: ``(generated_output, score)`` - the decoded first entry of
	            ``batch_dict[CLS_PRED]`` and the score entry of the ``<1>`` token.
	        """
	        # Decode the first (only) prediction of the batch.
	        generated_output = model_holder.tokenizer_op._tokenizer.decode(
	            batch_dict[CLS_PRED][0]
	        )
	        # NOTE(review): [0] picks the first sample; [1] is presumably the
	        # generated position holding the class token - confirm against the
	        # layout of "model.out.scores".
	        positive_id = self.positive_token_id(model_holder)
	        score = batch_dict["model.out.scores"][0][1][positive_id].item()

	        return generated_output, score

	    def create_and_run_prompt(self, model_name, protein1, protein2):
	        """Gradio callback: build the prompt, run the model, decode the result.

	        Args:
	            model_name: key into ``self.model_dict`` selecting the model holder.
	            protein1 (str): sequence of the first protein.
	            protein2 (str): sequence of the second protein.

	        Returns:
	            tuple: ``(prompt, generated_output, score)``, in the order of the
	            output widgets wired up in ``create_demo``.
	        """
	        model_holder = self.model_dict[model_name]
	        sample_inputs = {"prot1": protein1, "prot2": protein2}
	        sample_dict = self.crate_sample_dict(
	            sample_inputs=sample_inputs, model_holder=model_holder
	        )
	        prompt = sample_dict[ENCODER_INPUTS_STR]
	        batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
	        res = prompt, *self.decode_output(batch_dict, model_holder=model_holder)
	        return res

	    def create_demo(self, model_name_widget: gr.component):
	        """Build the Gradio widgets of the PPI demo.

	        Args:
	            model_name_widget: widget supplying the model name; it is used as
	                the first input of the run button's click handler.

	        Returns:
	            the ``gr.Group`` containing the demo, created with
	            ``visible = False``.
	        """
	        with gr.Group() as demo:
	            gr.Markdown(self.markup_text)
	            with gr.Row():
	                prot1 = gr.Textbox(
	                    label="Protein 1 sequence",
	                    # info="standard",
	                    interactive=True,
	                    lines=3,
	                    value=self.examples["protein_calmodulin"],
	                )
	                prot2 = gr.Textbox(
	                    label="Protein 2 sequence",
	                    # info="standard",
	                    interactive=True,
	                    lines=3,
	                    value=self.examples["protein_calcineurin"],
	                )
	            with gr.Row():
	                run_mammal: gr.Button = gr.Button(
	                    "Run Mammal prompt for Protein-Protein Interaction",
	                    variant="primary",
	                )
	            with gr.Row():
	                prompt_box = gr.Textbox(label="Mammal prompt", lines=5)
	            with gr.Row():
	                decoded = gr.Textbox(label="Mammal output")
	                score_box = gr.Number(label="PPI score")
	                # Output order matches the tuple returned by
	                # create_and_run_prompt: prompt, decoded text, score.
	                run_mammal.click(
	                    fn=self.create_and_run_prompt,
	                    inputs=[model_name_widget, prot1, prot2],
	                    outputs=[prompt_box, decoded, score_box],
	                )
	            with gr.Row():
	                gr.Markdown(
	                    "```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting"
	                )
	            # The group starts hidden; presumably the surrounding demo
	            # framework makes it visible when this task is selected.
	            demo.visible = False
	        return demo