matanninio committed
Commit b93c8a7
1 Parent(s): e3cb71b

cleanup and normalization of tasks

mammal_demo/demo_framework.py CHANGED
@@ -11,6 +11,8 @@ class MammalObjectBroker:
         model_path: str,
         name: str | None = None,
         task_list: list[str] | None = None,
+        *,
+        force_preload=False,
     ) -> None:
         self.model_path = model_path
         if name is None:
@@ -22,12 +24,14 @@ class MammalObjectBroker:
         self.tasks = task_list
         self._model: Mammal | None = None
         self._tokenizer_op = None
+        if force_preload:
+            self.force_preload()
 
     @property
     def model(self) -> Mammal:
         if self._model is None:
             self._model = Mammal.from_pretrained(self.model_path)
-        self._model.eval()
+            self._model.eval()
         return self._model
 
     @property
@@ -36,6 +40,11 @@ class MammalObjectBroker:
         self._tokenizer_op = ModularTokenizerOp.from_pretrained(self.model_path)
         return self._tokenizer_op
 
+    def force_preload(self):
+        """pre-load the model and tokenizer (in this order)"""
+        _ = self.model
+        _ = self.tokenizer_op
+
 
 class MammalTask(ABC):
     def __init__(self, name: str, model_dict: dict[str, MammalObjectBroker]) -> None:
@@ -44,19 +53,6 @@ class MammalTask(ABC):
         self._demo = None
         self.model_dict = model_dict
 
-    # @abstractmethod
-    # def _generate_prompt(self, **kwargs) -> str:
-    #     """Formatting prompt to match pre-training syntax
-
-    #     Args:
-    #         prot1 (_type_): _description_
-    #         prot2 (_type_): _description_
-
-    #     Raises:
-    #         No: _description_
-    #     """
-    #     raise NotImplementedError()
-
     @abstractmethod
     def crate_sample_dict(
         self, sample_inputs: dict, model_holder: MammalObjectBroker
@@ -97,10 +93,39 @@ class MammalTask(ABC):
     def decode_output(self, batch_dict, model: Mammal) -> list:
         raise NotImplementedError()
 
-    # self._setup()
+    # classification helpers
+    @staticmethod
+    def positive_token_id(tokenizer_op: ModularTokenizerOp) -> int:
+        """token for positive binding
+
+        Args:
+            tokenizer_op (ModularTokenizerOp): tokenizer op of the model
+
+        Returns:
+            int: id of positive binding token
+        """
+        return tokenizer_op.get_token_id("<1>")
+
+    @staticmethod
+    def negative_token_id(tokenizer_op: ModularTokenizerOp) -> int:
+        """token for negative binding
+
+        Args:
+            tokenizer_op (ModularTokenizerOp): tokenizer op of the model
+
+        Returns:
+            int: id of negative binding token
+        """
+        return tokenizer_op.get_token_id("<0>")
+
+    @staticmethod
+    def get_label_from_token(tokenizer_op: ModularTokenizerOp, token_id):
 
-    # def _setup(self):
-    #     pass
+        label_mapping = {
+            MammalTask.negative_token_id(tokenizer_op): "negative",
+            MammalTask.positive_token_id(tokenizer_op): "positive",
+        }
+        return label_mapping.get(token_id, token_id)
 
 
 class TaskRegistry(dict[str, MammalTask]):
@@ -114,7 +139,9 @@ class TaskRegistry(dict[str, MammalTask]):
 class ModelRegistry(dict[str, MammalObjectBroker]):
     """just a dictionary with a register models"""
 
-    def register_model(self, model_path, task_list=None, name=None):
+    def register_model(
+        self, model_path, task_list=None, name=None, *, force_preload=False
+    ):
         """register a model and return the name of the model
         Args:
             model_path (_type_): _description_
@@ -124,7 +151,10 @@ class ModelRegistry(dict[str, MammalObjectBroker]):
             str: model name
         """
         model_holder = MammalObjectBroker(
-            model_path=model_path, task_list=task_list, name=name
+            model_path=model_path,
+            task_list=task_list,
+            name=name,
+            force_preload=force_preload,
         )
         self[model_holder.name] = model_holder
         return model_holder.name
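
The reworked broker and registry can be exercised end to end. A minimal usage sketch, assuming the mammal_demo package from this repo is importable; the checkpoint path and task name below are illustrative placeholders, not part of this commit:

    from mammal_demo.demo_framework import ModelRegistry

    model_registry = ModelRegistry()
    # force_preload=True eagerly loads both the model and the tokenizer, so the
    # first demo request does not pay the Mammal.from_pretrained() cost.
    model_name = model_registry.register_model(
        "ibm/biomed.omics.bl.sm.ma-ted-458m",  # placeholder checkpoint path
        task_list=["PPI"],
        force_preload=True,
    )
    broker = model_registry[model_name]
    model = broker.model  # already loaded and in eval() mode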
mammal_demo/ppi_task.py CHANGED
@@ -1,10 +1,12 @@
 import gradio as gr
 import torch
+from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
 from mammal.keys import (
     CLS_PRED,
     ENCODER_INPUTS_ATTENTION_MASK,
     ENCODER_INPUTS_STR,
     ENCODER_INPUTS_TOKENS,
+    SCORES,
 )
 from mammal.model import Mammal
 
@@ -24,24 +26,12 @@ class PpiTask(MammalTask):
 
     Given two protein sequences, estimate if the proteins interact or not."""
 
-    @staticmethod
-    def positive_token_id(model_holder: MammalObjectBroker):
-        """token for positive binding
-
-        Args:
-            model (MammalTrainedModel): model holding tokenizer
-
-        Returns:
-            int: id of positive binding token
-        """
-        return model_holder.tokenizer_op.get_token_id("<1>")
-
-    def generate_prompt(self, prot1, prot2):
+    def generate_prompt(self, protein_seq_1, protein_seq_2):
         """Formatting prompt to match pre-training syntax
 
         Args:
-            prot1 (str): sequance of protein number 1
-            prot2 (str): sequance of protein number 2
+            protein_seq_1 (str): sequence of protein number 1
+            protein_seq_2 (str): sequence of protein number 2
 
         Returns:
             str: prompt
@@ -49,9 +39,9 @@ class PpiTask(MammalTask):
         prompt = (
             "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
             + "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
-            + f"<SEQUENCE_NATURAL_START>{prot1}<SEQUENCE_NATURAL_END>"
+            + f"<SEQUENCE_NATURAL_START>{protein_seq_1}<SEQUENCE_NATURAL_END>"
             + "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
-            + f"<SEQUENCE_NATURAL_START>{prot2}<SEQUENCE_NATURAL_END><EOS>"
+            + f"<SEQUENCE_NATURAL_START>{protein_seq_2}<SEQUENCE_NATURAL_END><EOS>"
         )
         return prompt
 
@@ -74,6 +64,7 @@ class PpiTask(MammalTask):
         sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(
             sample_dict[ENCODER_INPUTS_ATTENTION_MASK]
         )
+
         return sample_dict
 
     def run_model(self, sample_dict, model: Mammal):
@@ -86,27 +77,26 @@ class PpiTask(MammalTask):
         )
         return batch_dict
 
-    def decode_output(self, batch_dict, model_holder: MammalObjectBroker):
+    def decode_output(self, batch_dict, tokenizer_op: ModularTokenizerOp) -> list:
 
         # Get output
-        generated_output = model_holder.tokenizer_op._tokenizer.decode(
-            batch_dict[CLS_PRED][0]
-        )
-        score = batch_dict["model.out.scores"][0][1][
-            self.positive_token_id(model_holder)
-        ].item()
+        generated_output = tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0])
+        score = batch_dict[SCORES][0][1][self.positive_token_id(tokenizer_op)].item()
 
-        return generated_output, score
+        ans = [generated_output, score]
+        return ans
 
-    def create_and_run_prompt(self, model_name, protein1, protein2):
+    def create_and_run_prompt(self, model_name, protein_seq_1, protein_seq_2):
         model_holder = self.model_dict[model_name]
-        sample_inputs = {"prot1": protein1, "prot2": protein2}
+        sample_inputs = {"protein_seq_1": protein_seq_1, "protein_seq_2": protein_seq_2}
         sample_dict = self.crate_sample_dict(
             sample_inputs=sample_inputs, model_holder=model_holder
         )
         prompt = sample_dict[ENCODER_INPUTS_STR]
         batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
-        res = prompt, *self.decode_output(batch_dict, model_holder=model_holder)
+        res = prompt, *self.decode_output(
+            batch_dict, tokenizer_op=model_holder.tokenizer_op
+        )
         return res
 
     def create_demo(self, model_name_widget: gr.component):
@@ -119,14 +109,14 @@ class PpiTask(MammalTask):
         with gr.Group() as demo:
             gr.Markdown(self.markup_text)
             with gr.Row():
-                prot1 = gr.Textbox(
+                protein_seq_1 = gr.Textbox(
                     label="Protein 1 sequence",
                     # info="standard",
                     interactive=True,
                     lines=3,
                     value=self.examples["protein_calmodulin"],
                 )
-                prot2 = gr.Textbox(
+                protein_seq_2 = gr.Textbox(
                     label="Protein 2 sequence",
                     # info="standard",
                     interactive=True,
@@ -145,7 +135,7 @@ class PpiTask(MammalTask):
             score_box = gr.Number(label="PPI score")
             run_mammal.click(
                 fn=self.create_and_run_prompt,
-                inputs=[model_name_widget, prot1, prot2],
+                inputs=[model_name_widget, protein_seq_1, protein_seq_2],
                 outputs=[prompt_box, decoded, score_box],
             )
             with gr.Row():
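
With decode_output now keyed on SCORES and on the shared positive_token_id helper, the PPI flow returns the prompt, the decoded class token, and the positive-class probability. A sketch of the intended call, assuming a PpiTask constructed like the other tasks and an already-registered model name; the sequences are illustrative fragments:

    task = PpiTask(model_dict=model_registry)
    prompt, decoded, score = task.create_and_run_prompt(
        model_name,
        "MADQLTEEQIAEFKEAFSLF",  # illustrative protein 1 fragment
        "MKTAYIAKQRQISFVKSHFS",  # illustrative protein 2 fragment
    )
    # score is the probability mass on the "<1>" (positive binding) token at
    # the classification position, read from batch_dict[SCORES][0][1]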
mammal_demo/ps_task.py CHANGED
@@ -10,6 +10,9 @@ from mammal.model import Mammal
 
 from mammal_demo.demo_framework import MammalObjectBroker, MammalTask
 
+data_preprocessing = ProteinSolubilityTask.data_preprocessing
+process_model_output = ProteinSolubilityTask.process_model_output
+
 
 class PsTask(MammalTask):
     def __init__(self, model_dict):
@@ -34,7 +37,7 @@ Given the protein sequence, estimate if it's water-soluble.
             dict: sample_dict for feeding into model
         """
         sample_dict = dict(sample_inputs)  # shallow copy
-        sample_dict = ProteinSolubilityTask.data_preprocessing(
+        sample_dict = data_preprocessing(
            sample_dict=sample_dict,
            protein_sequence_key="protein_seq",
            tokenizer_op=model_holder.tokenizer_op,
@@ -57,7 +60,7 @@ Given the protein sequence, estimate if it's water-soluble.
         """
         Extract predicted class and scores
         """
-        ans_dict = ProteinSolubilityTask.process_model_output(
+        ans_dict = process_model_output(
            tokenizer_op=tokenizer_op,
            decoder_output=batch_dict[CLS_PRED][0],
            decoder_output_scores=batch_dict[SCORES][0],
@@ -72,11 +75,11 @@ Given the protein sequence, estimate if it's water-soluble.
 
     def create_and_run_prompt(self, model_name, protein_seq):
         model_holder = self.model_dict[model_name]
-        inputs = {
+        sample_inputs = {
             "protein_seq": protein_seq,
         }
         sample_dict = self.crate_sample_dict(
-            sample_inputs=inputs, model_holder=model_holder
+            sample_inputs=sample_inputs, model_holder=model_holder
         )
         prompt = sample_dict[ENCODER_INPUTS_STR]
         batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
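
The two new module-level names are plain aliases of the ProteinSolubilityTask static helpers, so behaviour is unchanged; they only shorten the call sites inside PsTask. Assuming the ProteinSolubilityTask import already at the top of ps_task.py (outside this hunk), the equivalence is:

    from mammal.examples.protein_solubility.task import ProteinSolubilityTask  # assumed import path

    data_preprocessing = ProteinSolubilityTask.data_preprocessing
    process_model_output = ProteinSolubilityTask.process_model_output
    assert data_preprocessing is ProteinSolubilityTask.data_preprocessing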
mammal_demo/tcr_task.py CHANGED
@@ -29,7 +29,7 @@ class TcrTask(MammalTask):
 Given a TCR beta chain and epitope amino acid sequences, estimate the binding affinity score.
 """
 
-    def create_prompt(self, tcr_beta_seq, epitope_seq):
+    def generate_prompt(self, tcr_beta_seq, epitope_seq):
         prompt = (
             "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
             + f"<@TOKENIZER-TYPE=AA><MOLECULAR_ENTITY><MOLECULAR_ENTITY_TCR_BETA_VDJ><SEQUENCE_NATURAL_START>{tcr_beta_seq}<SEQUENCE_NATURAL_END>"
@@ -48,20 +48,21 @@ Given a TCR beta chain and epitope amino acid sequences, estimate the binding af
             dict: sample_dict for feeding into model
         """
         sample_dict = dict()
-        sample_dict[ENCODER_INPUTS_STR] = self.create_prompt(**sample_inputs)
-        tokenizer_op = model_holder.tokenizer_op
-        model = model_holder.model
-        tokenizer_op(
+        prompt = self.generate_prompt(**sample_inputs)
+        sample_dict[ENCODER_INPUTS_STR] = prompt
+
+        # Tokenize
+        sample_dict = model_holder.tokenizer_op(
             sample_dict=sample_dict,
             key_in=ENCODER_INPUTS_STR,
             key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
             key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
         )
         sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(
-            sample_dict[ENCODER_INPUTS_TOKENS], device=model.device
+            sample_dict[ENCODER_INPUTS_TOKENS]
         )
         sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(
-            sample_dict[ENCODER_INPUTS_ATTENTION_MASK], device=model.device
+            sample_dict[ENCODER_INPUTS_ATTENTION_MASK]
         )
 
         return sample_dict
@@ -76,47 +77,26 @@ Given a TCR beta chain and epitope amino acid sequences, estimate the binding af
         )
         return batch_dict
 
-    @staticmethod
-    def positive_token_id(tokenizer_op: ModularTokenizerOp):
-        """token for positive binding
-
-        Args:
-            model (MammalTrainedModel): model holding tokenizer
-
-        Returns:
-            int: id of positive binding token
-        """
-        return tokenizer_op.get_token_id("<1>")
-
-    @staticmethod
-    def negative_token_id(tokenizer_op: ModularTokenizerOp):
-        """token for negative binding
-
-        Args:
-            model (MammalTrainedModel): model holding tokenizer
-
-        Returns:
-            int: id of negative binding token
-        """
-        return tokenizer_op.get_token_id("<0>")
-
     def decode_output(self, batch_dict, tokenizer_op: ModularTokenizerOp) -> list:
         """
         Extract predicted class and scores
         """
 
-        # positive_token_id = self.positive_token_id(tokenizer_op)
-        # negative_token_id = self.negative_token_id(tokenizer_op)
+        positive_token_id = self.positive_token_id(tokenizer_op)
+        negative_token_id = self.negative_token_id(tokenizer_op)
 
-        negative_token_id = tokenizer_op.get_token_id("<0>")
-        positive_token_id = tokenizer_op.get_token_id("<1>")
+        # negative_token_id = tokenizer_op.get_token_id("<0>")
+        # positive_token_id = tokenizer_op.get_token_id("<1>")
 
         label_id_to_int = {
-            negative_token_id: 0,
-            positive_token_id: 1,
+            negative_token_id: "negative",
+            positive_token_id: "positive",
         }
         classification_position = 1
 
+        # Get output
+        generated_output = tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0])
+
         decoder_output = batch_dict[CLS_PRED][0]
         decoder_output_scores = batch_dict[SCORES][0]
 
@@ -126,7 +106,7 @@ Given a TCR beta chain and epitope amino acid sequences, estimate the binding af
         scores = [None]
 
         ans = [
-            tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0]),
+            generated_output,
             label_id_to_int.get(int(decoder_output[classification_position]), -1),
             scores.item(),
         ]
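
After this cleanup the TCR task reuses the positive_token_id/negative_token_id helpers now hosted on MammalTask, so all classification tasks share the same "<0>"/"<1>" token convention. A minimal sketch of the mapping decode_output applies, assuming tokenizer_op and the decoder outputs come from a loaded broker as above:

    positive_token_id = tokenizer_op.get_token_id("<1>")  # positive binding
    negative_token_id = tokenizer_op.get_token_id("<0>")  # negative binding
    label_id_to_int = {negative_token_id: "negative", positive_token_id: "positive"}
    # unknown token ids fall back to -1
    label = label_id_to_int.get(int(decoder_output[classification_position]), -1)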