biomed-multi-alignment

Sleeping

App Files Files Community

matanninio committed 20 days ago

Commit

fda141d

•

1 Parent(s): 49831fb

refactoring to make code more elegant and cleanups

Browse files

Files changed (4) hide show

app.py +36 -38
mammal_demo/demo_framework.py +28 -2
mammal_demo/ps_task.py +18 -19
mammal_demo/tcr_task.py +28 -38

app.py CHANGED Viewed

@@ -1,57 +1,47 @@
 import gradio as gr
-from mammal_demo.demo_framework import MammalObjectBroker, MammalTask
 from mammal_demo.dti_task import DtiTask
 from mammal_demo.ppi_task import PpiTask
-from mammal_demo.tcr_task import TcrTask
 from mammal_demo.ps_task import PsTask
-all_tasks: dict[str, MammalTask] = dict()
-all_models: dict[str, MammalObjectBroker] = dict()
 # first create the required tasks
 # Note that the tasks need access to the models, as the model to use depends on the state of the widget
 # we pass the all_models dict and update it when we actually have the models.
-ppi_task = PpiTask(model_dict=all_models)
-all_tasks[ppi_task.name] = ppi_task
-tdi_task = DtiTask(model_dict=all_models)
-all_tasks[tdi_task.name] = tdi_task
-tcr_task = TcrTask(model_dict=all_models)
-all_tasks[tcr_task.name] = tcr_task
-ps_task = PsTask(model_dict=all_models)
-all_tasks[ps_task.name] = ps_task
 # create the model holders. hold the model and the tokenizer, lazy download
 # note that the list of relevant tasks needs to be stated.
-ppi_model = MammalObjectBroker(
-    model_path="ibm/biomed.omics.bl.sm.ma-ted-458m", task_list=[ppi_task.name,tcr_task.name]
-)
-all_models[ppi_model.name] = ppi_model
-tdi_model = MammalObjectBroker(
     model_path="ibm/biomed.omics.bl.sm.ma-ted-458m.dti_bindingdb_pkd",
-task_list=[tdi_task.name],
 )
-all_models[tdi_model.name] = tdi_model
-tcr_model = MammalObjectBroker(
-    model_path= "ibm/biomed.omics.bl.sm.ma-ted-458m.tcr_epitope_bind",
-    task_list=[tcr_task.name]
 )
-all_models[tcr_model.name] = tcr_model
-ps_model = MammalObjectBroker(
     model_path="ibm/biomed.omics.bl.sm.ma-ted-458m.protein_solubility",
-    task_list=[ps_task.name]
 )
-all_models[ps_model.name] = ps_model
 def create_application():
     def task_change(value):
@@ -62,13 +52,18 @@ def create_application():
             if value in model.tasks
         ]
         if choices:
-            return (gr.update(choices=choices, value=choices[0], visible=True), *visibility)
         else:
             return (gr.skip, *visibility)
         # return model_name_dropdown
     with gr.Blocks() as application:
-        task_dropdown = gr.Dropdown(choices=["Select task"] + list(all_tasks.keys()), label="Mammal Task")
         task_dropdown.interactive = True
         model_name_dropdown = gr.Dropdown(
             choices=[
@@ -85,7 +80,10 @@ def create_application():
             task_change,
             inputs=[task_dropdown],
             outputs=[model_name_dropdown]
-            + [all_tasks[task].demo(model_name_widgit=model_name_dropdown) for task in all_tasks],
         )
         # def set_demo_vis(main_text):

 import gradio as gr
+from mammal_demo.demo_framework import (
+    ModelRegistry,
+    TaskRegistry,
+)
 from mammal_demo.dti_task import DtiTask
 from mammal_demo.ppi_task import PpiTask
 from mammal_demo.ps_task import PsTask
+from mammal_demo.tcr_task import TcrTask
+all_tasks = TaskRegistry()
+all_models = ModelRegistry()
 # first create the required tasks
 # Note that the tasks need access to the models, as the model to use depends on the state of the widget
 # we pass the all_models dict and update it when we actually have the models.
+ppi_task = all_tasks.register_task(PpiTask(model_dict=all_models))
+tdi_task = all_tasks.register_task(DtiTask(model_dict=all_models))
+tcr_task = all_tasks.register_task(TcrTask(model_dict=all_models))
+ps_task = all_tasks.register_task(PsTask(model_dict=all_models))
 # create the model holders. hold the model and the tokenizer, lazy download
 # note that the list of relevant tasks needs to be stated.
+all_models.register_model(
     model_path="ibm/biomed.omics.bl.sm.ma-ted-458m.dti_bindingdb_pkd",
+    task_list=[tdi_task],
 )
+all_models.register_model(
+    model_path="ibm/biomed.omics.bl.sm.ma-ted-458m.tcr_epitope_bind",
+    task_list=[tcr_task],
 )
+all_models.register_model(
     model_path="ibm/biomed.omics.bl.sm.ma-ted-458m.protein_solubility",
+    task_list=[ps_task],
 )
+all_models.register_model(
+    model_path="ibm/biomed.omics.bl.sm.ma-ted-458m",
+    task_list=[ppi_task, tcr_task],
+)
+all_models.register_model("https://huggingface.co/ibm/biomed.omics.bl.sm.ma-ted-458m.moleculenet_clintox_tox")
+all_models.register_model("https://huggingface.co/ibm/biomed.omics.bl.sm.ma-ted-458m.moleculenet_clintox_fda")
+all_models.register_model("https://huggingface.co/ibm/biomed.omics.bl.sm.ma-ted-458m.moleculenet_bbbp")
 def create_application():
     def task_change(value):
             if value in model.tasks
         ]
         if choices:
+            return (
+                gr.update(choices=choices, value=choices[0], visible=True),
+                *visibility,
+            )
         else:
             return (gr.skip, *visibility)
         # return model_name_dropdown
     with gr.Blocks() as application:
+        task_dropdown = gr.Dropdown(
+            choices=["Select task"] + list(all_tasks.keys()), label="Mammal Task"
+        )
         task_dropdown.interactive = True
         model_name_dropdown = gr.Dropdown(
             choices=[
             task_change,
             inputs=[task_dropdown],
             outputs=[model_name_dropdown]
+            + [
+                all_tasks[task].demo(model_name_widgit=model_name_dropdown)
+                for task in all_tasks
+            ],
         )
         # def set_demo_vis(main_text):

mammal_demo/demo_framework.py CHANGED Viewed

@@ -90,15 +90,41 @@ class MammalTask(ABC):
     def demo(self, model_name_widgit: gr.component = None):
         if self._demo is None:
-            model_name_widget: gr.component
             self._demo = self.create_demo(model_name_widget=model_name_widgit)
         return self._demo
     @abstractmethod
-    def decode_output(self, batch_dict, model: Mammal):
         raise NotImplementedError()
     # self._setup()
     # def _setup(self):
     #     pass

     def demo(self, model_name_widgit: gr.component = None):
         if self._demo is None:
             self._demo = self.create_demo(model_name_widget=model_name_widgit)
         return self._demo
     @abstractmethod
+    def decode_output(self, batch_dict, model: Mammal) -> list:
         raise NotImplementedError()
     # self._setup()
     # def _setup(self):
     #     pass
+class TaskRegistry(dict[str, MammalTask]):
+    """just a dictionary with a register method"""
+    def register_task(self, task: MammalTask):
+        self[task.name] = task
+        return task.name
+class ModelRegistry(dict[str, MammalObjectBroker]):
+    """just a dictionary with a register_model method"""
+    def register_model(self, model_path, task_list=None, name=None):
+        """register a model and return the name of the model
+        Args:
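+            model_path (str): path of the model, forwarded to MammalObjectBroker
+            task_list (optional list): names of the tasks this model serves, forwarded to MammalObjectBroker
+            name (optional str): explicit name for the model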
+        Returns:
+            str: model name
+        """
+        model_holder = MammalObjectBroker(
+            model_path=model_path, task_list=task_list, name=name
+        )
+        self[model_holder.name] = model_holder
+        return model_holder.name

mammal_demo/ps_task.py CHANGED Viewed

@@ -1,10 +1,9 @@
 import gradio as gr
-import torch
 from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
 from mammal.examples.protein_solubility.task import ProteinSolubilityTask
 from mammal.keys import (
-    ENCODER_INPUTS_STR,
     CLS_PRED,
     SCORES,
 )
 from mammal.model import Mammal
@@ -25,8 +24,6 @@ class PsTask(MammalTask):
 Given the protein sequance, estimate if it's soluble or insoluble.
 """
     def crate_sample_dict(self, sample_inputs: dict, model_holder: MammalObjectBroker):
         """convert sample_inputs to sample_dict including creating a proper prompt
@@ -36,12 +33,12 @@ Given the protein sequance, estimate if it's soluble or insoluble.
         Returns:
            dict: sample_dict for feeding into model
         """
-        sample_dict = dict(sample_inputs) # shallow copy
         sample_dict = ProteinSolubilityTask.data_preprocessing(
-        sample_dict=sample_dict,
-        protein_sequence_key="protein_seq",
-        tokenizer_op=model_holder.tokenizer_op,
-        device=model_holder.model.device,
         )
         return sample_dict
@@ -56,8 +53,7 @@ Given the protein sequance, estimate if it's soluble or insoluble.
         )
         return batch_dict
-    def decode_output(self, batch_dict, tokenizer_op: ModularTokenizerOp)-> dict:
         """
         Extract predicted class and scores
         """
@@ -71,11 +67,9 @@ Given the protein sequance, estimate if it's soluble or insoluble.
             ans_dict["pred"],
             ans_dict["not_normalized_scores"].item(),
             ans_dict["normalized_scores"].item(),
-        ]
         return ans
     def create_and_run_prompt(self, model_name, protein_seq):
         model_holder = self.model_dict[model_name]
         inputs = {
@@ -86,14 +80,13 @@ Given the protein sequance, estimate if it's soluble or insoluble.
         )
         prompt = sample_dict[ENCODER_INPUTS_STR]
         batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
-        res = prompt, *self.decode_output(batch_dict, tokenizer_op=model_holder.tokenizer_op)
         return res
     def create_demo(self, model_name_widget):
         with gr.Group() as demo:
             gr.Markdown(self.markup_text)
             with gr.Row():
@@ -121,7 +114,13 @@ Given the protein sequance, estimate if it's soluble or insoluble.
                 run_mammal.click(
                     fn=self.create_and_run_prompt,
                     inputs=[model_name_widget, protein_textbox],
-                    outputs=[prompt_box, decoded, predicted_class,non_norm_score,norm_score],
                 )
             demo.visible = False
             return demo

 import gradio as gr
 from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
 from mammal.examples.protein_solubility.task import ProteinSolubilityTask
 from mammal.keys import (
     CLS_PRED,
+    ENCODER_INPUTS_STR,
     SCORES,
 )
 from mammal.model import Mammal
 Given the protein sequance, estimate if it's soluble or insoluble.
 """
     def crate_sample_dict(self, sample_inputs: dict, model_holder: MammalObjectBroker):
         """convert sample_inputs to sample_dict including creating a proper prompt
         Returns:
            dict: sample_dict for feeding into model
         """
+        sample_dict = dict(sample_inputs)  # shallow copy
         sample_dict = ProteinSolubilityTask.data_preprocessing(
+            sample_dict=sample_dict,
+            protein_sequence_key="protein_seq",
+            tokenizer_op=model_holder.tokenizer_op,
+            device=model_holder.model.device,
         )
         return sample_dict
         )
         return batch_dict
+    def decode_output(self, batch_dict, tokenizer_op: ModularTokenizerOp) -> list:
         """
         Extract predicted class and scores
         """
             ans_dict["pred"],
             ans_dict["not_normalized_scores"].item(),
             ans_dict["normalized_scores"].item(),
+        ]
         return ans
     def create_and_run_prompt(self, model_name, protein_seq):
         model_holder = self.model_dict[model_name]
         inputs = {
         )
         prompt = sample_dict[ENCODER_INPUTS_STR]
         batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
+        res = prompt, *self.decode_output(
+            batch_dict, tokenizer_op=model_holder.tokenizer_op
+        )
         return res
     def create_demo(self, model_name_widget):
         with gr.Group() as demo:
             gr.Markdown(self.markup_text)
             with gr.Row():
                 run_mammal.click(
                     fn=self.create_and_run_prompt,
                     inputs=[model_name_widget, protein_textbox],
+                    outputs=[
+                        prompt_box,
+                        decoded,
+                        predicted_class,
+                        non_norm_score,
+                        norm_score,
+                    ],
                 )
             demo.visible = False
             return demo

mammal_demo/tcr_task.py CHANGED Viewed

@@ -1,12 +1,11 @@
 import gradio as gr
 import torch
 from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
-from mammal.examples.dti_bindingdb_kd.task import DtiBindingdbKdTask
 from mammal.keys import (
     ENCODER_INPUTS_STR,
     ENCODER_INPUTS_TOKENS,
-    ENCODER_INPUTS_ATTENTION_MASK,
-    CLS_PRED,
     SCORES,
 )
 from mammal.model import Mammal
@@ -16,10 +15,12 @@ from mammal_demo.demo_framework import MammalObjectBroker, MammalTask
 class TcrTask(MammalTask):
     def __init__(self, model_dict):
-        super().__init__(name="T-cell receptors-peptide binding specificity", model_dict=model_dict)
         self.description = "T-cell receptors-peptide binding specificity (TCR)"
         self.examples = {
-            "tcr_beta_seq":  "NAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLIHYSVGAGITDQGEVPNGYNVSRSTTEDFPLRLLSAAPSQTSVYFCASSYSWDRVLEQYFGPGTRLTVT",
             "epitope_seq": "LLQTGIHVRVSQPSL",
         }
         self.markup_text = """
@@ -28,20 +29,14 @@ class TcrTask(MammalTask):
 Given the TCR beta sequance and the epitope sequacne, estimate the binding specificity.
 """
-    def create_prompt(self,tcr_beta_seq, epitope_seq):
         prompt = (
-            "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"+
-            f"<@TOKENIZER-TYPE=AA><MOLECULAR_ENTITY><MOLECULAR_ENTITY_TCR_BETA_VDJ><SEQUENCE_NATURAL_START>{tcr_beta_seq}<SEQUENCE_NATURAL_END>"+
-            f"<@TOKENIZER-TYPE=AA><MOLECULAR_ENTITY><MOLECULAR_ENTITY_EPITOPE><SEQUENCE_NATURAL_START>{epitope_seq}<SEQUENCE_NATURAL_END><EOS>"
         )
-        return prompt
     def crate_sample_dict(self, sample_inputs: dict, model_holder: MammalObjectBroker):
         """convert sample_inputs to sample_dict including creating a proper prompt
@@ -52,15 +47,15 @@ Given the TCR beta sequance and the epitope sequacne, estimate the binding speci
         Returns:
            dict: sample_dict for feeding into model
         """
-        sample_dict= dict()
         sample_dict[ENCODER_INPUTS_STR] = self.create_prompt(**sample_inputs)
         tokenizer_op = model_holder.tokenizer_op
         model = model_holder.model
         tokenizer_op(
-        sample_dict=sample_dict,
-        key_in=ENCODER_INPUTS_STR,
-        key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
-        key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
         )
         sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(
             sample_dict[ENCODER_INPUTS_TOKENS], device=model.device
@@ -92,7 +87,7 @@ Given the TCR beta sequance and the epitope sequacne, estimate the binding speci
             int: id of positive binding token
         """
         return tokenizer_op.get_token_id("<1>")
     @staticmethod
     def negative_token_id(tokenizer_op: ModularTokenizerOp):
         """token for negative binding
@@ -105,15 +100,14 @@ Given the TCR beta sequance and the epitope sequacne, estimate the binding speci
         """
         return tokenizer_op.get_token_id("<0>")
-    def decode_output(self, batch_dict, tokenizer_op: ModularTokenizerOp)-> dict:
         """
         Extract predicted class and scores
         """
         # positive_token_id = self.positive_token_id(tokenizer_op)
         # negative_token_id = self.negative_token_id(tokenizer_op)
         negative_token_id = tokenizer_op.get_token_id("<0>")
         positive_token_id = tokenizer_op.get_token_id("<1>")
@@ -123,14 +117,13 @@ Given the TCR beta sequance and the epitope sequacne, estimate the binding speci
         }
         classification_position = 1
-        decoder_output=batch_dict[CLS_PRED][0]
-        decoder_output_scores=batch_dict[SCORES][0]
         if decoder_output_scores is not None:
-            scores = decoder_output_scores[classification_position,positive_token_id]
         else:
-            scores=[None]
         ans = [
             tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0]),
@@ -139,8 +132,6 @@ Given the TCR beta sequance and the epitope sequacne, estimate the binding speci
         ]
         return ans
     def create_and_run_prompt(self, model_name, tcr_beta_seq, epitope_seq):
         model_holder = self.model_dict[model_name]
         inputs = {
@@ -152,14 +143,13 @@ Given the TCR beta sequance and the epitope sequacne, estimate the binding speci
         )
         prompt = sample_dict[ENCODER_INPUTS_STR]
         batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
-        res = prompt, *self.decode_output(batch_dict, tokenizer_op=model_holder.tokenizer_op)
         return res
     def create_demo(self, model_name_widget):
         with gr.Group() as demo:
             gr.Markdown(self.markup_text)
             with gr.Row():
@@ -192,7 +182,7 @@ Given the TCR beta sequance and the epitope sequacne, estimate the binding speci
                 run_mammal.click(
                     fn=self.create_and_run_prompt,
                     inputs=[model_name_widget, tcr_textbox, epitope_textbox],
-                    outputs=[prompt_box, decoded, predicted_class,binding_score],
                 )
             demo.visible = False
             return demo

 import gradio as gr
 import torch
 from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
 from mammal.keys import (
+    CLS_PRED,
+    ENCODER_INPUTS_ATTENTION_MASK,
     ENCODER_INPUTS_STR,
     ENCODER_INPUTS_TOKENS,
     SCORES,
 )
 from mammal.model import Mammal
 class TcrTask(MammalTask):
     def __init__(self, model_dict):
+        super().__init__(
+            name="T-cell receptors-peptide binding specificity", model_dict=model_dict
+        )
         self.description = "T-cell receptors-peptide binding specificity (TCR)"
         self.examples = {
+            "tcr_beta_seq": "NAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLIHYSVGAGITDQGEVPNGYNVSRSTTEDFPLRLLSAAPSQTSVYFCASSYSWDRVLEQYFGPGTRLTVT",
             "epitope_seq": "LLQTGIHVRVSQPSL",
         }
         self.markup_text = """
 Given the TCR beta sequance and the epitope sequacne, estimate the binding specificity.
 """
+    def create_prompt(self, tcr_beta_seq, epitope_seq):
         prompt = (
+            "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
+            + f"<@TOKENIZER-TYPE=AA><MOLECULAR_ENTITY><MOLECULAR_ENTITY_TCR_BETA_VDJ><SEQUENCE_NATURAL_START>{tcr_beta_seq}<SEQUENCE_NATURAL_END>"
+            + f"<@TOKENIZER-TYPE=AA><MOLECULAR_ENTITY><MOLECULAR_ENTITY_EPITOPE><SEQUENCE_NATURAL_START>{epitope_seq}<SEQUENCE_NATURAL_END><EOS>"
         )
+        return prompt
     def crate_sample_dict(self, sample_inputs: dict, model_holder: MammalObjectBroker):
         """convert sample_inputs to sample_dict including creating a proper prompt
         Returns:
            dict: sample_dict for feeding into model
         """
+        sample_dict = dict()
         sample_dict[ENCODER_INPUTS_STR] = self.create_prompt(**sample_inputs)
         tokenizer_op = model_holder.tokenizer_op
         model = model_holder.model
         tokenizer_op(
+            sample_dict=sample_dict,
+            key_in=ENCODER_INPUTS_STR,
+            key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
+            key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
         )
         sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(
             sample_dict[ENCODER_INPUTS_TOKENS], device=model.device
             int: id of positive binding token
         """
         return tokenizer_op.get_token_id("<1>")
     @staticmethod
     def negative_token_id(tokenizer_op: ModularTokenizerOp):
         """token for negative binding
         """
         return tokenizer_op.get_token_id("<0>")
+    def decode_output(self, batch_dict, tokenizer_op: ModularTokenizerOp) -> list:
         """
         Extract predicted class and scores
         """
         # positive_token_id = self.positive_token_id(tokenizer_op)
         # negative_token_id = self.negative_token_id(tokenizer_op)
         negative_token_id = tokenizer_op.get_token_id("<0>")
         positive_token_id = tokenizer_op.get_token_id("<1>")
         }
         classification_position = 1
+        decoder_output = batch_dict[CLS_PRED][0]
+        decoder_output_scores = batch_dict[SCORES][0]
         if decoder_output_scores is not None:
+            scores = decoder_output_scores[classification_position, positive_token_id]
         else:
+            scores = [None]
         ans = [
             tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0]),
         ]
         return ans
     def create_and_run_prompt(self, model_name, tcr_beta_seq, epitope_seq):
         model_holder = self.model_dict[model_name]
         inputs = {
         )
         prompt = sample_dict[ENCODER_INPUTS_STR]
         batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
+        res = prompt, *self.decode_output(
+            batch_dict, tokenizer_op=model_holder.tokenizer_op
+        )
         return res
     def create_demo(self, model_name_widget):
         with gr.Group() as demo:
             gr.Markdown(self.markup_text)
             with gr.Row():
                 run_mammal.click(
                     fn=self.create_and_run_prompt,
                     inputs=[model_name_widget, tcr_textbox, epitope_textbox],
+                    outputs=[prompt_box, decoded, predicted_class, binding_score],
                 )
             demo.visible = False
             return demo