Massimo G. Totaro committed on
Commit
fba8f5e
β€’
1 Parent(s): 82caf01

update fix

Browse files
Files changed (8) hide show
  1. .gitignore +3 -1
  2. LICENSE +11 -0
  3. README.md +2 -2
  4. app.py +90 -19
  5. data.py +169 -40
  6. instructions.md +39 -13
  7. model.py +74 -47
  8. requirements.txt +1 -1
.gitignore CHANGED
@@ -1 +1,3 @@
1
- */
 
 
 
1
+ Dockerfile
2
+ *.ipynb
3
+ */
LICENSE ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2021, Massimo G. Totaro All rights reserved.
2
+
3
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
4
+
5
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
6
+
7
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8
+
9
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
10
+
11
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
README.md CHANGED
@@ -4,10 +4,10 @@ emoji: πŸ“ˆ
4
  colorFrom: gray
5
  colorTo: red
6
  sdk: gradio
7
- sdk_version: 3.33.1
8
  app_file: app.py
9
  pinned: false
10
- license: eupl-1.1
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
4
  colorFrom: gray
5
  colorTo: red
6
  sdk: gradio
7
+ sdk_version: 4.8.0
8
  app_file: app.py
9
  pinned: false
10
+ license: bsd-2-clause
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,26 +1,97 @@
1
- from model import MODELS
2
- from data import Data
3
- import gradio as gr
4
  from tempfile import NamedTemporaryFile
 
 
 
 
 
 
 
5
 
6
- # scoring strategies
7
- SCORING = ["masked-marginals (more accurate)", "wt-marginals (faster)"]
8
 
9
  def app(*argv):
10
- seq, trg, model_name, scoring_strategy, out_file, *_ = argv
11
- html = Data(seq, trg, model_name, scoring_strategy, out_file).calculate()
12
- return html, gr.File.update(value=out_file.name, visible=True)
 
 
 
 
 
 
 
 
 
 
 
13
 
14
- with gr.Blocks() as demo, NamedTemporaryFile(mode='w+', prefix='out_', suffix='.csv') as out_file, open("instructions.md", "r") as md:
15
- gr.Markdown(md.read())
16
- seq = gr.Textbox(lines=2, label="Sequence", placeholder="Sequence here...", value='MVEQYLLEAIVRDARDGITISDCSRPDNPLVFVNDAFTRMTGYDAEEVIGKNCRFLQRGDINLSAVHTIKIAMLTHEPCLVTLKNYRKDGTIFWNELSLTPIINKNGLITHYLGIQKDVSAQVILNQTLHEENHLLKSNKEMLEYLVNIDALTGLHNRRFLEDQLVIQWKLASRHINTITIFMIDIDYFKAFNDTYGHTAGDEALRTIAKTLNNCFMRGSDFVARYGGEEFTILAIGMTELQAHEYSTKLVQKIENLNIHHKGSPLGHLTISLGYSQANPQYHNDQNLVIEQADRALYSAKVEGKNRAVAYREQ')
17
- trg = gr.Textbox(lines=1, label="Substitutions", placeholder="Substitutions here...", value="61 214 19 30 122 140")
18
- model_name = gr.Dropdown(MODELS, label="Model", value=MODELS[1])
19
- scoring_strategy = gr.Dropdown(SCORING, label="Scoring strategy", value=SCORING[1])
20
- btn = gr.Button(value="Run")
21
- out = gr.HTML()
22
- bto = gr.File(value=out_file.name, visible=False, label="Download", file_count='single', interactive=False)
23
- btn.click(fn=app, inputs=[seq, trg, model_name, scoring_strategy, bto], outputs=[out, bto])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
 
25
  if __name__ == "__main__":
26
- demo.launch()
 
 
 
 
1
  from tempfile import NamedTemporaryFile
2
+ from gradio import Blocks, Button, Checkbox, Dropdown, Examples, File, HTML, Markdown, Textbox
3
+
4
+ from model import get_models
5
+ from data import Data
6
+
7
+ # Define scoring strategies
8
+ SCORING = ["wt-marginals", "masked-marginals"]
9
 
10
+ # Get available models
11
+ MODELS = get_models()
12
 
13
  def app(*argv):
14
+ """
15
+ Main application function
16
+ """
17
+ # Unpack the arguments
18
+ seq, trg, model_name, *_ = argv
19
+ scoring = SCORING[scoring_strategy.value]
20
+ try:
21
+ # Calculate the data based on the input parameters
22
+ data = Data(seq, trg, model_name, scoring, out_file).calculate()
23
+ except Exception as e:
24
+ # If an error occurs, return an HTML error message
25
+ return f'<!DOCTYPE html><html><body><h1 style="background-color:#F70D1A;text-align:center;">Error: {str(e)}</h1></body></html>', None
26
+ # If no error occurs, return the calculated data
27
+ return repr(data), File(value=out_file.name, visible=True)
28
 
29
+ # Create the Gradio interface
30
+ with open("instructions.md", "r", encoding="utf-8") as md,\
31
+ NamedTemporaryFile(mode='w+') as out_file,\
32
+ Blocks() as esm_scan:
33
+
34
+ # Define the interface components
35
+ Markdown(md.read())
36
+ seq = Textbox(
37
+ lines=2,
38
+ label="Sequence",
39
+ placeholder="FASTA sequence here...",
40
+ value=''
41
+ )
42
+ trg = Textbox(
43
+ lines=1,
44
+ label="Substitutions",
45
+ placeholder="Substitutions here...",
46
+ value=""
47
+ )
48
+ model_name = Dropdown(MODELS, label="Model", value="facebook/esm2_t30_150M_UR50D")
49
+ scoring_strategy = Checkbox(value=True, label="Use masked-marginals scoring")
50
+ btn = Button(value="Run")
51
+ out = HTML()
52
+ bto = File(
53
+ value=out_file.name,
54
+ visible=False,
55
+ label="Download",
56
+ file_count='single',
57
+ interactive=False
58
+ )
59
+ btn.click(
60
+ fn=app,
61
+ inputs=[seq, trg, model_name],
62
+ outputs=[out, bto]
63
+ )
64
+ ex = Examples(
65
+ examples=[
66
+ [
67
+ "MVEQYLLEAIVRDARDGITISDCSRPDNPLVFVNDAFTRMTGYDAEEVIGKNCRFLQRGDINLSAVHTIKIAMLTHEPCLVTLKNYRKDGTIFWNELSLTPIINKNGLITHYLGIQKDVSAQVILNQTLHEENHLLKSNKEMLEYLVNIDALTGLHNRRFLEDQLVIQWKLASRHINTITIFMIDIDYFKAFNDTYGHTAGDEALRTIAKTLNNCFMRGSDFVARYGGEEFTILAIGMTELQAHEYSTKLVQKIENLNIHHKGSPLGHLTISLGYSQANPQYHNDQNLVIEQADRALYSAKVEGKNRAVAYREQ",
68
+ "deep mutational scanning",
69
+ "facebook/esm2_t6_8M_UR50D"
70
+ ],
71
+ [
72
+ "MVEQYLLEAIVRDARDGITISDCSRPDNPLVFVNDAFTRMTGYDAEEVIGKNCRFLQRGDINLSAVHTIKIAMLTHEPCLVTLKNYRKDGTIFWNELSLTPIINKNGLITHYLGIQKDVSAQVILNQTLHEENHLLKSNKEMLEYLVNIDALTGLHNRRFLEDQLVIQWKLASRHINTITIFMIDIDYFKAFNDTYGHTAGDEALRTIAKTLNNCFMRGSDFVARYGGEEFTILAIGMTELQAHEYSTKLVQKIENLNIHHKGSPLGHLTISLGYSQANPQYHNDQNLVIEQADRALYSAKVEGKNRAVAYREQ",
73
+ "217 218 219",
74
+ "facebook/esm2_t12_35M_UR50D"
75
+ ],
76
+ [
77
+ "MVEQYLLEAIVRDARDGITISDCSRPDNPLVFVNDAFTRMTGYDAEEVIGKNCRFLQRGDINLSAVHTIKIAMLTHEPCLVTLKNYRKDGTIFWNELSLTPIINKNGLITHYLGIQKDVSAQVILNQTLHEENHLLKSNKEMLEYLVNIDALTGLHNRRFLEDQLVIQWKLASRHINTITIFMIDIDYFKAFNDTYGHTAGDEALRTIAKTLNNCFMRGSDFVARYGGEEFTILAIGMTELQAHEYSTKLVQKIENLNIHHKGSPLGHLTISLGYSQANPQYHNDQNLVIEQADRALYSAKVEGKNRAVAYREQ",
78
+ "R218K R218S R218N R218A R218V R218D",
79
+ "facebook/esm2_t30_150M_UR50D",
80
+ ],
81
+ [
82
+ "MVEQYLLEAIVRDARDGITISDCSRPDNPLVFVNDAFTRMTGYDAEEVIGKNCRFLQRGDINLSAVHTIKIAMLTHEPCLVTLKNYRKDGTIFWNELSLTPIINKNGLITHYLGIQKDVSAQVILNQTLHEENHLLKSNKEMLEYLVNIDALTGLHNRRFLEDQLVIQWKLASRHINTITIFMIDIDYFKAFNDTYGHTAGDEALRTIAKTLNNCFMRGSDFVARYGGEEFTILAIGMTELQAHEYSTKLVQKIENLNIHHKGSPLGHLTISLGYSQANPQYHNDQNLVIEQADRALYSAKVEGKNRAVAYREQ",
83
+ "MVEQYLLEAIVRDARDGITISDCSRPDNPLVFVNDAFTRMTGYDAEEVIGKNCRFLQRGDINLSAVHTIKIAMLTHEPCLVTLKNYRKDGTIFWNELSLTPIINKNGLITHYLGIQKDVSAQVILNQTLHEENHLLKSNKEMLEYLVNIDALTGLHNRRFLEDQLVIQWKLASRHINTITIFMIDIDYFKAFNDTYGHTAGDEALRTIAKTLNNCFMWGSDFVARYGGEEFTILAIGMTELQAHEYSTKLVQKIENLNIHHKGSPLGHLTISLGYSQANPQYHNDQNLVIEQADRALYSAKVEGKNRAVAYREQ",
84
+ "facebook/esm2_t33_650M_UR50D",
85
+ ],
86
+ ],
87
+ inputs=[seq,
88
+ trg,
89
+ model_name],
90
+ outputs=[out,
91
+ bto],
92
+ fn=app
93
+ )
94
 
95
+ # Launch the Gradio interface
96
  if __name__ == "__main__":
97
+ esm_scan.launch()
data.py CHANGED
@@ -1,80 +1,209 @@
 
 
 
 
1
  from model import Model
 
 
 
2
  import pandas as pd
3
- from re import match
 
 
4
 
5
  class Data:
6
  """Container for input and output data"""
7
- # initialise empty model as static class member for efficiency
8
  model = Model()
9
 
10
- def parse_seq(self, src:str):
11
- "parse input sequence"
12
- self.seq = src.strip().upper()
13
- if not all(x in self.model.alphabet for x in src):
14
  raise RuntimeError("Unrecognised characters in sequence")
15
 
16
- def parse_sub(self, trg:str):
17
- "parse input substitutions"
18
  self.mode = None
19
  self.sub = list()
20
  self.trg = trg.strip().upper()
 
21
 
22
- # identify running mode
23
- if len(self.trg.split()) == 1 and len(self.trg.split()[0]) == len(self.seq): # if single string of same length as sequence, seq vs seq mode
24
- self.mode = 'SVS'
25
- for resi,(src,trg) in enumerate(zip(self.seq, self.trg), 1):
 
26
  if src != trg:
27
  self.sub.append(f"{src}{resi}{trg}")
 
28
  else:
29
  self.trg = self.trg.split()
30
- if all(match(r'\d+', x) for x in self.trg): # if all strings are numbers, deep mutational scanning mode
 
31
  self.mode = 'DMS'
32
  for resi in map(int, self.trg):
33
  src = self.seq[resi-1]
34
- for trg in "ACDEFGHIKLMNPQRSTVWY".replace(src,''):
35
  self.sub.append(f"{src}{resi}{trg}")
36
- elif all(match(r'[A-Z]\d+[A-Z]', x) for x in self.trg): # if all strings are of the form X#Y, single substitution mode
 
 
37
  self.mode = 'MUT'
38
  self.sub = self.trg
 
 
 
 
39
  else:
40
- raise RuntimeError("Unrecognised running mode; wrong inputs?")
41
-
 
 
 
 
42
  self.sub = pd.DataFrame(self.sub, columns=['0'])
43
 
44
- def __init__(self, src:str, trg:str, model_name:str, scoring_strategy:str, out_file):
45
  "initialise data"
46
  # if model has changed, load new model
47
  if self.model.model_name != model_name:
48
  self.model_name = model_name
49
  self.model = Model(model_name)
50
  self.parse_seq(src)
 
51
  self.parse_sub(trg)
52
  self.scoring_strategy = scoring_strategy
 
53
  self.out = pd.DataFrame(self.sub, columns=['0', self.model_name])
54
- self.out_buffer = out_file.name
 
55
 
56
- def parse_output(self) -> str:
57
  "format output data for visualisation"
58
- if self.mode == 'MUT': # if single substitution mode, sort by score
59
- self.out = self.out.sort_values(self.model_name, ascending=False)
60
- elif self.mode == 'DMS': # if deep mutational scanning mode, sort by residue and score
61
- self.out = pd.concat([(self.out.assign(resi=self.out['0'].str.extract(r'(\d+)', expand=False).astype(int)) # FIX: this doesn't work if there's jolly characters in the input sequence
62
- .sort_values(['resi', self.model_name], ascending=[True,False])
63
- .groupby(['resi'])
64
- .head(19)
65
- .drop(['resi'], axis=1)).iloc[19*x:19*(x+1)]
66
- .reset_index(drop=True) for x in range(self.out.shape[0]//19)]
67
- , axis=1).set_axis(range(self.out.shape[0]//19*2), axis='columns')
68
- # save to temporary file to be downloaded
69
- self.out.round(2).to_csv(self.out_buffer, index=False, header=False)
70
- return (self.out.style
71
- .format(lambda x: f'{x:.2f}' if isinstance(x, float) else x)
72
- .hide(axis=0)
73
- .hide(axis=1)
74
- .background_gradient(cmap="RdYlGn", vmax=8, vmin=-8)
75
- .to_html(justify='center'))
76
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  def calculate(self):
78
  "run model and parse output"
79
  self.model.run_model(self)
80
- return self.parse_output()
 
 
 
 
 
 
 
 
 
 
1
+ from math import ceil
2
+ from re import match
3
+ import seaborn as sns
4
+
5
  from model import Model
6
+
7
+
8
+ import matplotlib.pyplot as plt
9
  import pandas as pd
10
+ import seaborn as sns
11
+
12
+ from model import Model
13
 
14
  class Data:
15
  """Container for input and output data"""
16
+ # Initialise empty model as static class member for efficiency
17
  model = Model()
18
 
19
+ def parse_seq(self, src: str):
20
+ """Parse input sequence"""
21
+ self.seq = src.strip().upper().replace('\n', '')
22
+ if not all(x in self.model.alphabet for x in self.seq):
23
  raise RuntimeError("Unrecognised characters in sequence")
24
 
25
+ def parse_sub(self, trg: str):
26
+ """Parse input substitutions"""
27
  self.mode = None
28
  self.sub = list()
29
  self.trg = trg.strip().upper()
30
+ self.resi = list()
31
 
32
+ # Identify running mode
33
+ if len(self.trg.split()) == 1 and len(self.trg.split()[0]) == len(self.seq) and all(match(r'\w+', x) for x in self.trg):
34
+ # If single string of same length as sequence, seq vs seq mode
35
+ self.mode = 'MUT'
36
+ for resi, (src, trg) in enumerate(zip(self.seq, self.trg), 1):
37
  if src != trg:
38
  self.sub.append(f"{src}{resi}{trg}")
39
+ self.resi.append(resi)
40
  else:
41
  self.trg = self.trg.split()
42
+ if all(match(r'\d+', x) for x in self.trg):
43
+ # If all strings are numbers, deep mutational scanning mode
44
  self.mode = 'DMS'
45
  for resi in map(int, self.trg):
46
  src = self.seq[resi-1]
47
+ for trg in "ACDEFGHIKLMNPQRSTVWY".replace(src, ''):
48
  self.sub.append(f"{src}{resi}{trg}")
49
+ self.resi.append(resi)
50
+ elif all(match(r'[A-Z]\d+[A-Z]', x) for x in self.trg):
51
+ # If all strings are of the form X#Y, single substitution mode
52
  self.mode = 'MUT'
53
  self.sub = self.trg
54
+ self.resi = [int(x[1:-1]) for x in self.trg]
55
+ for s, *resi, _ in self.trg:
56
+ if self.seq[int(''.join(resi))-1] != s:
57
+ raise RuntimeError(f"Unrecognised input substitution {self.seq[int(''.join(resi))]}{int(''.join(resi))} /= {s}{int(''.join(resi))}")
58
  else:
59
+ self.mode = 'TMS'
60
+ for resi, src in enumerate(self.seq, 1):
61
+ for trg in "ACDEFGHIKLMNPQRSTVWY".replace(src, ''):
62
+ self.sub.append(f"{src}{resi}{trg}")
63
+ self.resi.append(resi)
64
+
65
  self.sub = pd.DataFrame(self.sub, columns=['0'])
66
 
67
+ def __init__(self, src:str, trg:str, model_name:str='facebook/esm2_t33_650M_UR50D', scoring_strategy:str='masked-marginals', out_file=None):
68
  "initialise data"
69
  # if model has changed, load new model
70
  if self.model.model_name != model_name:
71
  self.model_name = model_name
72
  self.model = Model(model_name)
73
  self.parse_seq(src)
74
+ self.offset = 0
75
  self.parse_sub(trg)
76
  self.scoring_strategy = scoring_strategy
77
+ self.token_probs = None
78
  self.out = pd.DataFrame(self.sub, columns=['0', self.model_name])
79
+ self.out_str = None
80
+ self.out_buffer = out_file.name if 'name' in dir(out_file) else out_file
81
 
82
+ def parse_output(self) -> None:
83
  "format output data for visualisation"
84
+ if self.mode == 'TMS':
85
+ self.process_tms_mode()
86
+ else:
87
+ if self.mode == 'DMS':
88
+ self.sort_by_residue_and_score()
89
+ elif self.mode == 'MUT':
90
+ self.sort_by_score()
91
+ else:
92
+ raise RuntimeError(f"Unrecognised mode {self.mode}")
93
+ if self.out_buffer:
94
+ self.out.round(2).to_csv(self.out_buffer, index=False, header=False)
95
+ self.out_str = (self.out.style
96
+ .format(lambda x: f'{x:.2f}' if isinstance(x, float) else x)
97
+ .hide(axis=0)
98
+ .hide(axis=1)
99
+ .background_gradient(cmap="RdYlGn", vmax=8, vmin=-8)
100
+ .to_html(justify='center'))
101
+
102
+ def sort_by_score(self):
103
+ self.out = self.out.sort_values(self.model_name, ascending=False)
104
+
105
+ def sort_by_residue_and_score(self):
106
+ self.out = (self.out.assign(resi=self.out['0'].str.extract(r'(\d+)', expand=False).astype(int))
107
+ .sort_values(['resi', self.model_name], ascending=[True,False])
108
+ .groupby(['resi'])
109
+ .head(19)
110
+ .drop(['resi'], axis=1))
111
+ self.out = pd.concat([self.out.iloc[19*x:19*(x+1)].reset_index(drop=True) for x in range(self.out.shape[0]//19)]
112
+ , axis=1).set_axis(range(self.out.shape[0]//19*2), axis='columns')
113
+
114
+ def process_tms_mode(self):
115
+ self.out = self.assign_resi_and_group()
116
+ self.out = self.concat_and_set_axis()
117
+ self.out /= self.out.abs().max().max()
118
+ divs = self.calculate_divs()
119
+ ncols = min(divs, key=lambda x: abs(x-60))
120
+ nrows = ceil(self.out.shape[1]/ncols)
121
+ ncols = self.adjust_ncols(ncols, nrows)
122
+ self.plot_heatmap(ncols, nrows)
123
+
124
+ def assign_resi_and_group(self):
125
+ return (self.out.assign(resi=self.out['0'].str.extract(r'(\d+)', expand=False).astype(int))
126
+ .groupby(['resi'])
127
+ .head(19))
128
+
129
+ def concat_and_set_axis(self):
130
+ return (pd.concat([(self.out.iloc[19*x:19*(x+1)]
131
+ .pipe(self.create_dataframe)
132
+ .sort_values(['0'], ascending=[True])
133
+ .drop(['resi', '0'], axis=1)
134
+ .set_axis(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
135
+ 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'])
136
+ .astype(float)
137
+ ) for x in range(self.out.shape[0]//19)]
138
+ , axis=1)
139
+ .set_axis([f'{a}{i}' for i, a in enumerate(self.seq, 1)], axis='columns'))
140
+
141
+ def create_dataframe(self, df):
142
+ return pd.concat([pd.Series([df.iloc[0, 0][:-1]+df.iloc[0, 0][0], 0, 0], index=df.columns).to_frame().T, df], axis=0, ignore_index=True)
143
+
144
+ def calculate_divs(self):
145
+ return [x for x in range(1, self.out.shape[1]+1) if self.out.shape[1] % x == 0 and 30 <= x and x <= 60] or [60]
146
+
147
+ def adjust_ncols(self, ncols, nrows):
148
+ while self.out.shape[1]/ncols < nrows and ncols > 45 and ncols*nrows >= self.out.shape[1]:
149
+ ncols -= 1
150
+ return ncols + 1
151
+
152
+ def plot_heatmap(self, ncols, nrows):
153
+ if nrows < 2:
154
+ self.plot_single_heatmap()
155
+ else:
156
+ self.plot_multiple_heatmaps(ncols, nrows)
157
+
158
+ if self.out_buffer:
159
+ plt.savefig(self.out_buffer, format='svg')
160
+ with open(self.out_buffer, 'r', encoding='utf-8') as f:
161
+ self.out_str = f.read()
162
+
163
+ def plot_single_heatmap(self):
164
+ fig = plt.figure(figsize=(12, 6))
165
+ sns.heatmap(self.out
166
+ , cmap='RdBu'
167
+ , cbar=False
168
+ , square=True
169
+ , xticklabels=1
170
+ , yticklabels=1
171
+ , center=0
172
+ , annot=self.out.map(lambda x: ' ' if x != 0 else 'Β·')
173
+ , fmt='s'
174
+ , annot_kws={'size': 'xx-large'})
175
+ fig.tight_layout()
176
+
177
+ def plot_multiple_heatmaps(self, ncols, nrows):
178
+ fig, ax = plt.subplots(nrows=nrows, figsize=(12, 6*nrows))
179
+ for i in range(nrows):
180
+ tmp = self.out.iloc[:,i*ncols:(i+1)*ncols]
181
+ label = tmp.map(lambda x: ' ' if x != 0 else 'Β·')
182
+ sns.heatmap(tmp
183
+ , ax=ax[i]
184
+ , cmap='RdBu'
185
+ , cbar=False
186
+ , square=True
187
+ , xticklabels=1
188
+ , yticklabels=1
189
+ , center=0
190
+ , annot=label
191
+ , fmt='s'
192
+ , annot_kws={'size': 'xx-large'})
193
+ ax[i].set_yticklabels(ax[i].get_yticklabels(), rotation=0)
194
+ ax[i].set_xticklabels(ax[i].get_xticklabels(), rotation=90)
195
+ fig.tight_layout()
196
+
197
  def calculate(self):
198
  "run model and parse output"
199
  self.model.run_model(self)
200
+ self.parse_output()
201
+ return self
202
+
203
+ def __str__(self):
204
+ "return output data in DataFrame format"
205
+ return str(self.out)
206
+
207
+ def __repr__(self):
208
+ "return output data in html format"
209
+ return self.out_str
instructions.md CHANGED
@@ -1,13 +1,39 @@
1
- # **ESM zero-shot variant prediction**
2
- this was inspired from this [paper](https://doi.org/10.1101/2021.07.09.450648) and adaptated from this [repo](https://github.com/facebookresearch/esm/tree/main/esm)
3
-
4
- #### **Instructions**
5
- - in the 'sequence' text box the protein full amino acid sequence that is to be analysed must be given, jolly charachters (e.g. -X.B) are supported (but at the moment the visualisation does not show the correct results)
6
- - there's three running modes that can be chosen, depending on the input in the 'substitution' box:
7
- - if another sequence is given, the positions that are different between the two will be evaluated (NB the sequences must be of the same length) and their score returned
8
- - if a list of integers is given, a deep mutational scan will be performed at those positions in the input sequence and the scores for the amino acids, different from the original one, will be returned
9
- - if a single substitution or a list thereof is given (in the form of **B008S**), the single substitution score is returned
10
- - you can choose which ESM model to use for the calculations, these models are the ones that are available at runtime on Hugging Face Model Hub
11
- - there's 2 scoring strategies available: wt-marginals and masked marginals; the first one is faster, but less accurate, the second one considers the sequence context more thoroughly, but is sensibly slower (the run time scales linearly with sequence length)
12
- - the results will be shown in a table, with color coding and sorted by fitness (if performing a deep mutational scan)
13
- - the output data is available for download from the box at the bottom as a CSV file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # **ESM-Scan**
2
+ Calculate the <u>fitness of single amino acid substitutions</u> on proteins, using a [zero-shot](https://doi.org/10.1101/2021.07.09.450648) [language model predictor](https://github.com/facebookresearch/esm)
3
+
4
+ <details>
5
+ <summary> <b> USAGE INSTRUCTIONS </b> </summary>
6
+
7
+ ### **Setup**
8
+ No setup is required, just fill the input boxes with the required data and click on the `Run` button.
9
+ A list of examples can be found at the bottom of the page, click on them to autofill the fields.
10
+ If the server is not used for some time, it will go into standby.
11
+ Running a calculation resumes the tool from standby, the first run might take longer due to startup and model loading.
12
+
13
+ ### **Input**
14
+ - write the protein full amino acid sequence to be analysed in the **Sequence** text box
15
+ wildcard characters (e.g. `-X.B`) can be inserted but, at the moment, visualisation cannot handle them
16
+ - write the substitutions to test in the **Substitutions** box
17
+ there are three running modes that can be used, depending on the input:
18
+ + *single substitution* or list thereof (in the form of `R218K R218W`): the single substitution is scored
19
+ + *residue position* or list thereof: all possible substitutions will be evaluated
20
+ + *same-length sequence*: the differing amino acid substitutions will be evaluated, one by one
21
+ + any other *different input*: a deep mutational scan of the full sequence will be performed
22
+ - the ESM model to use for the calculations can be chosen among those that are available on Hugging Face Model Hub;
23
+ `esm2_t33_650M_UR50D` offers the best expense-accuracy tradeoff[*](https://doi.org/10.1126/science.ade2574)
24
+ - the `masked-marginals` scoring strategy considers sequence context at inference time, being slower but more accurate;
25
+ in case of long runtimes, you can tick the box off to speed the calculations up significantly, sacrificing accuracy
26
+ - when running a deep mutational scan, it is recommended to use smaller models (8M, 35M, or 150M parameters), since the runtime is significant — especially for longer sequences — and the server might be overloaded;
27
+ over 30 min might be necessary for calculating a 300-residue-long sequence with larger models
28
+ in general, accuracy is influenced significantly by the scoring strategy and less so by the model size, so it is suggested to reduce the latter first when optimising for runtime;
29
+ the scoring strategy's computational cost scales with the number of substitutions tested, while the model's scales with the wild-type sequence length
30
+ - it is possible to calculate the effect of multiple concurrent substitutions, but this has to be done manually, by changing the input sequence and running the calculation again
31
+
32
+ ### **Output**
33
+ Your results will be shown in a color-coded table, except for the deep mutational scan which will yield a heatmap.
34
+ The output data can be downloaded from the box at the bottom.
35
+ File extensions are not supported by the server and need to be appended to the filenames after downloading:
36
+ - `CSV` for tables
37
+ - `SVG` for full-sequence deep mutational scan
38
+
39
+ </details>
model.py CHANGED
@@ -1,72 +1,99 @@
1
  from huggingface_hub import HfApi, ModelFilter
2
  import torch
3
  from transformers import AutoTokenizer, AutoModelForMaskedLM
 
 
4
 
5
- # fetch suitable ESM models from HuggingFace Hub
6
- MODELS = [m.modelId for m in HfApi().list_models(filter=ModelFilter(author="facebook", model_name="esm", task="fill-mask"), sort="lastModified", direction=-1)]
7
- if not any(MODELS):
8
- raise RuntimeError("Error while retrieving models from HuggingFace Hub")
 
 
 
 
 
 
 
 
 
 
 
 
9
 
 
10
  class Model:
11
- """Wrapper for ESM models"""
12
- def __init__(self, model_name:str=""):
13
- "load selected model and tokenizer"
14
  self.model_name = model_name
15
  if model_name:
16
  self.model = AutoModelForMaskedLM.from_pretrained(model_name)
17
  self.batch_converter = AutoTokenizer.from_pretrained(model_name)
18
  self.alphabet = self.batch_converter.get_vocab()
 
19
  if torch.cuda.is_available():
20
  self.model = self.model.cuda()
21
 
22
- def __rshift__(self, batch_tokens:torch.Tensor) -> torch.Tensor:
23
- "run model on batch of tokens"
24
- return self.model(batch_tokens)["logits"]
25
-
26
- def __lshift__(self, input:str) -> torch.Tensor:
27
- "convert input string to batch of tokens"
28
- return self.batch_converter(input, return_tensors="pt")["input_ids"]
29
 
30
- def __getitem__(self, key:str) -> int:
31
- "get token ID from character"
32
  return self.alphabet[key]
33
-
34
  def run_model(self, data):
35
- "run model on data"
36
  def label_row(row, token_probs):
37
- "label row with score"
 
38
  wt, idx, mt = row[0], int(row[1:-1])-1, row[-1]
 
39
  score = token_probs[0, 1+idx, self[mt]] - token_probs[0, 1+idx, self[wt]]
40
  return score.item()
41
-
42
- batch_tokens = self<<data.seq
43
 
44
- # run model with selected scoring strategy (info thereof available in the original ESM paper)
45
- if data.scoring_strategy.startswith("wt-marginals"):
46
- with torch.no_grad():
47
- token_probs = torch.log_softmax(self>>batch_tokens, dim=-1)
48
- data.out[self.model_name] = data.sub.apply(
49
- lambda row: label_row(
50
- row['0'],
51
- token_probs,
52
- ),
53
- axis=1,
54
- )
55
- elif data.scoring_strategy.startswith("masked-marginals"):
56
  all_token_probs = []
 
57
  for i in range(batch_tokens.size()[1]):
58
- batch_tokens_masked = batch_tokens.clone()
59
- batch_tokens_masked[0, i] = self['<mask>']
60
- with torch.no_grad():
61
- token_probs = torch.log_softmax(
62
- self>>batch_tokens_masked, dim=-1
63
- )
64
- all_token_probs.append(token_probs[:, i])
 
 
 
 
 
 
 
 
 
65
  token_probs = torch.cat(all_token_probs, dim=0).unsqueeze(0)
66
- data.out[self.model_name] = data.sub.apply(
67
- lambda row: label_row(
68
- row['0'],
69
- token_probs,
70
- ),
71
- axis=1,
72
- )
 
 
 
1
  from huggingface_hub import HfApi, ModelFilter
2
  import torch
3
  from transformers import AutoTokenizer, AutoModelForMaskedLM
4
+ from transformers.tokenization_utils_base import BatchEncoding
5
+ from transformers.modeling_outputs import MaskedLMOutput
6
 
7
+ # Function to fetch suitable ESM models from HuggingFace Hub
8
+ def get_models() -> list[None|str]:
9
+ """Fetch suitable ESM models from HuggingFace Hub."""
10
+ if not any(
11
+ out := [
12
+ m.modelId for m in HfApi().list_models(
13
+ filter=ModelFilter(
14
+ author="facebook", model_name="esm", task="fill-mask"
15
+ ),
16
+ sort="lastModified",
17
+ direction=-1
18
+ )
19
+ ]
20
+ ):
21
+ raise RuntimeError("Error while retrieving models from HuggingFace Hub")
22
+ return out
23
 
24
+ # Class to wrap ESM models
25
  class Model:
26
+ """Wrapper for ESM models."""
27
+ def __init__(self, model_name: str = ""):
28
+ """Load selected model and tokenizer."""
29
  self.model_name = model_name
30
  if model_name:
31
  self.model = AutoModelForMaskedLM.from_pretrained(model_name)
32
  self.batch_converter = AutoTokenizer.from_pretrained(model_name)
33
  self.alphabet = self.batch_converter.get_vocab()
34
+ # Check if CUDA is available and if so, use it
35
  if torch.cuda.is_available():
36
  self.model = self.model.cuda()
37
 
38
+ def tokenise(self, input: str) -> BatchEncoding:
39
+ """Convert input string to batch of tokens."""
40
+ return self.batch_converter(input, return_tensors="pt")
41
+
42
+ def __call__(self, batch_tokens: torch.Tensor, **kwargs) -> MaskedLMOutput:
43
+ """Run model on batch of tokens."""
44
+ return self.model(batch_tokens, **kwargs)
45
 
46
+ def __getitem__(self, key: str) -> int:
47
+ """Get token ID from character."""
48
  return self.alphabet[key]
49
+
50
  def run_model(self, data):
51
+ """Run model on data."""
52
  def label_row(row, token_probs):
53
+ """Label row with score."""
54
+ # Extract wild type, index and mutant type from the row
55
  wt, idx, mt = row[0], int(row[1:-1])-1, row[-1]
56
+ # Calculate the score as the difference between the token probabilities of the mutant type and the wild type
57
  score = token_probs[0, 1+idx, self[mt]] - token_probs[0, 1+idx, self[wt]]
58
  return score.item()
 
 
59
 
60
+ # Tokenise the sequence data
61
+ batch_tokens = self.tokenise(data.seq).input_ids
62
+
63
+ # Calculate the token probabilities without updating the model parameters
64
+ with torch.no_grad():
65
+ token_probs = torch.log_softmax(self(batch_tokens).logits, dim=-1)
66
+ # Store the token probabilities in the data
67
+ data.token_probs = token_probs.cpu().numpy()
68
+
69
+ # If the scoring strategy starts with "masked-marginals"
70
+ if data.scoring_strategy.startswith("masked-marginals"):
 
71
  all_token_probs = []
72
+ # For each token in the batch
73
  for i in range(batch_tokens.size()[1]):
74
+ # If the token is in the list of residues
75
+ if i in data.resi:
76
+ # Clone the batch tokens and mask the current token
77
+ batch_tokens_masked = batch_tokens.clone()
78
+ batch_tokens_masked[0, i] = self['<mask>']
79
+ # Calculate the masked token probabilities
80
+ with torch.no_grad():
81
+ masked_token_probs = torch.log_softmax(
82
+ self(batch_tokens_masked).logits, dim=-1
83
+ )
84
+ else:
85
+ # If the token is not in the list of residues, use the original token probabilities
86
+ masked_token_probs = token_probs
87
+ # Append the token probabilities to the list
88
+ all_token_probs.append(masked_token_probs[:, i])
89
+ # Concatenate all token probabilities
90
  token_probs = torch.cat(all_token_probs, dim=0).unsqueeze(0)
91
+
92
+ # Apply the label_row function to each row of the substitutions dataframe
93
+ data.out[self.model_name] = data.sub.apply(
94
+ lambda row: label_row(
95
+ row['0'],
96
+ token_probs,
97
+ ),
98
+ axis=1,
99
+ )
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
  gradio
2
- huggingface_hub
3
  pandas
 
4
  torch
5
  transformers
 
1
  gradio
 
2
  pandas
3
+ seaborn
4
  torch
5
  transformers