Spaces:

thaidaev
/

zsp

Sleeping

App Files Files

MassimoGregorioTotaro commited on Jun 20, 2023

Commit

2dd6312

•

1 Parent(s): 5543c12

general reorganisation

Browse files

Files changed (4) hide show

.gitignore +1 -0
app.py +162 -202
instructions.md +13 -0
requirements.txt +0 -1

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ */

app.py CHANGED Viewed

@@ -1,218 +1,178 @@
 from huggingface_hub import HfApi, ModelFilter
-from transformers import AutoTokenizer, AutoModelForMaskedLM
 import pandas as pd
-import re
-from tqdm import tqdm
 import torch
-import gradio as gr
-import warnings
-warnings.filterwarnings('ignore')
-MODEL, MODEL_NAME, BATCH_CONVERTER, ALPHABET = None, None, None, None
-OFFSET = 1
 MODELS = [m.modelId for m in HfApi().list_models(filter=ModelFilter(author="facebook", model_name="esm", task="fill-mask"), sort="lastModified", direction=-1)]
-SCORING = ["masked-marginals (more accurate)", "wt-marginals (faster)"]
-def label_row(row, sequence, token_probs):
-    wt, idx, mt = row[0], int(row[1:-1]) - OFFSET, row[-1]
-    assert sequence[idx] == wt, "The listed wildtype does not match the provided sequence"
-    wt_encoded, mt_encoded = ALPHABET[wt], ALPHABET[mt]
-    score = token_probs[0, 1 + idx, mt_encoded] - token_probs[0, 1 + idx, wt_encoded]
-    return score.item()
-def initialise_model(model_name):
-    global MODEL, MODEL_NAME, BATCH_CONVERTER, ALPHABET
-    MODEL_NAME = model_name
-    MODEL = AutoModelForMaskedLM.from_pretrained(model_name)
-    BATCH_CONVERTER = AutoTokenizer.from_pretrained(model_name)
-    ALPHABET = BATCH_CONVERTER.get_vocab()
-    if torch.cuda.is_available():
-        MODEL = MODEL.cuda()
-def parse_input(seq, sub):
-    assert seq.isalpha(), "Sequence must be alphabetic"
-    substitutions, mode = list(), None
-    if len(sub.split()) == 1 and len(sub.split()[0]) == len(seq):
-        mode = 'seq vs seq'
-        for resi,(src,trg) in enumerate(zip(seq,sub), OFFSET):
-            if src != trg:
-                substitutions.append(f"{src}{resi}{trg}")
-    elif len(targets := sub.split()) > 1:
-        if all(re.match(r'\d+', x) for x in targets):
-            mode = 'deep mutational scan'
-            for resi in map(int, sub.split()):
-                src = seq[resi-OFFSET]
-                for trg in "ACDEFGHIKLMNPQRSTVWY".replace(src,''):
-                    substitutions.append(f"{src}{resi}{trg}")
-        elif all(re.match(r'[A-Z]\d+[A-Z]', x) for x in targets):
-            mode = 'aa substitutions'
-            substitutions = targets
-    if not mode:
-        raise RuntimeError("Unrecognised running mode")
-    return mode, pd.DataFrame(substitutions, columns=['0'])
-def run_model(sequence, substitutions, batch_tokens, scoring_strategy):
-    if scoring_strategy.startswith("wt-marginals"):
-        with torch.no_grad():
-            token_probs = torch.log_softmax(MODEL(batch_tokens)["logits"], dim=-1)
-        substitutions[MODEL_NAME] = substitutions.apply(
-            lambda row: label_row(
-                row['0'],
-                sequence,
-                token_probs,
-            ),
-            axis=1,
-        )
-    elif scoring_strategy.startswith("masked-marginals"):
-        all_token_probs = []
-        for i in tqdm(range(batch_tokens.size()[1])):
-            batch_tokens_masked = batch_tokens.clone()
-            batch_tokens_masked[0, i] = ALPHABET['<mask>']
             with torch.no_grad():
-                token_probs = torch.log_softmax(
-                    MODEL(batch_tokens_masked)["logits"], dim=-1
-                )
-            all_token_probs.append(token_probs[:, i])
-        token_probs = torch.cat(all_token_probs, dim=0).unsqueeze(0)
-        substitutions[MODEL_NAME] = substitutions.apply(
-            lambda row: label_row(
-                row['0'],
-                sequence,
-                token_probs,
-            ),
-            axis=1,
-        )
-    return substitutions
-def parse_output(output, mode):
-    if mode == 'aa substitutions':
-        output = output.sort_values(MODEL_NAME, ascending=False)
-    elif mode == 'deep mutational scan':
-        output = pd.concat([(output.assign(resi=output['0'].str.extract(r'(\d+)', expand=False).astype(int))
-                .sort_values(['resi', MODEL_NAME], ascending=[True,False])
-                .groupby(['resi'])
-                .head(19)
-                .drop(['resi'], axis=1)).iloc[19*x:19*(x+1)].reset_index(drop=True) for x in range(output.shape[0]//19)]
-                , axis=1).set_axis(range(output.shape[0]//19*2), axis='columns')
-    return output.style.format(lambda x: f'{x:.2f}' if isinstance(x, float) else x).hide_index().hide_columns().background_gradient(cmap="RdYlGn", vmax=8, vmin=-8).to_html()
-# mode = 'deep mutational scan'  #@param ['seq vs seq', 'deep mutational scan', 'aa substitutions']
-# sequence = "MVEQYLLEAIVRDARDGITISDCSRPDNPLVFVNDAFTRMTGYDAEEVIGKNCRFLQRGDINLSAVHTIKIAMLTHEPCLVTLKNYRKDGTIFWNELSLTPIINKNGLITHYLGIQKDVSAQVILNQTLHEENHLLKSNKEMLEYLVNIDALTGLHNRRFLEDQLVIQWKLASRHINTITIFMIDIDYFKAFNDTYGHTAGDEALRTIAKTLNNCFMRGSDFVARYGGEEFTILAIGMTELQAHEYSTKLVQKIENLNIHHKGSPLGHLTISLGYSQANPQYHNDQNLVIEQADRALYSAKVEGKNRAVAYREQ" #@param {type:"string"}
-# target = "61 214 19 30 122 140" #@param {type:"string"}
-# substitutions = list()
-# scoring_strategy = "masked-marginals"
-# if mode == 'seq vs seq':
-#   for resi,(seq,trg) in enumerate(zip(sequence,target), OFFSET):
-#     if seq != trg:
-#       substitutions.append(f"{seq}{resi}{trg}")
-# elif mode == 'deep mutational scan':
-#   for resi in map(int, target.split()):
-#     seq = sequence[resi-OFFSET]
-#     for trg in "ACDEFGHIKLMNPQRSTVWY".replace(seq,''):
-#       substitutions.append(f"{seq}{resi}{trg}")
-# elif mode == 'aa substitutions':
-#   substitutions = target.split()
-# else:
-#   raise RuntimeError("Unrecognised running mode")
-# df = pd.DataFrame(substitutions, columns=['0'])
-# mutation_col = df.columns[0]
-# batch_tokens = batch_converter(sequence, return_tensors='pt')['input_ids']
-# if scoring_strategy == "wt-marginals":
-#     with torch.no_grad():
-#         token_probs = torch.log_softmax(model(batch_tokens)["logits"], dim=-1)
-#     df[model_name] = df.apply(
-#         lambda row: label_row(
-#             row[mutation_col],
-#             sequence,
-#             token_probs,
-#             alphabet,
-#             OFFSET,
-#         ),
-#         axis=1,
-#     )
-# elif scoring_strategy == "masked-marginals":
-#     all_token_probs = []
-#     for i in tqdm(range(batch_tokens.size()[1])):
-#         batch_tokens_masked = batch_tokens.clone()
-#         batch_tokens_masked[0, i] = alphabet['<mask>']
-#         with torch.no_grad():
-#             token_probs = torch.log_softmax(
-#                 model(batch_tokens_masked)["logits"], dim=-1
-#             )
-#         all_token_probs.append(token_probs[:, i])  # vocab size
-#     token_probs = torch.cat(all_token_probs, dim=0).unsqueeze(0)
-#     df[model_name] = df.apply(
-#         lambda row: label_row(
-#             row[mutation_col],
-#             sequence,
-#             token_probs,
-#             alphabet,
-#             OFFSET,
-#         ),
-#         axis=1,
-#     )
-# if mode == 'aa substitutions':
-#   df = df.sort_values(model_name, ascending=False)
-# elif mode == 'deep mutational scan':
-#   df = pd.concat([(df.assign(resi=df['0'].str.extract(f'(\d+)', expand=False).astype(int))
-#           .sort_values(['resi', model_name], ascending=[True,False])
-#           .groupby(['resi'])
-#           .head(19)
-#           .drop(['resi'], axis=1)).iloc[19*x:19*(x+1)].reset_index(drop=True) for x in range(df.shape[0]//19)]
-#         , axis=1).set_axis(range(df.shape[0]//19*2), axis='columns')
-# df.style.hide_index().hide_columns().background_gradient(cmap="RdYlGn", vmax=8, vmin=-8)
 def app(*argv):
-    seq, trg, model_name, scoring_strategy, *_ = argv
-    mode, substitutions = parse_input(seq, trg)
-    if model_name != MODEL_NAME:
-        initialise_model(model_name)
-    batch_tokens = BATCH_CONVERTER(seq, return_tensors='pt')['input_ids']
-    df = run_model(seq, substitutions, batch_tokens, scoring_strategy)
-    return parse_output(df, mode)
-# demo = gr.Interface(
-#     theme=gr.themes.Base(),
-#     title="Protein Sequence Mutagenesis",
-#     description="Predict the effect of mutations on protein stability",
-#     fn=app,
-#     inputs=[gr.Textbox(lines=2, label="Sequence", placeholder="Sequence here...", required=True, value='MVEQYLLEAIVRDARDGITISDCSRPDNPLVFVNDAFTRMTGYDAEEVIGKNCRFLQRGDINLSAVHTIKIAMLTHEPCLVTLKNYRKDGTIFWNELSLTPIINKNGLITHYLGIQKDVSAQVILNQTLHEENHLLKSNKEMLEYLVNIDALTGLHNRRFLEDQLVIQWKLASRHINTITIFMIDIDYFKAFNDTYGHTAGDEALRTIAKTLNNCFMRGSDFVARYGGEEFTILAIGMTELQAHEYSTKLVQKIENLNIHHKGSPLGHLTISLGYSQANPQYHNDQNLVIEQADRALYSAKVEGKNRAVAYREQ'),
-#             gr.Textbox(lines=2, label="Substitutions", placeholder="Substitutions here...", required=True, value="61 214 19 30 122 140"),
-#             gr.Dropdown(MODELS, label="Model", value=MODELS[1]),
-#             gr.Dropdown(["masked-marginals (more accurate)", "wt-marginals (faster)"], label="Scoring strategy", value="wt-marginals (faster)"),
-#             ],
-#     outputs=gr.HTML(formatter="html", label="Output"),
-# )
-with gr.Blocks() as demo:
-    gr.Markdown("""Protein Sequence Mutagenesis""", name="title")
-    gr.Markdown("""Predict the effect of mutations on protein stability""", name="description")
-    seq = gr.Textbox(lines=2, label="Sequence", placeholder="Sequence here...", required=True, value='MVEQYLLEAIVRDARDGITISDCSRPDNPLVFVNDAFTRMTGYDAEEVIGKNCRFLQRGDINLSAVHTIKIAMLTHEPCLVTLKNYRKDGTIFWNELSLTPIINKNGLITHYLGIQKDVSAQVILNQTLHEENHLLKSNKEMLEYLVNIDALTGLHNRRFLEDQLVIQWKLASRHINTITIFMIDIDYFKAFNDTYGHTAGDEALRTIAKTLNNCFMRGSDFVARYGGEEFTILAIGMTELQAHEYSTKLVQKIENLNIHHKGSPLGHLTISLGYSQANPQYHNDQNLVIEQADRALYSAKVEGKNRAVAYREQ')
-    trg = gr.Textbox(lines=1, label="Substitutions", placeholder="Substitutions here...", required=True, value="61 214 19 30 122 140")
     model_name = gr.Dropdown(MODELS, label="Model", value=MODELS[1])
     scoring_strategy = gr.Dropdown(SCORING, label="Scoring strategy", value=SCORING[1])
-    btn = gr.Button(label="Submit", type="submit")
-    btn.click(fn=app, inputs=[seq, trg, model_name, scoring_strategy], outputs=[gr.HTML()])
-if __name__ == '__main__':
-    demo.launch()
-    # demo.launch(share=True, server_name="0.0.0.0", server_port=7878)

+import gradio as gr
 from huggingface_hub import HfApi, ModelFilter
 import pandas as pd
+from re import match
+from tempfile import NamedTemporaryFile
 import torch
+from transformers import AutoTokenizer, AutoModelForMaskedLM
# Fetch the ESM fill-mask checkpoints published by `facebook` on the Hugging Face
# Hub, most recently modified first. NOTE: this is a network call at import time.
MODELS = [m.modelId for m in HfApi().list_models(filter=ModelFilter(author="facebook", model_name="esm", task="fill-mask"), sort="lastModified", direction=-1)]
# The UI pre-selects MODELS[1] as the default model, so a single result is not
# enough: fail here with a clear message instead of an IndexError later on.
if len(MODELS) < 2:
    raise RuntimeError("Error while retrieving models from HuggingFace Hub")
# Scoring strategies offered in the UI; the scoring code dispatches on the prefix
# of these labels ("masked-marginals" / "wt-marginals").
SCORING = ["masked-marginals (more accurate)", "wt-marginals (faster)"]
class Model:
    """Wrapper for ESM models.

    Operator shortcuts used by the scoring code:

    * ``model << sequence`` tokenises a sequence string;
    * ``model >> tokens`` runs the network and returns its logits;
    * ``model[token]`` looks a token up in the vocabulary.
    """

    def __init__(self, model_name:str=""):
        """Load the selected model and tokenizer.

        With an empty ``model_name`` nothing is loaded: the instance is only a
        placeholder whose ``model_name`` never equals that of a real model.
        """
        self.model_name = model_name
        # Defined up-front so that a placeholder instance has every attribute.
        self.model = None
        self.batch_converter = None
        self.alphabet = {}
        if model_name:
            self.model = AutoModelForMaskedLM.from_pretrained(model_name)
            self.batch_converter = AutoTokenizer.from_pretrained(model_name)
            self.alphabet = self.batch_converter.get_vocab()
            if torch.cuda.is_available():
                self.model = self.model.cuda()

    def __rshift__(self, batch_tokens:torch.Tensor) -> torch.Tensor:
        """Run the model on a batch of token ids and return the logits."""
        # __init__ may move the model to the GPU, but nothing moves the tokens:
        # align the devices here so the forward pass does not fail on CUDA hosts.
        device = getattr(self.model, "device", None)
        if device is not None:
            batch_tokens = batch_tokens.to(device)
        return self.model(batch_tokens)["logits"]

    def __lshift__(self, input:str) -> torch.Tensor:
        """Convert an input string to a batch of token ids."""
        return self.batch_converter(input, return_tensors="pt")["input_ids"]

    def __getitem__(self, key:str) -> int:
        """Return the token id of a vocabulary entry."""
        return self.alphabet[key]

    def run_model(self, data):
        """Score every substitution in ``data.sub`` and store the result.

        Reads ``data.seq``, ``data.sub`` (DataFrame with a ``'0'`` column of
        ``<wt><position><mt>`` strings) and ``data.scoring_strategy``; writes the
        scores to ``data.out[self.model_name]``. A score is the log-probability
        of the mutant residue minus that of the wild-type residue at that site.

        Raises:
            RuntimeError: if a position lies outside the sequence or the scoring
                strategy is not recognised.
        """
        mutations = data.sub['0']
        # Positions are 1-based residue numbers. The scoring indexes token
        # `1 + (position - 1)`, i.e. it assumes exactly one special token
        # precedes the sequence, so token index == residue number.
        sites = sorted({int(mutation[1:-1]) for mutation in mutations})
        for site in sites:
            if not 1 <= site <= len(data.seq):
                raise RuntimeError(f"Position {site} is outside the sequence (1-{len(data.seq)})")

        batch_tokens = self << data.seq
        site_log_probs = {}
        # run model with selected scoring strategy (info thereof available in the original ESM paper)
        if data.scoring_strategy.startswith("wt-marginals"):
            # one forward pass on the unmodified sequence
            with torch.no_grad():
                log_probs = torch.log_softmax(self >> batch_tokens, dim=-1)
            for site in sites:
                site_log_probs[site] = log_probs[0, site]
        elif data.scoring_strategy.startswith("masked-marginals"):
            # One forward pass per *mutated* site with that site masked; sites
            # that no substitution refers to are never read, so they are skipped.
            mask_id = self['<mask>']
            for site in sites:
                masked_tokens = batch_tokens.clone()
                masked_tokens[0, site] = mask_id
                with torch.no_grad():
                    log_probs = torch.log_softmax(self >> masked_tokens, dim=-1)
                site_log_probs[site] = log_probs[0, site]
        else:
            raise RuntimeError(f"Unrecognised scoring strategy: {data.scoring_strategy}")

        def label_row(mutation):
            "score a single <wt><position><mt> substitution"
            wt, mt = mutation[0], mutation[-1]
            log_probs = site_log_probs[int(mutation[1:-1])]
            return (log_probs[self[mt]] - log_probs[self[wt]]).item()

        # Series.map keeps the index of data.sub and also copes with an empty table.
        data.out[self.model_name] = mutations.map(label_row)
class Data:
    """Container for the input and output data of one request.

    Attributes set during initialisation: ``seq`` (normalised sequence),
    ``trg`` (normalised substitutions input), ``mode`` (``'SVS'``, ``'DMS'`` or
    ``'MUT'``), ``sub`` (DataFrame with the substitutions in column ``'0'``),
    ``model_name``, ``scoring_strategy``, ``out`` (results table) and
    ``out_buffer`` (path the CSV is written to).
    """

    # Most recently loaded model, kept on the class so that consecutive requests
    # for the same model reuse it instead of loading it again.
    model = Model()

    def parse_seq(self, src:str):
        """Normalise and validate the input sequence.

        Whitespace (including line breaks from pasted text) is removed and the
        sequence is upper-cased before validation.

        Raises:
            RuntimeError: if the sequence is empty or contains characters that
                are not in the model vocabulary.
        """
        self.seq = "".join(src.split()).upper()
        if not self.seq:
            raise RuntimeError("Empty sequence")
        # validate the normalised sequence, not the raw input
        if not all(x in self.model.alphabet for x in self.seq):
            raise RuntimeError("Unrecognised characters in sequence")

    def _check_position(self, resi:int):
        "raise if a 1-based residue number does not fall inside the sequence"
        if not 1 <= resi <= len(self.seq):
            raise RuntimeError(f"Position {resi} is outside the sequence (1-{len(self.seq)})")

    def parse_sub(self, trg:str):
        """Identify the running mode and build the table of substitutions.

        * ``'SVS'``: a single string as long as the sequence; every differing
          position becomes a substitution.
        * ``'DMS'``: a list of integers; every listed position is substituted
          with each of the 20 standard amino acids except the original one.
        * ``'MUT'``: a list of substitutions of the form X#Y.

        Raises:
            RuntimeError: if the input matches no mode, a position is outside
                the sequence, or a residue is not in the model vocabulary.
        """
        self.mode = None
        self.sub = list()
        self.trg = trg.strip().upper()
        tokens = self.trg.split()
        if not tokens:
            # an empty list would otherwise be taken for an (empty) DMS request
            raise RuntimeError("Unrecognised running mode; wrong inputs?")
        # identify running mode
        if len(tokens) == 1 and len(tokens[0]) == len(self.seq):
            # single string of same length as sequence: seq vs seq mode
            self.mode = 'SVS'
            for resi, (src, new) in enumerate(zip(self.seq, tokens[0]), 1):
                if src != new:
                    self.sub.append(f"{src}{resi}{new}")
        else:
            self.trg = tokens
            # `$` anchors the patterns: `match` alone only checks a prefix, which
            # let inputs such as "12abc" through to a later int() failure.
            if all(match(r'\d+$', x) for x in tokens):
                # all strings are numbers: deep mutational scanning mode
                self.mode = 'DMS'
                # dict.fromkeys drops repeated positions while keeping their order
                for resi in dict.fromkeys(map(int, tokens)):
                    self._check_position(resi)
                    src = self.seq[resi-1]
                    for new in "ACDEFGHIKLMNPQRSTVWY".replace(src, ''):
                        self.sub.append(f"{src}{resi}{new}")
            elif all(match(r'[A-Z]\d+[A-Z]$', x) for x in tokens):
                # all strings are of the form X#Y: single substitution mode
                self.mode = 'MUT'
                for x in tokens:
                    self._check_position(int(x[1:-1]))
                self.sub = list(tokens)
            else:
                raise RuntimeError("Unrecognised running mode; wrong inputs?")
        # every residue letter is later looked up in the model vocabulary
        if not all(x[0] in self.model.alphabet and x[-1] in self.model.alphabet for x in self.sub):
            raise RuntimeError("Unrecognised characters in substitutions")
        self.sub = pd.DataFrame(self.sub, columns=['0'])

    def __init__(self, src:str, trg:str, model_name:str, scoring_strategy:str, out_file):
        """Load the model if needed and parse the inputs.

        ``out_file`` only needs a ``name`` attribute: the path the results are
        written to as CSV.
        """
        cls = type(self)
        # if model has changed, load new model; store it on the class so the
        # next request finds it (an instance attribute would be lost)
        if cls.model.model_name != model_name:
            cls.model = Model(model_name)
        # keep a private reference: a later request may replace the shared one
        self.model = cls.model
        self.model_name = model_name
        self.parse_seq(src)
        self.parse_sub(trg)
        self.scoring_strategy = scoring_strategy
        self.out = pd.DataFrame(self.sub, columns=['0', self.model_name])
        self.out_buffer = out_file.name

    def parse_output(self) -> str:
        """Arrange the results, save them as CSV and return them as HTML.

        In ``'MUT'`` mode rows are sorted by score. In ``'DMS'`` mode each
        scanned residue gets its own pair of columns (substitution, score),
        sorted by score, with the residues side by side.
        """
        if self.mode == 'MUT':      # if single substitution mode, sort by score
            self.out = self.out.sort_values(self.model_name, ascending=False)
        elif self.mode == 'DMS':    # if deep mutational scanning mode, sort by residue and score
            ranked = (self.out
                          .assign(resi=self.out['0'].str.extract(r'(\d+)', expand=False).astype(int))
                          .sort_values(['resi', self.model_name], ascending=[True, False]))
            # One panel per residue, however many rows it has: a residue outside
            # the 20 standard amino acids yields 20 substitutions instead of 19,
            # which broke the previous fixed-size slicing.
            panels = [group.drop(columns='resi').reset_index(drop=True)
                      for _, group in ranked.groupby('resi', sort=True)]
            self.out = pd.concat(panels, axis=1)
            self.out = self.out.set_axis(range(self.out.shape[1]), axis='columns')
        # save to temporary file to be downloaded
        self.out.round(2).to_csv(self.out_buffer, index=False)
        return (self.out.style
                        # na_rep blanks the padding cells of shorter panels
                        .format(lambda x: f'{x:.2f}' if isinstance(x, float) else x, na_rep='')
                        .hide(axis=0)
                        .hide(axis=1)
                        .background_gradient(cmap="RdYlGn", vmax=8, vmin=-8)
                        .to_html())

    def calculate(self):
        """Run the model and format the output; return ``(self, html)``."""
        self.model.run_model(self)
        return self, self.parse_output()
 def app(*argv):
+    "run app"
+    seq, trg, model_name, scoring_strategy, out_file, *_ = argv
+    data, html = Data(seq, trg, model_name, scoring_strategy, out_file).calculate()
+    return html, gr.File.update(value=out_file.name, visible=True)
+    # df = pd.DataFrame((pd.np.random.random((10, 5))-0.5)*10, columns=list('ABCDE'))
+    # df.to_csv(out_file.name, index=False)
+    # return df.to_html(), gr.File.update(value=out_file.name, visible=True)
# Build the UI: instructions, the four inputs, a submit button, the HTML results
# table and a (initially hidden) download box for the CSV.
# NOTE(review): NamedTemporaryFile deletes `out_file` when this `with` exits
# (default delete=True), while `bto` keeps referring to its path — confirm that
# gradio has captured the file by then.
with gr.Blocks() as demo, NamedTemporaryFile(mode='w+', prefix='out_', suffix='.csv') as out_file, open("instructions.md", "r", encoding="utf-8") as md:
    # explicit encoding: the markdown must not be decoded with the host locale
    gr.Markdown(md.read())
    seq = gr.Textbox(lines=2, label="Sequence", placeholder="Sequence here...", value='MVEQYLLEAIVRDARDGITISDCSRPDNPLVFVNDAFTRMTGYDAEEVIGKNCRFLQRGDINLSAVHTIKIAMLTHEPCLVTLKNYRKDGTIFWNELSLTPIINKNGLITHYLGIQKDVSAQVILNQTLHEENHLLKSNKEMLEYLVNIDALTGLHNRRFLEDQLVIQWKLASRHINTITIFMIDIDYFKAFNDTYGHTAGDEALRTIAKTLNNCFMRGSDFVARYGGEEFTILAIGMTELQAHEYSTKLVQKIENLNIHHKGSPLGHLTISLGYSQANPQYHNDQNLVIEQADRALYSAKVEGKNRAVAYREQ')
    trg = gr.Textbox(lines=1, label="Substitutions", placeholder="Substitutions here...", value="61 214 19 30 122 140")
    model_name = gr.Dropdown(MODELS, label="Model", value=MODELS[1])
    scoring_strategy = gr.Dropdown(SCORING, label="Scoring strategy", value=SCORING[1])
    btn = gr.Button(value="Submit")
    out = gr.HTML()
    bto = gr.File(value=out_file.name, visible=False, label="Download", file_count='single', interactive=False)
    # `bto` is both an input (handed to `app` as the output file) and an output
    btn.click(fn=app, inputs=[seq, trg, model_name, scoring_strategy, bto], outputs=[out, bto])
# NOTE(review): no active demo.launch() call is visible here — confirm that the
# hosting runtime serves `demo` itself, otherwise the UI is built but never started.
# demo.launch(share=True, server_name="0.0.0.0", server_port=7878)

instructions.md ADDED Viewed

	@@ -0,0 +1,13 @@

+# **ESM zero-shot variant prediction**
+This was inspired by this [paper](https://doi.org/10.1101/2021.07.09.450648) and adapted from [this repo](https://github.com/facebookresearch/esm/tree/main/esm).
+#### **Instructions**
+- in the 'sequence' text box, give the full amino acid sequence of the protein to be analysed; wildcard ("jolly") characters (e.g. -X.B) are supported (but at the moment the visualisation does not show the correct results)
+- there are three running modes to choose from, depending on the input in the 'substitution' box:
+  - if another sequence is given, the positions that are different between the two will be evaluated (NB the sequences must be of the same length) and their score returned
+  - if a list of integers is given, a deep mutational scan will be performed at those positions in the input sequence and the scores for the amino acids, different from the original one, will be returned
+  - if a single substitution or a list thereof is given (in the form of **B008S**), the single substitution score is returned
+- you can choose which ESM model to use for the calculations, these models are the ones that are available at runtime on Hugging Face Model Hub
+- there are 2 scoring strategies available: wt-marginals and masked-marginals; the first one is faster but less accurate, the second one considers the sequence context more thoroughly but is noticeably slower (the run time scales linearly with sequence length)
+- the results will be shown in a table, with color coding and sorted by fitness (if performing a deep mutational scan)
+- the output data is available for download from the box at the bottom as a CSV file

requirements.txt CHANGED Viewed

@@ -2,5 +2,4 @@ gradio
 huggingface_hub
 pandas
 torch
-tqdm
 transformers

 huggingface_hub
 pandas
 torch
 transformers