"""GradioFold demo: ProtGPT2 sequence completion plus AlphaFold prediction.

The script builds a Gradio Blocks UI with two steps:

1. ProtGPT2 completes a user-supplied start sequence into several candidates.
2. AlphaFold (single-sequence mode, blank template) predicts a structure for
   the selected candidate and the per-residue pLDDT is plotted.
"""

import os
import sys

import gradio as gr
import matplotlib

# Select the non-interactive backend before pyplot is imported.
matplotlib.use("Agg")
import matplotlib.pyplot as plt  # noqa: E402
import numpy as np  # noqa: E402
from transformers import pipeline as pl  # noqa: E402

print(os.getcwd())

# The alphafold imports below rely on this directory being on sys.path.
ALPHAFOLD_DIR = "/home/user/app/alphafold"
if ALPHAFOLD_DIR not in sys.path:
    sys.path.append(ALPHAFOLD_DIR)

from alphafold.common import protein  # noqa: E402
from alphafold.data import pipeline  # noqa: E402
from alphafold.data import templates  # noqa: E402
from alphafold.model import data  # noqa: E402
from alphafold.model import config  # noqa: E402
from alphafold.model import model  # noqa: E402

# AlphaFold models to run; other candidates: "model_2" ... "model_5".
AF_MODEL_NAMES = ["model_1"]
# Model whose pLDDT / PDB file is shown in the UI.
AF_DISPLAY_MODEL = "model_1"
# Prefix of the PDB files written by predict_structure().
PDB_PREFIX = "test"


def update_seqs(choice):
    """Copy the selected radio choice into the "Chosen sequence" textbox."""
    return gr.Textbox.update(value=choice)


def mk_mock_template(query_sequence):
    """Create blank template features for a sequence.

    Args:
        query_sequence: Amino-acid sequence; only its length is used.

    Returns:
        Dict of template features with a leading axis of size 1: all-zero
        atom positions and masks, and the one-hot encoding of an all-gap
        ("-") template sequence.
    """
    ln = len(query_sequence)
    atom_type_num = templates.residue_constants.atom_type_num

    gap_sequence = "-" * ln
    all_atom_positions = np.zeros((ln, atom_type_num, 3))
    all_atom_masks = np.zeros((ln, atom_type_num))
    aatype = templates.residue_constants.sequence_to_onehot(
        gap_sequence, templates.residue_constants.HHBLITS_AA_TO_ID
    )

    return {
        "template_all_atom_positions": all_atom_positions[None],
        "template_all_atom_masks": all_atom_masks[None],
        "template_aatype": np.array(aatype)[None],
        "template_domain_names": [b"none"],
    }


def predict_structure(prefix, feature_dict, model_runners, random_seed=0):
    """Predict a structure with each model runner and write it as PDB.

    Args:
        prefix: Output file prefix; files are named
            ``{prefix}_unrelaxed_{model_name}.pdb``.
        feature_dict: Input features passed to ``process_features``.
        model_runners: Mapping of model name to AlphaFold model runner.
        random_seed: Seed forwarded to feature processing.

    Returns:
        Dict mapping model name to that model's ``plddt`` prediction.
    """
    plddts = {}
    for model_name, model_runner in model_runners.items():
        processed_feature_dict = model_runner.process_features(
            feature_dict, random_seed=random_seed
        )
        prediction_result = model_runner.predict(processed_feature_dict)

        # Spread the per-residue pLDDT over the atom mask so it is stored
        # as the B-factor of every present atom.
        final_atom_mask = prediction_result["structure_module"]["final_atom_mask"]
        b_factors = prediction_result["plddt"][:, None] * final_atom_mask

        unrelaxed_protein = protein.from_prediction(
            processed_feature_dict, prediction_result, b_factors
        )

        plddts[model_name] = prediction_result["plddt"]
        print(f"{model_name} {plddts[model_name].mean()}")

        unrelaxed_pdb_path = f"{prefix}_unrelaxed_{model_name}.pdb"
        with open(unrelaxed_pdb_path, "w") as f:
            f.write(protein.to_pdb(unrelaxed_protein))
    return plddts


def run_protgpt2(startsequence, length):
    """Generate five sequence completions with ProtGPT2.

    Args:
        startsequence: Prompt sequence to complete.
        length: Value passed as ``max_length`` to the generation pipeline.

    Returns:
        The pipeline output: a list of dicts with a ``generated_text`` key.
    """
    protgpt2 = pl("text-generation", model="nferruz/ProtGPT2")
    return protgpt2(
        startsequence,
        max_length=length,
        do_sample=True,
        top_k=950,
        repetition_penalty=1.2,
        num_return_sequences=5,
        eos_token_id=0,
    )


def run_alphafold(startsequence):
    """Run AlphaFold on a single sequence without MSA or real templates.

    Args:
        startsequence: Amino-acid sequence; newlines are stripped.

    Returns:
        The per-residue pLDDT of ``AF_DISPLAY_MODEL``. As a side effect the
        predicted structure is written to
        ``{PDB_PREFIX}_unrelaxed_{model_name}.pdb``.
    """
    model_runners = {}
    for model_name in AF_MODEL_NAMES:
        model_config = config.model_config(model_name)
        model_config.data.eval.num_ensemble = 1
        model_params = data.get_model_haiku_params(model_name=model_name, data_dir=".")
        model_runners[model_name] = model.RunModel(model_config, model_params)

    query_sequence = startsequence.replace("\n", "")
    num_res = len(query_sequence)

    feature_dict = {
        **pipeline.make_sequence_features(
            sequence=query_sequence, description="none", num_res=num_res
        ),
        # The "MSA" consists of the query sequence only.
        **pipeline.make_msa_features(
            msas=[[query_sequence]], deletion_matrices=[[[0] * num_res]]
        ),
        **mk_mock_template(query_sequence),
    }
    plddts = predict_structure(PDB_PREFIX, feature_dict, model_runners)
    return plddts[AF_DISPLAY_MODEL]


def update_protGPT2(inp, length):
    """Button callback: generate sequences and offer them as radio choices.

    Args:
        inp: Start sequence from the textbox.
        length: Target length from the ``gr.Number`` field.

    Returns:
        A ``gr.Radio`` update whose choices are the generated sequences.
    """
    # gr.Number delivers a float, but max_length must be an integer.
    generated_seqs = run_protgpt2(inp, int(length))
    gen_seqs = [x["generated_text"] for x in generated_seqs]
    print(gen_seqs)
    # Pass the list as `choices` explicitly; the first positional parameter
    # of Radio.update is the selected value, not the list of options.
    return gr.Radio.update(choices=gen_seqs)


def _apply_plot_style():
    """Apply the seaborn "ticks" + "talk" style under its current or old name.

    Matplotlib 3.6 renamed the bundled seaborn styles to ``seaborn-v0_8-*``
    and later removed the old names, so try the new names first.
    """
    candidates = (
        ["seaborn-v0_8-ticks", "seaborn-v0_8-talk"],
        ["seaborn-ticks", "seaborn-talk"],
    )
    for styles in candidates:
        try:
            plt.style.use(styles)
        except OSError:
            continue
        return


def update(inp):
    """Button callback: run AlphaFold and build the result widgets.

    Args:
        inp: Sequence to predict.

    Returns:
        Tuple of (HTML for the structure panel, matplotlib figure with the
        per-residue pLDDT, "mean ± std" pLDDT string).

    Raises:
        ValueError: If no sequence was provided.
    """
    if not inp or not inp.strip():
        raise ValueError("No sequence selected for structure prediction.")

    print("Running AF on", inp)
    plddts = run_alphafold(inp)
    print(plddts)

    _apply_plot_style()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(plddts)
    ax.set_ylabel("predicted LDDT")
    ax.set_xlabel("positions")
    ax.set_title("pLDDT")
    # Unregister the figure from pyplot so repeated requests do not pile up
    # open figures; the Figure object itself stays usable for rendering.
    plt.close(fig)

    return (
        molecule(f"{PDB_PREFIX}_unrelaxed_{AF_DISPLAY_MODEL}.pdb"),
        fig,
        f"{np.mean(plddts):.1f} ± {np.std(plddts):.1f}",
    )


def read_mol(molpath):
    """Return the full text content of the file at ``molpath``."""
    with open(molpath, "r") as fp:
        return fp.read()


def molecule(pdb):
    """Return the HTML shown in the structure panel for a PDB file.

    NOTE(review): this currently returns an empty string; neither the PDB
    text nor the confidence legend below is embedded in the result. The
    viewer markup appears to be missing from this file and needs to be
    restored before the panel can show anything.

    Args:
        pdb: Path to the PDB file; it is read, so a missing file raises.

    Returns:
        An empty string.
    """
    mol = read_mol(pdb)  # noqa: F841 - kept for the restored viewer markup
    legend = (  # noqa: F841 - kept for the restored viewer markup
        """
AlphaFold model confidence:
 Very high (pLDDT > 90)
 Confident (90 > pLDDT > 70)
 Low (70 > pLDDT > 50)
 Very low (pLDDT < 50)
AlphaFold produces a per-residue confidence score (pLDDT) between 0 and 100. Some regions below 50 pLDDT may be unstructured in isolation.
"""
    )
    return ""


proteindream = gr.Blocks()

with proteindream:
    gr.Markdown("# GradioFold")
    gr.Markdown(
        """GradioFold is a web-based tool that combines a large language model trained on natural protein sequence (protGPT2) with structure prediction using AlphaFold. Type a start sequence or provide a sequence with blanks that protGPT2 can complete."""
    )

    # Step 1: sequence completion with ProtGPT2.
    gr.Markdown("## protGPT2")
    gr.Markdown(
        """
Enter a start sequence and have the language model complete it.
"""
    )
    with gr.Group():
        with gr.Row():
            inp = gr.Textbox(placeholder="M", label="Start sequence")
            length = gr.Number(value=50, label="Target sequence length")
        btn = gr.Button("Autocomplete sequences")

    # Example sequences offered before anything has been generated.
    seqs = [
        "MTAEADPAPLAANPPAPVRPIQFHDVSVRYEARPWLRALWDVASGSFIGLLGASGAGKSTCVDLLNGVRKPSSGERFVRGQPSRGRKGRFNRRVAMVFQDVRHQLFSRSVAREIAFGLENLPTSAAAIDRRVS",
        "MTAGIVAGGIAGGVAGYKAKKHRKAVKATMIAAGVSGGIGGGYIGEKFNRRLAKHEDRVRRSAPRHKKHSSYSKSSGEGGGILGKLFGR",
        "MTAVLVAIALEMQNPHRMALAAVLCGQFTVAVAAEPFAPEGVAEGLNPLGDLLAESPLLEVVSATLALLVALGTATSLSWISGPVSALPAPSFQSSETPYPQRPIERESFDQDSREEDPWDRL",
        "MTARVRNRSSSRSYVLDFADLADGQREVLLPESRGNASEVDLPAGTTVNVTIDVTASGTGTLTARTPDGADVVSNEYELTVERDTDLTRVETESPQVAAGETATVTGTAENVGTVAGEREVTAYVDGE",
        "MTAAGWREEGTPFARIARQLGRHVTSVRQAAGRVRQQMGLTSPDPADPPRSGPTPTIPIEQERA",
    ]
    seqChoice = gr.Radio(seqs, label="Generated sequences")
    btn.click(fn=update_protGPT2, inputs=[inp, length], outputs=seqChoice)

    # Step 2: structure prediction with AlphaFold.
    gr.Markdown("## AlphaFold")
    gr.Markdown(
        "Select a generated sequence above for structure prediction using AlphaFold2."
    )
    with gr.Group():
        chosenSeq = gr.Textbox(label="Chosen sequence")
        btn2 = gr.Button("Predict structure")
    with gr.Group():
        meanpLDDT = gr.Textbox(label="Mean pLDDT of chosen sequence")
        with gr.Row():
            mol = gr.HTML()
            plot = gr.Plot(label="pLDDT")

    gr.Markdown(
        """## Acknowledgements

This was a fun demo using Gradio, Huggingface and ColabFold. More information about the used algorithms can be found below.

All code is available on [Github]() and licensed under MIT license.

- ProtGPT2: Ferruz et.al [BioRxiv](https://doi.org/10.1101/2022.03.09.483666) [Code](https://huggingface.co/nferruz/ProtGPT2)
- AlphaFold2: Jumper et.al [Paper](https://doi.org/10.1038/s41586-021-03819-2) [Code](https://github.com/deepmind/alphafold) Model parameters released under CC BY 4.0
- ColabFold: Mirdita et.al [Paper](https://doi.org/10.1101/2021.08.15.456425 ) [Code](https://github.com/sokrypton/ColabFold)

Created by [@simonduerr](https://twitter.com/simonduerr)
"""
    )

    seqChoice.change(fn=update_seqs, inputs=seqChoice, outputs=chosenSeq)
    # NOTE(review): prediction reads the radio selection, not the "Chosen
    # sequence" textbox, so manual edits to the textbox are ignored - confirm
    # whether that is intended.
    btn2.click(fn=update, inputs=seqChoice, outputs=[mol, plot, meanpLDDT])

proteindream.launch(share=False)