"""GradioFold: a Gradio app that chains ProtGPT2 sequence generation with AlphaFold.

ProtGPT2 (via a Hugging Face ``text-generation`` pipeline) proposes protein
sequences; a chosen sequence is then folded with a single-sequence AlphaFold
run (``model_1``, no MSA, blank template). Both heavy steps run as Ray remote
tasks. Importing/running this module builds the UI, initialises Ray and
launches the Gradio server.
"""
import gc
import os
import sys

import gradio as gr
import numpy as np
import ray
import matplotlib

matplotlib.use("Agg")  # must be selected before pyplot is imported
import matplotlib.pyplot as plt
from transformers import pipeline as pl
from GPUtil import showUtilization as gpu_usage
import pandas as pd
import plotly.graph_objects as go
import torch
import jax
from numba import cuda

_cuda_available = torch.cuda.is_available()
print('GPU available', _cuda_available)
if _cuda_available:
    # get_device_name(0) raises when no CUDA device is present, so only
    # query it when CUDA is actually available.
    print('__CUDA Device Name:', torch.cuda.get_device_name(0))
print(os.getcwd())

# The AlphaFold sources are expected at this path; make them importable.
if "/home/user/app/alphafold" not in sys.path:
    sys.path.append("/home/user/app/alphafold")

from alphafold.common import protein
from alphafold.data import pipeline
from alphafold.data import templates
from alphafold.model import data
from alphafold.model import config
from alphafold.model import model


def mk_mock_template(query_sequence):
    """Create blank template features for ``query_sequence``.

    Args:
        query_sequence: Amino-acid sequence; only its length is used.

    Returns:
        Dict with the keys ``template_all_atom_positions``,
        ``template_all_atom_masks``, ``template_aatype`` and
        ``template_domain_names``. The arrays are all-zero positions/masks
        and the one-hot encoding of an all-gap (``"-"``) sequence, each with
        a leading axis of size 1 added.
    """
    ln = len(query_sequence)
    atom_type_num = templates.residue_constants.atom_type_num

    output_templates_sequence = "-" * ln
    templates_all_atom_positions = np.zeros((ln, atom_type_num, 3))
    templates_all_atom_masks = np.zeros((ln, atom_type_num))
    templates_aatype = templates.residue_constants.sequence_to_onehot(
        output_templates_sequence, templates.residue_constants.HHBLITS_AA_TO_ID
    )
    return {
        "template_all_atom_positions": templates_all_atom_positions[None],
        "template_all_atom_masks": templates_all_atom_masks[None],
        "template_aatype": np.array(templates_aatype)[None],
        "template_domain_names": [b"none"],
    }


def predict_structure(prefix, feature_dict, model_runners, random_seed=0):
    """Predict a structure with every runner in ``model_runners``.

    For each runner the unrelaxed prediction is written to
    ``{prefix}_unrelaxed_{model_name}.pdb`` in the current working directory,
    with the per-residue pLDDT stored in the B-factor column.

    Args:
        prefix: Filename prefix for the written PDB files.
        feature_dict: Raw AlphaFold input features.
        model_runners: Mapping of model name to a runner exposing
            ``process_features`` and ``predict``.
        random_seed: Seed forwarded to ``process_features``.

    Returns:
        Dict mapping each model name to ``prediction_result["plddt"]``.
    """
    # Run the models.
    # currently we only run model1
    plddts = {}
    for model_name, model_runner in model_runners.items():
        processed_feature_dict = model_runner.process_features(
            feature_dict, random_seed=random_seed
        )
        prediction_result = model_runner.predict(processed_feature_dict)

        # Broadcast the per-residue pLDDT over the atoms present in the model
        # so it ends up in the B-factor column of the PDB output.
        b_factors = (
            prediction_result["plddt"][:, None]
            * prediction_result["structure_module"]["final_atom_mask"]
        )
        unrelaxed_protein = protein.from_prediction(
            processed_feature_dict, prediction_result, b_factors
        )

        unrelaxed_pdb_path = f"{prefix}_unrelaxed_{model_name}.pdb"
        plddts[model_name] = prediction_result["plddt"]
        print(f"{model_name} {plddts[model_name].mean()}")

        with open(unrelaxed_pdb_path, "w") as f:
            f.write(protein.to_pdb(unrelaxed_protein))
    return plddts


@ray.remote
def run_protgpt2(startsequence, length, repetitionPenalty, top_k_poolsize, max_seqs):
    """Ray task: sample sequences from ProtGPT2.

    Args:
        startsequence: Prompt passed to the text-generation pipeline.
        length: Forwarded as ``max_length``.
        repetitionPenalty: Forwarded as ``repetition_penalty``.
        top_k_poolsize: Forwarded as ``top_k``.
        max_seqs: Forwarded as ``num_return_sequences``.

    Returns:
        The pipeline output (the caller reads ``"generated_text"`` from each
        item).
    """
    print("running protgpt2")
    print(gpu_usage())
    # The pipeline (and model weights) is created on every call.
    protgpt2 = pl("text-generation", model="nferruz/ProtGPT2")
    sequences = protgpt2(
        startsequence,
        max_length=length,
        do_sample=True,
        top_k=top_k_poolsize,
        repetition_penalty=repetitionPenalty,
        num_return_sequences=max_seqs,
        eos_token_id=0,
    )
    # NOTE: explicit GPU cleanup (torch.cuda.empty_cache / numba device
    # reset) is currently disabled; only the log line remains.
    print("Cleaning up after protGPT2")
    return sequences


@ray.remote
def run_alphafold(startsequence):
    """Ray task: fold ``startsequence`` with AlphaFold ``model_1``.

    The sequence is used as a single-sequence MSA with a blank template.
    As a side effect ``test_unrelaxed_model_1.pdb`` is written to the
    working directory (see ``predict_structure``).

    Args:
        startsequence: Amino-acid sequence; newline characters are removed.

    Returns:
        The pLDDT values produced by ``model_1``.
    """
    print(gpu_usage())
    model_runners = {}
    models = ["model_1"]  # ,"model_2","model_3","model_4","model_5"]
    for model_name in models:
        model_config = config.model_config(model_name)
        model_config.data.eval.num_ensemble = 1
        model_params = data.get_model_haiku_params(model_name=model_name, data_dir=".")
        model_runners[model_name] = model.RunModel(model_config, model_params)

    query_sequence = startsequence.replace("\n", "")

    feature_dict = {
        **pipeline.make_sequence_features(
            sequence=query_sequence, description="none", num_res=len(query_sequence)
        ),
        **pipeline.make_msa_features(
            msas=[[query_sequence]],
            deletion_matrices=[[[0] * len(query_sequence)]],
        ),
        **mk_mock_template(query_sequence),
    }
    # The "test" prefix must match the PDB filename read back in `update`.
    plddts = predict_structure("test", feature_dict, model_runners)
    print("AF2 done")
    # NOTE: explicit JAX buffer deletion / device reset is currently disabled.
    return plddts["model_1"]


def _format_fasta(sequences, width=70):
    """Render ``sequences`` as FASTA text with headers ``>seq0``, ``>seq1``, ...

    Newlines inside each sequence are dropped and the residues are re-wrapped
    at ``width`` characters per line.
    """
    records = []
    for idx, seq in enumerate(sequences):
        flat = seq.replace("\n", "")
        wrapped = "\n".join(
            flat[start:start + width] for start in range(0, len(flat), width)
        )
        records.append(f">seq{idx}\n{wrapped}\n")
    return "".join(records)


def update_protGPT2(inp, length, repetitionPenalty, top_k_poolsize, max_seqs):
    """Gradio callback: generate sequences with ProtGPT2 and return FASTA text.

    Args:
        inp: Start sequence typed by the user.
        length: Maximum sequence length.
        repetitionPenalty: Repetition penalty for sampling.
        top_k_poolsize: Top-K sampling pool size.
        max_seqs: Number of sequences to generate.

    Returns:
        The generated sequences formatted as FASTA (70 residues per line).
    """
    # Gradio numeric widgets may deliver floats, while these three values are
    # counts/sizes, so coerce them to int before handing them to the pipeline.
    generated_seqs = ray.get(
        run_protgpt2.remote(
            inp,
            int(length),
            repetitionPenalty,
            int(top_k_poolsize),
            int(max_seqs),
        )
    )
    gen_seqs = [x["generated_text"] for x in generated_seqs]
    print(gen_seqs)
    return _format_fasta(gen_seqs)


def update(inp):
    """Gradio callback: fold ``inp`` with AlphaFold and build the outputs.

    Args:
        inp: The chosen amino-acid sequence.

    Returns:
        Tuple of (HTML from ``molecule`` for the written PDB file, Plotly
        figure of pLDDT per residue index, ``"mean ± std"`` pLDDT string).
    """
    print("Running AF on", inp)
    # run alphafold using ray
    plddts = ray.get(run_alphafold.remote(inp))
    print(plddts)

    fig = go.Figure(
        data=go.Scatter(
            x=np.arange(len(plddts)),
            y=plddts,
            # Plotly hover text uses <br> for line breaks.
            hovertemplate='pLDDT: %{y:.2f} <br>Residue index: %{x}',
        )
    )
    fig.update_layout(
        title="pLDDT",
        xaxis_title="Residue index",
        yaxis_title="pLDDT",
        height=500,
        template="simple_white",
    )
    return (
        # File written by run_alphafold (prefix "test", model "model_1").
        molecule("test_unrelaxed_model_1.pdb"),
        fig,
        f"{np.mean(plddts):.1f} ± {np.std(plddts):.1f}",
    )


def read_mol(molpath):
    """Return the full text content of the file at ``molpath``."""
    with open(molpath, "r") as fp:
        return fp.read()


def molecule(pdb):
    """Build the HTML shown in the structure viewer for the PDB file ``pdb``.

    NOTE(review): in this copy of the file the HTML/JavaScript markup of the
    viewer appears to have been stripped (only the legend text survived and
    the returned template is empty), so this currently returns an empty
    string. Restore the original markup from version control; it is not
    reconstructed here to avoid inventing behaviour.

    Args:
        pdb: Path to the PDB file to display.

    Returns:
        The HTML string for the ``gr.HTML`` component (currently ``""``).
    """
    # Still read the file so a missing PDB fails loudly, as before.
    mol = read_mol(pdb)
    # Legend text that survived the markup stripping; unused until the
    # surrounding HTML (which embedded `mol`) is restored.
    x = (
        "\n"
        "AlphaFold model confidence:\n"
        " Very high (pLDDT > 90)\n"
        " Confident (90 > pLDDT > 70)\n"
        " Low (70 > pLDDT > 50)\n"
        " Very low (pLDDT < 50)\n"
        "AlphaFold produces a per-residue confidence score (pLDDT) between 0 and 100. "
        "Some regions below 50 pLDDT may be unstructured in isolation.\n"
    )
    return ""


def change_sequence(chosenSeq):
    """Identity callback: return the chosen sequence unchanged."""
    return chosenSeq


# Markdown shown at the bottom of the page. Kept at column 0 so no line is
# indented far enough to be rendered as a code block.
_ACKNOWLEDGEMENTS_MD = """## Acknowledgements

This was a fun demo using Gradio, Huggingface Spaces and ColabFold as inspiration. More information about the used algorithms can be found below. All code is available on [Huggingface](https://huggingface.co/spaces/simonduerr/protGPT2_gradioFold/blob/main) and licensed under MIT license.

- ProtGPT2: Ferruz et.al 📄[BioRxiv](https://doi.org/10.1101/2022.03.09.483666) 💻[Code](https://huggingface.co/nferruz/ProtGPT2)
- AlphaFold2: Jumper et.al 📄[Paper](https://doi.org/10.1038/s41586-021-03819-2) 💻[Code](https://github.com/deepmind/alphafold) Model parameters released under CC BY 4.0
- ColabFold: Mirdita et.al 📄[Paper](https://doi.org/10.1101/2021.08.15.456425 ) 💻[Code](https://github.com/sokrypton/ColabFold)
- 3Dmol.js: Rego & Koes 📄[Paper](https://academic.oup.com/bioinformatics/article/31/8/1322/213186) 💻 [Code](https://github.com/3dmol/3Dmol.js)

Created by [@simonduerr](https://twitter.com/simonduerr)
"""

proteindream = gr.Blocks()

with proteindream:
    gr.Markdown("# GradioFold")
    gr.Markdown(
        """GradioFold is a web-based tool that combines a large language model trained on natural protein sequence (protGPT2) with structure prediction using AlphaFold. Type a start sequence that protGPT2 can complete or let protGPT2 generate a complete sequence."""
    )

    # --- Step 1: sequence generation with ProtGPT2 ---
    gr.Markdown("## protGPT2")
    gr.Markdown(
        """ Enter a start sequence and have the language model complete it OR leave empty. """
    )
    with gr.Box():
        with gr.Row():
            inp = gr.Textbox(placeholder="M", label="Start sequence")
            length = gr.Number(value=50, label="Max sequence length")
        with gr.Row():
            repetitionPenalty = gr.Slider(
                minimum=1, maximum=5, value=1.2, label="Repetition penalty"
            )
            top_k_poolsize = gr.Slider(
                minimum=700, maximum=52056, value=950, label="Top-K sampling pool size"
            )
            max_seqs = gr.Slider(
                minimum=2, maximum=20, value=5, label="Number of sequences to generate"
            )
        btn = gr.Button("Predict sequences using protGPT2")

    results = gr.Textbox(label="Results", lines=15)
    btn.click(
        fn=update_protGPT2,
        inputs=[inp, length, repetitionPenalty, top_k_poolsize, max_seqs],
        outputs=results,
    )

    # --- Step 2: structure prediction with AlphaFold ---
    gr.Markdown("## AlphaFold")
    gr.Markdown(
        "Select a generated sequence above and copy it in the field below for structure prediction using AlphaFold2."
    )
    with gr.Group():
        chosenSeq = gr.Textbox(label="Chosen sequence")
        btn2 = gr.Button("Predict structure")
    with gr.Group():
        meanpLDDT = gr.Textbox(label="Mean pLDDT of chosen sequence")
        with gr.Row():
            mol = gr.HTML()
            plot = gr.Plot(label="pLDDT")

    gr.Markdown(_ACKNOWLEDGEMENTS_MD)

    btn2.click(fn=update, inputs=chosenSeq, outputs=[mol, plot, meanpLDDT])

# Ship the current directory to the Ray workers so they can import the local
# code and find the model parameters.
ray.init(runtime_env={"working_dir": "."})

proteindream.launch(share=False)