Spaces:

sagawa
/

PLTNUM

Sleeping

File size: 6,207 Bytes

import gradio as gr
import sys
import random
import os
import pandas as pd
import torch
import itertools
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

sys.path.append("scripts/")
from foldseek_util import get_struc_seq
from utils import seed_everything
from models import PLTNUM_PreTrainedModel
from datasets_ import PLTNUMDataset


class Config:
    """Default settings consumed by the prediction helpers in this file.

    Each instance starts with the same nine attributes; callers attach
    further fields (model name, device, tokenizer, ...) to it at run time.
    """

    def __init__(self):
        # Insertion order below is the order in which attributes are created.
        defaults = {
            "batch_size": 2,  # DataLoader batch size
            "use_amp": False,  # toggles torch autocast during inference
            "num_workers": 1,  # DataLoader worker count
            "max_length": 512,  # incremented by one when used_sequence == "both"
            "used_sequence": "left",
            "padding_side": "right",  # forwarded to the tokenizer
            "task": "classification",  # classification outputs go through a sigmoid
            "sequence_col": "sequence",  # column name of the input DataFrame
            "seed": 42,  # passed to seed_everything
        }
        for attr_name, attr_value in defaults.items():
            setattr(self, attr_name, attr_value)


def predict_stability(model_choice, organism_choice, pdb_file=None, sequence=None, cfg=None):
    """Resolve the user's input to a sequence and run the matching PLTNUM model.

    Args:
        model_choice: Base model name; "SaProt" selects the third Foldseek
            output, anything else the first.
        organism_choice: "Human" maps to the HeLa checkpoint, anything else
            to NIH3T3.
        pdb_file: Optional uploaded file object (its ``.name`` is the path)
            or a plain path string. When given, it takes precedence over
            ``sequence``.
        sequence: Optional sequence text, used when no PDB file is supplied.
        cfg: Optional settings object. A fresh ``Config`` is created per call
            when omitted, so state never leaks between calls.

    Returns:
        The HTML produced by ``predict``, or a plain error message when no
        usable input could be obtained.
    """
    if pdb_file:
        # Upload objects expose the on-disk path as ``.name``; a bare path
        # string is used as is.
        pdb_path = getattr(pdb_file, "name", pdb_file)
        try:
            # Make sure the bundled Foldseek binary is executable.
            os.chmod("bin/foldseek", 0o777)
        except OSError:
            # Best effort, as before: if the mode cannot be changed the
            # Foldseek call below is what reports the real problem.
            pass
        sequences = get_foldseek_seq(pdb_path)
        if not sequences:
            return "Failed to extract sequence from the PDB file."
        # Index 2 for SaProt, index 0 otherwise -- presumably the combined
        # structure-aware sequence vs. the plain residue sequence; confirm
        # against foldseek_util.get_struc_seq.
        sequence = sequences[2] if model_choice == "SaProt" else sequences[0]

    if isinstance(sequence, str):
        # Text boxes often deliver stray newlines/spaces; blank input is invalid.
        sequence = sequence.strip()
    if not sequence:
        return "No valid input provided."

    if cfg is None:
        # Built per call: ``predict`` mutates the config, so a shared default
        # instance would carry state across requests.
        cfg = Config()

    cell_line = "HeLa" if organism_choice == "Human" else "NIH3T3"
    checkpoint = f"sagawa/PLTNUM-{model_choice}-{cell_line}"
    cfg.model = checkpoint
    cfg.architecture = model_choice
    cfg.model_path = checkpoint

    return predict(cfg, sequence)



def get_foldseek_seq(pdb_path):
    """Run ``get_struc_seq`` on ``pdb_path`` and return its entry for chain "A".

    Args:
        pdb_path: Path of the PDB file to process.

    Returns:
        Whatever ``get_struc_seq`` stores under the key "A"; the caller
        indexes it at positions 0 and 2.
    """
    chain_id = "A"
    # A random id is handed over as ``process_id`` -- presumably so that
    # concurrent runs do not collide on temporary files; confirm in foldseek_util.
    run_id = random.randint(0, 10000000)
    per_chain = get_struc_seq(
        "bin/foldseek",
        pdb_path,
        [chain_id],
        process_id=run_id,
    )
    return per_chain[chain_id]


def predict(cfg, sequence):
    """Score one sequence with a PLTNUM checkpoint and render the result as HTML.

    ``cfg`` is modified in place: ``token_length``, ``device`` and
    ``tokenizer`` are set on it, and ``max_length`` grows by one when
    ``cfg.used_sequence == "both"``.

    Args:
        cfg: Settings object that already carries ``architecture`` and
            ``model_path`` in addition to the ``Config`` defaults.
        sequence: The sequence placed in the single-row input frame.

    Returns:
        An HTML snippet with the first raw prediction and its value
        thresholded at 0.5.
    """
    if cfg.architecture == "SaProt":
        cfg.token_length = 2
    else:
        cfg.token_length = 1
    cfg.device = "cuda" if torch.cuda.is_available() else "cpu"

    if cfg.used_sequence == "both":
        cfg.max_length = cfg.max_length + 1

    seed_everything(cfg.seed)
    frame = pd.DataFrame({cfg.sequence_col: [sequence]})

    # The tokenizer is stored on cfg before the dataset is built from cfg.
    cfg.tokenizer = AutoTokenizer.from_pretrained(
        cfg.model_path, padding_side=cfg.padding_side
    )
    loader = DataLoader(
        PLTNUMDataset(cfg, frame, train=False),
        batch_size=cfg.batch_size,
        shuffle=False,
        num_workers=cfg.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    model = PLTNUM_PreTrainedModel.from_pretrained(cfg.model_path, cfg=cfg)
    model.to(cfg.device)
    model.eval()

    batch_rows = []
    for batch, _ in loader:
        batch = batch.to(cfg.device)
        with torch.no_grad(), torch.amp.autocast(cfg.device, enabled=cfg.use_amp):
            scores = model(batch)
            if cfg.task == "classification":
                # Classification outputs are squashed into (0, 1).
                scores = torch.sigmoid(scores)
        batch_rows.extend(scores.cpu().tolist())

    # Each batch contributes a list of per-sample lists; flatten one level.
    raw_values = [value for row in batch_rows for value in row]
    binary_values = [1 if value > 0.5 else 0 for value in raw_values]

    # Only the first prediction is displayed.
    raw_value = raw_values[0]
    binary_value = binary_values[0]

    return f"""
    <div style='border: 2px solid #4CAF50; padding: 10px; border-radius: 10px;'>
        <p><strong>Raw prediction value:</strong> {raw_value}</p>
        <p><strong>Binary prediction values:</strong> {binary_value}</p>
    </div>
    """


# Gradio Interface
def _predict_from_sequence(model_name, organism, sequence_text):
    """Adapter for the sequence tab.

    Gradio hands input values to the callback positionally, so wiring the
    textbox straight to ``predict_stability`` would deliver the text as
    ``pdb_file``. This forwards it under the ``sequence`` keyword instead.
    """
    return predict_stability(model_name, organism, sequence=sequence_text)


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # PLTNUM: Protein LifeTime Neural Model
        **Predict the protein half-life from its sequence or PDB file.**
        """
    )

    gr.Image(
        "https://github.com/sagawatatsuya/PLTNUM/blob/main/model-image.png?raw=true",
        label="Model Image",
    )

    # Model and Organism selection in the same row to avoid layout issues
    with gr.Row():
        model_choice = gr.Radio(
            choices=["SaProt", "ESM2"],
            label="Select PLTNUM's base model.",
            value="SaProt",
        )
        organism_choice = gr.Radio(
            choices=["Mouse", "Human"],
            label="Select the target organism.",
            value="Mouse",
        )

    with gr.Tabs():
        # Each tab owns its own button and output component; the names are
        # rebound in the second tab after the first tab's click is wired.
        with gr.TabItem("Upload PDB File"):
            gr.Markdown("### Upload your PDB file:")
            pdb_file = gr.File(label="Upload PDB File")
            predict_button = gr.Button("Predict Stability")
            prediction_output = gr.HTML(
                label="Stability Prediction"
            )

            # Third positional input maps onto predict_stability's ``pdb_file``.
            predict_button.click(
                fn=predict_stability,
                inputs=[model_choice, organism_choice, pdb_file],
                outputs=prediction_output,
            )

        with gr.TabItem("Enter Protein Sequence"):
            gr.Markdown("### Enter the protein sequence:")
            sequence = gr.Textbox(
                label="Protein Sequence",
                placeholder="Enter your protein sequence here...",
                lines=8,
            )
            predict_button = gr.Button("Predict Stability")
            prediction_output = gr.HTML(
                label="Stability Prediction"
            )

            # Routed through the adapter so the text arrives as ``sequence``
            # rather than being mistaken for an uploaded file.
            predict_button.click(
                fn=_predict_from_sequence,
                inputs=[model_choice, organism_choice, sequence],
                outputs=prediction_output,
            )

    gr.Markdown(
        """
        ### How to Use:
        - **Select Model**: Choose between 'SaProt' or 'ESM2' for your prediction.
        - **Select Organism**: Choose between 'Mouse' or 'Human'.
        - **Upload PDB File**: Choose the 'Upload PDB File' tab and upload your file.
        - **Enter Sequence**: Alternatively, switch to the 'Enter Protein Sequence' tab and input your sequence.
        - **Predict**: Click 'Predict Stability' to receive the prediction.
        """
    )

    gr.Markdown(
        """
        ### About the Tool
        This tool allows researchers and scientists to predict the stability of proteins using advanced algorithms. It supports both PDB file uploads and direct sequence input.
        """
    )

demo.launch()