# NOTE(review): the lines below are Hugging Face Spaces page chrome (status,
# file size, git blame hashes, column ruler) captured by the scrape — not code.
# Commented out so the file is valid Python; original text preserved verbatim.
# Spaces: Sleeping Sleeping
# File size: 1,711 Bytes
# 0ae121b 19ff66f 0ae121b f050150 0ae121b db25cc4 fd2a35c 0ae121b 19ff66f 77d468f 19ff66f fd2a35c cc2c186 0ae121b fd2a35c 0ae121b cc2c186 f050150 0805962 15a14e4 0ae121b
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
import gradio as gr
from transformers import AutoModel, AutoTokenizer, AutoModelForTokenClassification
import torch
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt
# Load the fine-tuned DR-BERT checkpoint (token-classification head) and its
# tokenizer from the local checkpoint directory bundled with this Space.
tokenizer = AutoTokenizer.from_pretrained("./checkpoint-final/")
# .eval() disables dropout/batch-norm updates for deterministic inference.
model = AutoModelForTokenClassification.from_pretrained("./checkpoint-final/").eval()

# One example amino-acid sequence shown in the Gradio UI.
examples = [
    ["GSHMSDNEDNFDGDDFDDVEEDEGLDDLENAEEEGQENVEILPSGERPQANQKRITTPYMTKYERARVLGTRALQIAMCAPVMVELEGETDPLLIAMKELKARKIPIIIRRYLPDGSYEDWGVDELIITD"],
]
def get_out(sent):
    """Predict per-residue disorder scores for an amino-acid sequence.

    Args:
        sent: Amino-acid sequence as a plain string (one letter per residue).

    Returns:
        A ``(fig, text)`` tuple: a matplotlib figure plotting the DR-BERT
        score along the sequence, and a comma-joined string of the scores
        (prefixed with a truncation warning when the input was shortened).
    """
    prefix = ""
    # The model's positional limit: keep only the first 1022 residues
    # (plus the two special tokens added by the tokenizer -> 1024 positions).
    if len(sent) > 1022:
        sent = sent[:1022]
        prefix = "Your protein was longer than 1022 AAs. We are working on including longer sequences but in the meantime, here are the scores for the first 1022 AAs: \n "
    # Server-side log of the (possibly truncated) input sequence.
    print(sent)
    encoded = tokenizer.encode_plus(sent, return_tensors="pt")
    with torch.no_grad():
        output = model(**encoded)
    # Fix: pass dim explicitly — F.softmax without `dim` relies on deprecated
    # implicit-dimension behavior. dim=-1 normalizes over the label classes.
    # Column 1 is taken as the "disordered" class probability; [1:-1] drops
    # the special [CLS]/[SEP] token positions. `.detach()` is unnecessary
    # inside torch.no_grad(), so it was removed.
    probs = F.softmax(torch.squeeze(output['logits']), dim=-1)[1:-1, 1].numpy()
    fig = plt.figure()
    plt.plot(probs)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.xlabel('Sequence position', fontsize=15)
    plt.ylabel('DR-BERT score', fontsize=15)
    scores_csv = ','.join(str(x) for x in probs)
    return (fig, prefix + scores_csv)
# Assemble the Gradio demo: one textbox in, a score plot plus the raw
# comma-separated scores out, then serve it.
demo = gr.Interface(
    fn=get_out,
    inputs=[
        gr.components.Textbox(
            label="Input Amino Acid Sequence",
            placeholder=" Amino acid sequence here ...",
        ),
    ],
    outputs=["plot", "text"],
    examples=examples,
    title="DR-BERT: A Protein Language Model to Predict Disordered Regions",
    description="The app uses DR-BERT to predict disordered regions in proteins. Outputs generated are the probability that a residue is disordered.",
)
demo.launch()