"""Gradio app: predict protein function labels from a FASTA file.

Each sequence is embedded with the ProtT5-XL encoder, and a pre-trained
multi-label Keras classifier scores 1500 function labels.
"""

import re

import gradio as gr
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
from Bio import SeqIO
from transformers import T5Tokenizer, T5EncoderModel

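# Files expected alongside this script (assumption, inferred from the paths
# used in predict() below):
#   my_model.weights.h5  - trained classifier weights
#   labels.csv           - one column per predicted function label
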
# Run the encoder on GPU when available; ProtT5-XL is slow on CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc").to(device)
model.eval()  # inference only; makes the no-dropout mode explicit

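# Note (assumption, based on the checkpoint name): the "half" checkpoint ships
# fp16 weights; when running on CPU, casting back to full precision with
# model.float() is the commonly recommended workaround.
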
def get_embeddings(seq):
    # ProtT5 expects space-separated residues; map rare/ambiguous amino acids
    # (U, Z, O, B) to X, following the Rostlab preprocessing convention.
    sequence_examples = [" ".join(list(re.sub(r"[UZOB]", "X", seq)))]

    ids = tokenizer.batch_encode_plus(sequence_examples, add_special_tokens=True, padding="longest")
    input_ids = torch.tensor(ids['input_ids']).to(device)
    attention_mask = torch.tensor(ids['attention_mask']).to(device)

    with torch.no_grad():
        embedding_repr = model(input_ids=input_ids, attention_mask=attention_mask)

    # Slice off the trailing </s> special token before pooling, then mean-pool
    # over residues to get one 1024-dim embedding per protein.
    emb_0 = embedding_repr.last_hidden_state[0, :len(seq)]
    emb_0_per_protein = emb_0.mean(dim=0)

    return emb_0_per_protein

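# Quick sanity check (hypothetical sequence): the result is a torch.Tensor of
# shape (1024,), matching the classifier's INPUT_SHAPE below.
#   vec = get_embeddings("MKTAYIAKQRQISFVKSHFSRQLEERLG")
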
def predict(filepath):
    # First pass over the FASTA file: count records so the embedding matrix
    # can be preallocated.
    num_sequences = sum(1 for _ in SeqIO.parse(filepath, "fasta"))

    ids = []
    embeds = np.zeros((num_sequences, 1024))

    # Second pass: embed every sequence with ProtT5.
    with open(filepath, "r") as fasta_file:
        for i, record in enumerate(SeqIO.parse(fasta_file, "fasta")):
            ids.append(record.id)
            embeds[i] = get_embeddings(str(record.seq)).detach().cpu().numpy()

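    # Classifier head: three 512-unit ReLU blocks with dropout, ending in a
    # sigmoid layer over 1500 labels (multi-label, hence sigmoid + binary
    # cross-entropy rather than softmax).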
    INPUT_SHAPE = [1024]
    num_of_labels = 1500

    # Named `classifier` (rather than `model`) so it does not shadow the
    # global ProtT5 encoder used by get_embeddings().
    classifier = tf.keras.Sequential([
        tf.keras.layers.BatchNormalization(input_shape=INPUT_SHAPE),
        tf.keras.layers.Dense(units=512, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(units=512, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(units=512, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(units=num_of_labels, activation='sigmoid')
    ])

    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['binary_accuracy', tf.keras.metrics.AUC()]
    )

    classifier.load_weights('./my_model.weights.h5')

    labels_df = pd.read_csv('./labels.csv')
    labels_df = labels_df.drop(columns='Unnamed: 0')

    # Per-label probabilities from the sigmoid head; 0.4 is the decision
    # threshold used for the binary calls.
    scores = classifier.predict(embeds)
    binary = (scores >= 0.4).astype(int)

    label_columns = labels_df.columns
    predictions_df = pd.DataFrame(binary, columns=label_columns)
    scores_df = pd.DataFrame(scores, columns=label_columns)

    # Attach the FASTA record IDs so each row is traceable to its protein.
    predictions_df.insert(0, 'id', ids)
    scores_df.insert(0, 'id', ids)

    predictions_df.to_csv("predictions.csv", index=False)
    scores_df.to_csv("decimal.csv", index=False)
    return "predictions.csv"

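# Local smoke test without the UI (hypothetical file name):
#   predict("example.fasta")   # writes predictions.csv and decimal.csv
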
gr.Interface(
    predict,
    title='Protein Function Prediction',
    description='Upload a FASTA file to get per-protein function predictions as a CSV.',
    inputs="file",
    outputs="file"
).launch()