import gradio as gr
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import io
import base64
import os

# label_to_int is assumed to be a dictionary of {label_name: label_index}
label_to_int = pd.read_pickle('label_to_int.pkl')
int_to_label = {v: k for k, v in label_to_int.items()}
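# For illustration only (hypothetical values), the mappings are expected to look like:
#   label_to_int  -> {'label_a': 0, 'label_b': 1, ...}
#   int_to_label  -> {0: 'label_a', 1: 'label_b', ...}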
class LogisticRegressionTorch(nn.Module):
    def __init__(self, input_dim: int, output_dim: int):
        super(LogisticRegressionTorch, self).__init__()
        self.batch_norm = nn.BatchNorm1d(num_features=input_dim)
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        x = self.batch_norm(x)
        out = self.linear(x)
        return out
class BertClassifier(nn.Module):
    def __init__(self, bert_model: AutoModel, classifier: LogisticRegressionTorch, num_labels: int):
        super(BertClassifier, self).__init__()
        self.bert = bert_model  # Assumed to be a pre-trained BERT-style encoder
        self.classifier = classifier
        self.num_labels = num_labels

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None,
                token_type_ids: torch.Tensor = None, labels: torch.Tensor = None):
        # Run the encoder and request hidden states from all layers
        outputs = self.bert(input_ids, attention_mask=attention_mask, output_hidden_states=True)

        # Take the last layer's hidden states and keep the first ([CLS]) token for each sequence in the batch
        pooled_output = outputs.hidden_states[-1][:, 0, :]
        assert pooled_output.shape == (input_ids.shape[0], 768), \
            f"Expected shape ({input_ids.shape[0]}, 768), but got {pooled_output.shape}"
        # to-do later!

        # Pass the pooled output to the classifier to get the logits
        logits = self.classifier(pooled_output)

        # Compute the loss if labels are provided (CrossEntropyLoss for classification)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            pred = logits.view(-1, self.num_labels)
            observed = labels.view(-1)
            loss = loss_fct(pred, observed)

        # Return the loss and logits
        return loss, logits
# Load the Hugging Face model and tokenizer
metadata_features = 0
N_UNIQUE_CLASSES = 38

base_model = AutoModel.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t', trust_remote_code=True, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t', trust_remote_code=True)

# Initialize the classifier head
input_size = 768 + metadata_features  # featurizer output size + metadata size
log_reg = LogisticRegressionTorch(input_dim=input_size, output_dim=N_UNIQUE_CLASSES)

# Load the fine-tuned weights; the checkpoint path is read from the MODEL_PATH environment variable
model_weights_path = os.getenv('MODEL_PATH')
weights = torch.load(model_weights_path, map_location=torch.device('cpu'))

base_model.load_state_dict(weights['model_state_dict'])
log_reg.load_state_dict(weights['log_reg_state_dict'])

# Assemble the full classifier and put it in evaluation mode
model = BertClassifier(base_model, log_reg, num_labels=N_UNIQUE_CLASSES)
model.eval()
def analyze_dna(sequence):
    try:
        # Check that the sequence contains only valid nucleotide characters
        if not all(nucleotide in 'ACTGN' for nucleotide in sequence):
            raise ValueError("Sequence contains invalid characters")

        # Check that the sequence is at least 300 nucleotides long
        if len(sequence) < 300:
            raise ValueError("Sequence needs to be at least 300 nucleotides long")

        # Preprocess the input sequence
        inputs = tokenizer(sequence, truncation=True, padding='max_length', max_length=512,
                           return_tensors="pt", return_token_type_ids=False)

        # Get model predictions (no gradients needed for inference)
        with torch.no_grad():
            _, logits = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

        # Convert logits to probabilities
        probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()

        # Get the five most likely classes
        top_5_indices = sorted(range(len(probabilities)), key=lambda i: probabilities[i], reverse=True)[:5]
        top_5_probs = [probabilities[i] for i in top_5_indices]

        # Map indices to label names
        top_5_labels = [int_to_label[i] for i in top_5_indices]

        # Prepare the output as a list of (label_name, probability) tuples
        result = [(label, prob) for label, prob in zip(top_5_labels, top_5_probs)]

        # Plot a horizontal bar chart of the top-5 probabilities
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(top_5_labels, top_5_probs, color='skyblue')
        ax.set_xlabel('Probability')
        ax.set_title('Top 5 Most Likely Labels')
        ax.invert_yaxis()  # Highest probabilities at the top

        # Save the plot to a PNG image in memory and embed it as base64 HTML
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
        plt.close(fig)
        buf.seek(0)
        image_base64 = base64.b64encode(buf.read()).decode('utf-8')
        buf.close()

        return result, f'<img src="data:image/png;base64,{image_base64}" />'

    except ValueError as e:
        # Return the error message and an empty plot
        return str(e), ""
# Create the Gradio interface
demo = gr.Interface(fn=analyze_dna, inputs="text", outputs=["json", "html"])

# Launch the interface
demo.launch()