import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
import pandas as pd
import matplotlib
matplotlib.use('Agg')  # non-interactive backend: figures are rendered off-screen in the server process
import matplotlib.pyplot as plt
import io
import base64
import os

# label_to_int is a pickled dictionary mapping {label_name: label_index}
label_to_int = pd.read_pickle('label_to_int.pkl')
int_to_label = {v: k for k, v in label_to_int.items()}


class LogisticRegressionTorch(nn.Module):
    """Batch normalization followed by a single linear layer."""

    def __init__(self, input_dim: int, output_dim: int):
        super().__init__()
        self.batch_norm = nn.BatchNorm1d(num_features=input_dim)
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        x = self.batch_norm(x)
        out = self.linear(x)
        return out


class BertClassifier(nn.Module):
    """Wraps a pre-trained BERT encoder and a classification head."""

    def __init__(self, bert_model: AutoModel, classifier: LogisticRegressionTorch, num_labels: int):
        super().__init__()
        self.bert = bert_model  # an instance of a pre-trained BertModel
        self.classifier = classifier
        self.num_labels = num_labels

    def forward(self,
                input_ids: torch.Tensor,
                attention_mask: torch.Tensor = None,
                token_type_ids: torch.Tensor = None,
                labels: torch.Tensor = None):
        # Run the encoder and keep all hidden states
        outputs = self.bert(input_ids, attention_mask=attention_mask, output_hidden_states=True)

        # Take the last layer's hidden state of the first ([CLS]) token for each element in the batch
        pooled_output = outputs.hidden_states[-1][:, 0, :]
        assert pooled_output.shape == (input_ids.shape[0], 768), \
            f"Expected shape ({input_ids.shape[0]}, 768), but got {pooled_output.shape}"  # TODO: revisit later

        # Pass the pooled output to the classifier to get the logits
        logits = self.classifier(pooled_output)

        # Compute cross-entropy loss if labels are provided
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            pred = logits.view(-1, self.num_labels)
            observed = labels.view(-1)
            loss = loss_fct(pred, observed)

        return loss, logits


# Load the Hugging Face model and tokenizer
metadata_features = 0
N_UNIQUE_CLASSES = 38

base_model = AutoModel.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t',
                                       trust_remote_code=True,
                                       output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t',
                                          trust_remote_code=True)

# Initialize the classifier head
input_size = 768 + metadata_features  # featurizer output size + metadata size
log_reg = LogisticRegressionTorch(input_dim=input_size, output_dim=N_UNIQUE_CLASSES)

# Load fine-tuned weights; the checkpoint path comes from the MODEL_PATH environment variable
model_weights_path = os.getenv('MODEL_PATH')
if model_weights_path is None:
    raise RuntimeError("MODEL_PATH environment variable is not set")
weights = torch.load(model_weights_path, map_location=torch.device('cpu'))
base_model.load_state_dict(weights['model_state_dict'])
log_reg.load_state_dict(weights['log_reg_state_dict'])

# Assemble the full model and switch to inference mode
model = BertClassifier(base_model, log_reg, num_labels=N_UNIQUE_CLASSES)
model.eval()
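# Optional smoke test (a sketch, not part of the app): the 400-nt sequence below is
# made up purely to check that the tokenizer and model wire together and that the
# logits have the expected shape. Uncomment to run once at startup.
#
# sample = "ACGT" * 100
# enc = tokenizer(sample, truncation=True, padding='max_length', max_length=512,
#                 return_tensors="pt", return_token_type_ids=False)
# with torch.no_grad():
#     _, sample_logits = model(input_ids=enc['input_ids'], attention_mask=enc['attention_mask'])
# assert sample_logits.shape == (1, N_UNIQUE_CLASSES)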
def analyze_dna(sequence):
    try:
        # Check that the sequence contains only valid characters
        if not all(nucleotide in 'ACTGN' for nucleotide in sequence):
            raise ValueError("Sequence contains invalid characters")

        # Check that the sequence is at least 300 nucleotides long
        if len(sequence) < 300:
            raise ValueError("Sequence needs to be at least 300 nucleotides long")

        # Preprocess the input sequence
        inputs = tokenizer(sequence, truncation=True, padding='max_length', max_length=512,
                           return_tensors="pt", return_token_type_ids=False)

        # Get model predictions (no gradients needed at inference time)
        with torch.no_grad():
            _, logits = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

        # Convert logits to probabilities
        probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()

        # Get the top 5 most likely classes
        top_5_indices = sorted(range(len(probabilities)), key=lambda i: probabilities[i], reverse=True)[:5]
        top_5_probs = [probabilities[i] for i in top_5_indices]

        # Map indices to label names
        top_5_labels = [int_to_label[i] for i in top_5_indices]

        # Prepare the output as a list of (label_name, probability) tuples
        result = list(zip(top_5_labels, top_5_probs))

        # Plot a horizontal bar chart of the top-5 probabilities
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(top_5_labels, top_5_probs, color='skyblue')
        ax.set_xlabel('Probability')
        ax.set_title('Top 5 Most Likely Labels')
        ax.invert_yaxis()  # highest probabilities at the top

        # Save the plot to a PNG image in memory and embed it as base64 HTML
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
        plt.close(fig)  # release the figure so it doesn't leak between requests
        buf.seek(0)
        image_base64 = base64.b64encode(buf.read()).decode('utf-8')
        buf.close()

        return result, f'<img src="data:image/png;base64,{image_base64}" />'

    except ValueError as e:
        # Return the error message in place of the predictions
        return str(e), ""


# Create a Gradio interface: plain-text input, JSON predictions plus the HTML chart
demo = gr.Interface(fn=analyze_dna, inputs="text", outputs=["json", "html"])

# Launch the interface
demo.launch()
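# To query the running app programmatically, the gradio_client package can be used.
# A sketch, assuming the default local launch address and the "ACGT" * 100 dummy
# sequence from the smoke test above:
#
# from gradio_client import Client
# client = Client("http://127.0.0.1:7860")
# predictions, chart_html = client.predict("ACGT" * 100, api_name="/predict")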