import os
import io
import base64

import gradio as gr
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel

# label_to_int is assumed to be a dictionary mapping {label_name: label_index}
label_to_int = pd.read_pickle('label_to_int.pkl')
int_to_label = {v: k for k, v in label_to_int.items()}
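
# A purely hypothetical illustration of the expected pickle contents (the real
# file ships with the Space; its 38 labels are not reproduced here):
#   label_to_int = {'Escherichia_coli': 0, 'Homo_sapiens': 1, ...}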

class LogisticRegressionTorch(nn.Module):
    def __init__(self, input_dim: int, output_dim: int):
        super().__init__()
        # Normalize the pooled BERT features before the linear projection
        self.batch_norm = nn.BatchNorm1d(num_features=input_dim)
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        x = self.batch_norm(x)
        out = self.linear(x)
        return out

class BertClassifier(nn.Module):
    def __init__(self, bert_model: AutoModel, classifier: LogisticRegressionTorch, num_labels: int):
        super().__init__()
        self.bert = bert_model  # a pre-trained GENA-LM BERT model
        self.classifier = classifier
        self.num_labels = num_labels

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None,
                token_type_ids: torch.Tensor = None, labels: torch.Tensor = None):
        # Run the BERT model and keep all hidden states
        outputs = self.bert(input_ids, attention_mask=attention_mask, output_hidden_states=True)

        # From the last layer, take the hidden state of the first ([CLS]) token of each sequence
        pooled_output = outputs.hidden_states[-1][:, 0, :]
        assert pooled_output.shape == (input_ids.shape[0], 768), \
            f"Expected shape ({input_ids.shape[0]}, 768), but got {pooled_output.shape}"
        # to-do later!

        # Pass the pooled output to the classifier to get the logits
        logits = self.classifier(pooled_output)

        # Compute the loss if labels are provided (CrossEntropyLoss for classification)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            pred = logits.view(-1, self.num_labels)
            observed = labels.view(-1)
            loss = loss_fct(pred, observed)

        # Return the loss and logits
        return loss, logits
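
# Shape summary for BertClassifier.forward (hidden size 768, per the assert above):
#   input_ids      (batch, seq_len)
#   pooled_output  (batch, 768)         -- [CLS] hidden state from the last layer
#   logits         (batch, num_labels)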

# Load the Hugging Face model and tokenizer
metadata_features = 0
N_UNIQUE_CLASSES = 38
base_model = AutoModel.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t', trust_remote_code=True, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t', trust_remote_code=True)

# Initialize the classifier
input_size = 768 + metadata_features  # featurizer output size + metadata size
log_reg = LogisticRegressionTorch(input_dim=input_size, output_dim=N_UNIQUE_CLASSES)

# Load the fine-tuned weights from the path given by the MODEL_PATH environment variable
model_weights_path = os.getenv('MODEL_PATH')
weights = torch.load(model_weights_path, map_location=torch.device('cpu'))
base_model.load_state_dict(weights['model_state_dict'])
log_reg.load_state_dict(weights['log_reg_state_dict'])

# Assemble the full classifier and switch to inference mode
model = BertClassifier(base_model, log_reg, num_labels=N_UNIQUE_CLASSES)
model.eval()
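
# eval() matters here: BatchNorm1d switches to its stored running statistics,
# which also makes single-sequence (batch size 1) inference possible. A quick,
# purely hypothetical smoke test:
#   ids = torch.randint(0, tokenizer.vocab_size, (2, 512))
#   _, logits = model(input_ids=ids, attention_mask=torch.ones_like(ids))
#   logits.shape  # -> torch.Size([2, 38])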

def analyze_dna(sequence):
    try:
        # Check that the sequence contains only valid nucleotide characters
        if not all(nucleotide in 'ACTGN' for nucleotide in sequence):
            raise ValueError("Sequence contains invalid characters")

        # Check that the sequence is at least 300 nucleotides long
        if len(sequence) < 300:
            raise ValueError("Sequence needs to be at least 300 nucleotides long")

        # Tokenize the input sequence
        inputs = tokenizer(sequence, truncation=True, padding='max_length', max_length=512, return_tensors="pt", return_token_type_ids=False)

        # Get model predictions without tracking gradients
        with torch.no_grad():
            _, logits = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

        # Convert logits to probabilities
        probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()

        # Get the top 5 most likely classes
        top_5_indices = sorted(range(len(probabilities)), key=lambda i: probabilities[i], reverse=True)[:5]
        top_5_probs = [probabilities[i] for i in top_5_indices]
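
        # Note: torch.topk would give the same top 5 directly on the tensor, e.g.:
        #   probs = torch.nn.functional.softmax(logits, dim=-1).squeeze()
        #   top_probs, top_indices = torch.topk(probs, k=5)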

        # Map indices to label names
        top_5_labels = [int_to_label[i] for i in top_5_indices]

        # Prepare the output as a list of (label_name, probability) tuples
        result = list(zip(top_5_labels, top_5_probs))

        # Plot a horizontal bar chart of the top 5 probabilities
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(top_5_labels, top_5_probs, color='skyblue')
        ax.set_xlabel('Probability')
        ax.set_title('Top 5 Most Likely Labels')
        ax.invert_yaxis()  # Highest probabilities at the top

        # Save the plot to an in-memory PNG, then close the figure to avoid leaking it
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
        plt.close(fig)
        buf.seek(0)
        image_base64 = base64.b64encode(buf.read()).decode('utf-8')
        buf.close()

        return result, f'<img src="data:image/png;base64,{image_base64}" />'
    except ValueError as e:
        # Return the error message and an empty plot
        return str(e), ""

# Create a Gradio interface
demo = gr.Interface(fn=analyze_dna, inputs="text", outputs=["json", "html"])
# Launch the interface
demo.launch()
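
# For local testing outside of Spaces, one could instead launch with, e.g.:
#   demo.launch(server_name="0.0.0.0", server_port=7860)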