mawairon committed
Commit 9bf3f2b
1 Parent(s): a91155d

Update app.py

Files changed (1)
  1. app.py +25 -65
app.py CHANGED
@@ -7,13 +7,13 @@ import pandas as pd
 import matplotlib.pyplot as plt
 import io
 import base64
+import os
 
-# Assuming label_to_int is a dictionary with {label_name: label_index}
+# Load label mapping
 label_to_int = pd.read_pickle('label_to_int.pkl')
 int_to_label = {v: k for k, v in label_to_int.items()}
 
 class LogisticRegressionTorch(nn.Module):
-
     def __init__(self, input_dim: int, output_dim: int):
         super(LogisticRegressionTorch, self).__init__()
         self.batch_norm = nn.BatchNorm1d(num_features=input_dim)
@@ -25,102 +25,64 @@ class LogisticRegressionTorch(nn.Module):
         return out
 
 class BertClassifier(nn.Module):
-
     def __init__(self, bert_model: AutoModel, classifier: LogisticRegressionTorch, num_labels: int):
         super(BertClassifier, self).__init__()
-        self.bert = bert_model # Assume bert_model is an instance of a pre-trained BertModel
+        self.bert = bert_model
         self.classifier = classifier
         self.num_labels = num_labels
 
-    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None,
-                token_type_ids: torch.Tensor = None, labels: torch.Tensor = None):
-        # Extract outputs from the BERT model
+    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None):
         outputs = self.bert(input_ids, attention_mask=attention_mask, output_hidden_states=True)
-
-        # Take the hidden states from the last layer and extract the hidden state of the first token for each element in the batch
         pooled_output = outputs.hidden_states[-1][:, 0, :]
-
-        assert pooled_output.shape == (input_ids.shape[0], 768), f"Expected shape ({input_ids.shape[0]}, 768), but got {pooled_output.shape}"
-        # to-do later!
-
-        # Pass the pooled output to the classifier to get the logits
         logits = self.classifier(pooled_output)
+        return logits
 
-        # Compute loss if labels are provided (assuming using CrossEntropyLoss for classification)
-        loss = None
-
-        if labels is not None:
-            loss_fct = nn.CrossEntropyLoss()
-            pred = logits.view(-1, self.num_labels)
-            observed = labels.view(-1)
-            loss = loss_fct(pred, observed)
-
-        # Return the loss and logits
-        return loss, logits
-
-# Load the Hugging Face model and tokenizer
+def load_model():
+    metadata_features = 0
+    N_UNIQUE_CLASSES = 38
 
-metadata_features = 0
-N_UNIQUE_CLASSES = 38
+    base_model = AutoModel.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t', trust_remote_code=True, output_hidden_states=True)
+    tokenizer = AutoTokenizer.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t', trust_remote_code=True)
 
-base_model = AutoModel.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t', trust_remote_code=True, output_hidden_states=True)
-tokenizer = AutoTokenizer.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t', trust_remote_code=True)
+    input_size = 768 + metadata_features
+    log_reg = LogisticRegressionTorch(input_dim=input_size, output_dim=N_UNIQUE_CLASSES)
 
-# Initialize the classifier
-input_size = 768 + metadata_features # featurizer output size + metadata size
-log_reg = LogisticRegressionTorch(input_dim=input_size, output_dim=N_UNIQUE_CLASSES)
+    model_weights_path = os.getenv('MODEL_PATH')
+    weights = torch.load(model_weights_path, map_location=torch.device('cpu'))
 
-# Load Weights
-import os
+    base_model.load_state_dict(weights['model_state_dict'])
+    log_reg.load_state_dict(weights['log_reg_state_dict'])
 
-# Get the model path from the environment variable
-model_weights_path = os.getenv('MODEL_PATH')
-weights = torch.load(model_weights_path, map_location=torch.device('cpu'))
-
-base_model.load_state_dict(weights['model_state_dict'])
-log_reg.load_state_dict(weights['log_reg_state_dict'])
-
-# Creating Model
-model = BertClassifier(base_model, log_reg, num_labels=N_UNIQUE_CLASSES)
-model.eval()
+    model = BertClassifier(base_model, log_reg, num_labels=N_UNIQUE_CLASSES)
+    model.eval()
+
+    return model, tokenizer
+
+model, tokenizer = load_model()
 
 def analyze_dna(sequence):
     try:
-        # Check if the sequence contains only valid characters
         if not all(nucleotide in 'ACTGN' for nucleotide in sequence):
-            return "Error: Sequence contains invalid characters"
+            return "Error: Sequence contains invalid characters", ""
 
-        # Check if the sequence is at least 300 nucleotides long
         if len(sequence) < 300:
-            return "Error: Sequence needs to be at least 300 nucleotides long"
+            return "Error: Sequence needs to be at least 300 nucleotides long", ""
 
-        # Preprocess the input sequence
         inputs = tokenizer(sequence, truncation=True, padding='max_length', max_length=512, return_tensors="pt", return_token_type_ids=False)
-
-        # Get model predictions
-        _, logits = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
+        logits = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
 
-        # Convert logits to probabilities
         probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()
-
-        # Get the top 5 most likely classes
         top_5_indices = sorted(range(len(probabilities)), key=lambda i: probabilities[i], reverse=True)[:5]
         top_5_probs = [probabilities[i] for i in top_5_indices]
-
-        # Map indices to label names
        top_5_labels = [int_to_label[i] for i in top_5_indices]
-
-        # Prepare the output as a list of tuples (label_name, probability)
         result = [(label, prob) for label, prob in zip(top_5_labels, top_5_probs)]
 
-        # Plot histogram
         fig, ax = plt.subplots(figsize=(10, 6))
         ax.barh(top_5_labels, top_5_probs, color='skyblue')
         ax.set_xlabel('Probability')
         ax.set_title('Top 5 Most Likely Labels')
-        plt.gca().invert_yaxis() # Highest probabilities at the top
+        plt.gca().invert_yaxis()
 
-        # Save plot to a PNG image in memory
         buf = io.BytesIO()
         plt.savefig(buf, format='png')
         buf.seek(0)
@@ -129,10 +91,8 @@ def analyze_dna(sequence):
 
         return result, f'<img src="data:image/png;base64,{image_base64}" />'
 
-    except ValueError as e:
-        # Return the error message
+    except Exception as e:
         return str(e), ""
-
 
 # Create a Gradio interface
 demo = gr.Interface(fn=analyze_dna, inputs="text", outputs=["json", "html"])
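
Note on the checkpoint format: load_model() reads the file at MODEL_PATH with torch.load() and expects a dict with 'model_state_dict' (the GENA-LM backbone) and 'log_reg_state_dict' (the classifier head). The training code is not part of this Space, so the snippet below is only a hypothetical sketch of how a compatible checkpoint could be written; the variable names and the 'model_weights.pt' path are illustrative.

# Hypothetical sketch -- not part of this commit. Assumes trained modules
# `base_model` (the GENA-LM backbone) and `log_reg` (a LogisticRegressionTorch
# instance) are in scope, e.g. at the end of a training script.
import torch

torch.save(
    {
        'model_state_dict': base_model.state_dict(),    # key read by load_model()
        'log_reg_state_dict': log_reg.state_dict(),     # key read by load_model()
    },
    'model_weights.pt',  # illustrative filename; point MODEL_PATH at this file
)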
 
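To exercise the refactored pipeline outside the Gradio UI, a smoke test along these lines should work, assuming MODEL_PATH points at a compatible checkpoint before app is imported (importing app runs load_model() once at module level and fetches the GENA-LM weights on first use):

# Hypothetical smoke test -- not part of this commit.
import os
os.environ['MODEL_PATH'] = 'model_weights.pt'  # illustrative path

from app import analyze_dna  # import triggers load_model()

result, histogram_html = analyze_dna('ACGT' * 100)  # 400 nt, alphabet ACTGN
print(result)                              # top-5 (label, probability) pairs
assert histogram_html.startswith('<img')   # inline base64 PNG on success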