Update app.py
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 import transformers
-from transformers import AutoTokenizer,
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
 import torch
 import torch.nn as nn
 
@@ -12,11 +12,10 @@ class LogisticRegressionTorch(nn.Module):
                  output_dim: int):
 
         super(LogisticRegressionTorch, self).__init__()
-        self.batch_norm = nn.BatchNorm1d(num_features
+        self.batch_norm = nn.BatchNorm1d(num_features=input_dim)
         self.linear = nn.Linear(input_dim, output_dim)
 
     def forward(self, x):
-
         x = self.batch_norm(x)
         out = self.linear(x)
         return out
@@ -24,7 +23,7 @@ class BertClassifier(nn.Module):
 class BertClassifier(nn.Module):
 
     def __init__(self,
-                 bert_model:
+                 bert_model: AutoModel,
                  classifier: LogisticRegressionTorch,
                  num_labels: int):
 
@@ -34,13 +33,10 @@ class BertClassifier(nn.Module):
         self.num_labels = num_labels
 
     def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None,
-                token_type_ids: torch.Tensor = None, labels: torch.Tensor = None)
+                token_type_ids: torch.Tensor = None, labels: torch.Tensor = None):
         # Extract outputs from the BERT model
-        outputs = self.bert(input_ids, attention_mask=attention_mask)
-
-        # Sanity check for outputs
-        assert 'hidden_states' in outputs, "BERT model output does not contain 'hidden_states'."
-
+        outputs = self.bert(input_ids, attention_mask=attention_mask, output_hidden_states=True)
+
         # Take the hidden states from the last layer and extract the hidden state of the first token for each element in the batch
         pooled_output = outputs.hidden_states[-1][:, 0, :]
 
@@ -54,12 +50,10 @@ class BertClassifier(nn.Module):
         loss = None
 
         if labels is not None:
-
             loss_fct = nn.CrossEntropyLoss()
             pred = logits.view(-1, self.num_labels)
             observed = labels.view(-1)
             loss = loss_fct(pred, observed)
-            #assert loss_fct(float(observed), observed) < 1e-6
 
         # Return the loss and logits
         return loss, logits
@@ -71,8 +65,7 @@ class BertClassifier(nn.Module):
 metadata_features = 0
 N_UNIQUE_CLASSES = 38 ## or 38
 
-
-base_model = AutoModel.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t', trust_remote_code=True, output_hidden_states = True)
+base_model = AutoModel.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t', trust_remote_code=True, output_hidden_states=True)
 tokenizer = AutoTokenizer.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t', trust_remote_code=True)
 
 # Initialize the classifier
@@ -80,14 +73,14 @@ input_size = 768 + metadata_features # featurizer output size + metadata size
 log_reg = LogisticRegressionTorch(input_dim=input_size, output_dim=N_UNIQUE_CLASSES)
 
 # Load Weights
-model_weights_path = 'gena-blastln-bs33-lr4e-05-S168.pth'
+model_weights_path = 'model/gena-blastln-bs33-lr4e-05-S168.pth'
 weights = torch.load(model_weights_path, map_location=torch.device('cpu'))
 
 base_model.load_state_dict(weights['model_state_dict'])
 log_reg.load_state_dict(weights['log_reg_state_dict'])
 
 # Creating Model
-model = BertClassifier(base_model, log_reg, num_labels
+model = BertClassifier(base_model, log_reg, num_labels=N_UNIQUE_CLASSES)
 
 
 # Define a function to process the DNA sequence
@@ -96,10 +89,10 @@ def analyze_dna(sequence):
     inputs = tokenizer(sequence, truncation=True, padding='max_length', max_length=512, return_tensors="pt", return_token_type_ids=False)
 
     # Get model predictions
-    _,
+    _, logits = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
 
     # Convert logits to probabilities
-    probabilities = torch.nn.functional.softmax(
+    probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()
 
     # Get the top 5 most likely classes
    top_5_indices = sorted(range(len(probabilities)), key=lambda i: probabilities[i], reverse=True)[:5]
@@ -108,7 +101,7 @@ def analyze_dna(sequence):
     # Prepare the output as a list of tuples (class_index, probability)
     result = [(index, prob) for index, prob in zip(top_5_indices, top_5_probs)]
 
-    return
+    return result
 
 # Create a Gradio interface
 demo = gr.Interface(fn=analyze_dna, inputs="text", outputs="json")
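For reference, a minimal usage sketch of the updated script. The `model.eval()` call, the example sequence, and the `demo.launch()` guard are assumptions added for illustration only; they are not part of the hunks shown above.

# Hypothetical usage sketch (not part of the commit):
model.eval()                                      # keep BatchNorm1d in inference mode for single-sample inputs
with torch.no_grad():
    example_sequence = "ATGGCGTACGTTAGCCTAAGGC"   # made-up DNA string, purely illustrative
    print(analyze_dna(example_sequence))          # -> [(class_index, probability), ...] for the top 5 classes

if __name__ == "__main__":
    demo.launch()                                 # typical way a Gradio Space serves the interface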