mawairon committed
Commit 356d0ee
1 parent: ba0faf4

Update app.py

Replace the assert-based input checks in analyze_dna with explicit validation inside a try/except block, so invalid input returns the error message (and an empty HTML output) instead of raising, and remove the debug print statements.
Files changed (1): app.py (+48, -44)
app.py CHANGED
@@ -61,7 +61,7 @@ class BertClassifier(nn.Module):
 # Load the Hugging Face model and tokenizer
 
 metadata_features = 0
-N_UNIQUE_CLASSES = 38 # or 38
+N_UNIQUE_CLASSES = 38
 
 base_model = AutoModel.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t', trust_remote_code=True, output_hidden_states=True)
 tokenizer = AutoTokenizer.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t', trust_remote_code=True)
@@ -84,51 +84,55 @@ log_reg.load_state_dict(weights['log_reg_state_dict'])
 model = BertClassifier(base_model, log_reg, num_labels=N_UNIQUE_CLASSES)
 model.eval()
 
-# Define a function to process the DNA sequence
 def analyze_dna(sequence):
-    assert all(nucleotide in 'ACTGN' for nucleotide in sequence), "Sequence contains invalid characters"
-    assert len(sequence) >= 300, "Sequence needs to be at least 300 nucleotides long"
-
-    # Preprocess the input sequence
-    inputs = tokenizer(sequence, truncation=True, padding='max_length', max_length=512, return_tensors="pt", return_token_type_ids=False)
-
-    print("Tokenization done.")
-    # Get model predictions
-    _, logits = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
-
-    print("Forward pass done.")
-
-    # Convert logits to probabilities
-    probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()
-
-    print("Probabilities done.")
-    # Get the top 5 most likely classes
-    top_5_indices = sorted(range(len(probabilities)), key=lambda i: probabilities[i], reverse=True)[:5]
-    top_5_probs = [probabilities[i] for i in top_5_indices]
-
-    # Map indices to label names
-    top_5_labels = [int_to_label[i] for i in top_5_indices]
-
-    # Prepare the output as a list of tuples (label_name, probability)
-    result = [(label, prob) for label, prob in zip(top_5_labels, top_5_probs)]
-
-    # Plot histogram
-
-    fig, ax = plt.subplots(figsize=(10, 6))
-    ax.barh(top_5_labels, top_5_probs, color='skyblue')
-    ax.set_xlabel('Probability')
-    ax.set_title('Top 5 Most Likely Labels')
-    plt.gca().invert_yaxis() # Highest probabilities at the top
-
-    # Save plot to a PNG image in memory
-    buf = io.BytesIO()
-    plt.savefig(buf, format='png')
-    buf.seek(0)
-    image_base64 = base64.b64encode(buf.read()).decode('utf-8')
-    buf.close()
-
-    return result, f'<img src="data:image/png;base64,{image_base64}" />'
+    try:
+        # Check if the sequence contains only valid characters
+        if not all(nucleotide in 'ACTGN' for nucleotide in sequence):
+            raise ValueError("Sequence contains invalid characters")
+
+        # Check if the sequence is at least 300 nucleotides long
+        if len(sequence) < 300:
+            raise ValueError("Sequence needs to be at least 300 nucleotides long")
+
+        # Preprocess the input sequence
+        inputs = tokenizer(sequence, truncation=True, padding='max_length', max_length=512, return_tensors="pt", return_token_type_ids=False)
+
+        # Get model predictions
+        _, logits = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
+
+        # Convert logits to probabilities
+        probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()
+
+        # Get the top 5 most likely classes
+        top_5_indices = sorted(range(len(probabilities)), key=lambda i: probabilities[i], reverse=True)[:5]
+        top_5_probs = [probabilities[i] for i in top_5_indices]
+
+        # Map indices to label names
+        top_5_labels = [int_to_label[i] for i in top_5_indices]
+
+        # Prepare the output as a list of tuples (label_name, probability)
+        result = [(label, prob) for label, prob in zip(top_5_labels, top_5_probs)]
+
+        # Plot histogram
+        fig, ax = plt.subplots(figsize=(10, 6))
+        ax.barh(top_5_labels, top_5_probs, color='skyblue')
+        ax.set_xlabel('Probability')
+        ax.set_title('Top 5 Most Likely Labels')
+        plt.gca().invert_yaxis() # Highest probabilities at the top
+
+        # Save plot to a PNG image in memory
+        buf = io.BytesIO()
+        plt.savefig(buf, format='png')
+        buf.seek(0)
+        image_base64 = base64.b64encode(buf.read()).decode('utf-8')
+        buf.close()
+
+        return result, f'<img src="data:image/png;base64,{image_base64}" />'
+
+    except ValueError as e:
+        # Return the error message
+        return str(e), ""
+
 
 # Create a Gradio interface
 demo = gr.Interface(fn=analyze_dna, inputs="text", outputs=["json", "html"])
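
For reference, a minimal usage sketch of the revised analyze_dna, assuming the globals defined earlier in app.py (tokenizer, model, int_to_label) are loaded; the input sequences and printed values below are illustrative, not outputs from the real model:

# Invalid input no longer raises: the ValueError is caught and its message is
# returned together with an empty HTML string, so both Gradio outputs stay valid.
msg, html = analyze_dna("ACGTN")        # only 5 nucleotides
print(msg)    # "Sequence needs to be at least 300 nucleotides long"
print(html)   # ""

# A valid sequence (hypothetical 300-nucleotide repeat, for illustration only)
# runs the full pipeline and returns the top-5 (label, probability) pairs plus
# an <img> tag embedding the base64-encoded bar chart.
predictions, chart_html = analyze_dna("ACGT" * 75)
print(predictions[0])   # e.g. ("label_name", 0.42) -- depends on the trained model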