Spaces:

mawairon
/

NOOTestspace

Sleeping

App Files Files Community

mawairon committed on Aug 2, 2024

Commit

ace289f

•

1 Parent(s): 1bb2663

Update app.py

Browse files

Files changed (1) hide show

app.py +104 -73

app.py CHANGED Viewed

@@ -10,6 +10,10 @@ import base64
 import os
 import huggingface_hub
 from huggingface_hub import hf_hub_download, login
 # Load label mapping
 label_to_int = pd.read_pickle('label_to_int.pkl')
@@ -24,63 +28,67 @@ for k, v in int_to_label.items():
     elif "RUSSIAN" in v:
         int_to_label[k] = "RUSSIA"
-class LogisticRegressionTorch(nn.Module):
-    def __init__(self, input_dim: int, output_dim: int):
-        super(LogisticRegressionTorch, self).__init__()
-        self.batch_norm = nn.BatchNorm1d(num_features=input_dim)
-        self.linear = nn.Linear(input_dim, output_dim)
-    def forward(self, x):
-        x = self.batch_norm(x)
-        out = self.linear(x)
-        return out
-class BertClassifier(nn.Module):
-    def __init__(self, bert_model: AutoModel, classifier: LogisticRegressionTorch, num_labels: int):
-        super(BertClassifier, self).__init__()
-        self.bert = bert_model
-        self.classifier = classifier
-        self.num_labels = num_labels
-    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None):
-        outputs = self.bert(input_ids, attention_mask=attention_mask, output_hidden_states=True)
-        pooled_output = outputs.hidden_states[-1][:, 0, :]
-        logits = self.classifier(pooled_output)
-        return logits
-def load_model():
-    metadata_features = 0
-    N_UNIQUE_CLASSES = 38
-    base_model = AutoModel.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t', trust_remote_code=True, output_hidden_states=True)
-    tokenizer = AutoTokenizer.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t', trust_remote_code=True)
-    input_size = 768 + metadata_features
-    log_reg = LogisticRegressionTorch(input_dim=input_size, output_dim=N_UNIQUE_CLASSES)
-    token = os.getenv('HUGGINGFACE_TOKEN')
-    if token is None:
-        raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")
-    login(token=token)
-    file_path = hf_hub_download(
-        repo_id="mawairon/noo_test",
-        filename="gena-blastln-bs33-lr4e-05-S168.pth",
-        use_auth_token=token
-    )
-    weights = torch.load(file_path, map_location=torch.device('cpu'))
-    base_model.load_state_dict(weights['model_state_dict'])
-    log_reg.load_state_dict(weights['log_reg_state_dict'])
-    model = BertClassifier(base_model, log_reg, num_labels=N_UNIQUE_CLASSES)
-    model.eval()
-    return model, tokenizer
-model, tokenizer = load_model()
-def analyze_dna(username, password, sequence):
     valid_usernames = os.getenv('USERNAME').split(',')
     env_password = os.getenv('PASSWORD')
@@ -89,6 +97,7 @@ def analyze_dna(username, password, sequence):
         return {"error": "Invalid username or password"}, ""
     try:
         # Remove all whitespace characters
         sequence = sequence.replace(" ", "").replace("\n", "").replace("\t", "").replace("\r", "")
@@ -98,25 +107,43 @@ def analyze_dna(username, password, sequence):
         if len(sequence) < 300:
             return {"error": "Sequence needs to be at least 300 nucleotides long"}, ""
-        def get_logits(seq):
-            inputs = tokenizer(seq, truncation=True, padding='max_length', max_length=512, return_tensors="pt", return_token_type_ids=False)
-            with torch.no_grad():
-                logits = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
-            return logits
-        if len(sequence) > 3000:
-            num_shifts = len(sequence) // 1000
-            logits_sum = None
-            for i in range(num_shifts):
-                shifted_sequence = sequence[i*1000:] + sequence[:i*1000]
-                logits = get_logits(shifted_sequence)
-                if logits_sum is None:
-                    logits_sum = logits
-                else:
-                    logits_sum += logits
-            logits_avg = logits_sum / num_shifts
-        else:
-            logits_avg = get_logits(sequence)
         probabilities = torch.nn.functional.softmax(logits_avg, dim=-1).squeeze().tolist()
         top_5_indices = sorted(range(len(probabilities)), key=lambda i: probabilities[i], reverse=True)[:5]
@@ -147,9 +174,13 @@ demo = gr.Interface(
     inputs=[
         gr.Textbox(label="Username"),
         gr.Textbox(label="Password", type="password"),
-        gr.Textbox(label="DNA Sequence")
     ],
-    outputs=["json", "html"]
 )
 # Launch the interface

 import os
 import huggingface_hub
 from huggingface_hub import hf_hub_download, login
+import model_archs
+from model_archs import BertClassifier, LogisticRegressionTorch, SimpleCNN, MLP, Pool2BN
+import tangermeme
+from tangermeme import one_hot_encode
 # Load label mapping
 label_to_int = pd.read_pickle('label_to_int.pkl')
     elif "RUSSIAN" in v:
         int_to_label[k] = "RUSSIA"
+def load_model(model_name: str):
+    metadata_features = 0
+    N_UNIQUE_CLASSES = 38
+    if model_name == 'gena-bert':
+        base_model = AutoModel.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t', trust_remote_code=True, output_hidden_states=True)
+        tokenizer = AutoTokenizer.from_pretrained('AIRI-Institute/gena-lm-bert-base-lastln-t2t', trust_remote_code=True)
+        input_size = 768 + metadata_features
+        log_reg = LogisticRegressionTorch(input_dim=input_size, output_dim=N_UNIQUE_CLASSES)
+        token = os.getenv('HUGGINGFACE_TOKEN')
+        if token is None:
+            raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")
+        login(token=token)
+        file_path = hf_hub_download(
+            repo_id="mawairon/noo_test",
+            filename="gena-blastln-bs33-lr4e-05-S168.pth",
+            use_auth_token=token
+        )
+        weights = torch.load(file_path, map_location=torch.device('cpu'))
+        base_model.load_state_dict(weights['model_state_dict'])
+        log_reg.load_state_dict(weights['log_reg_state_dict'])
+        model = BertClassifier(base_model, log_reg, num_labels=N_UNIQUE_CLASSES)
+        model.eval()
+        return model, tokenizer
+    elif model_name == 'CNN':
+        hidden_dim = 2048
+        width = 2048
+        seq_drop_prob = 0.05
+        train_sequence_length = 8000
+        weight_decay = 0.0001
+        num_labs = len(set(y_train))
+        model_seq = SimpleCNN(18, hidden_dim, additional_layer=False)
+        new_head = torch.nn.Sequential(
+                torch.nn.Dropout(0.5),
+                MLP([hidden_dim*2 , num_labs])
+            )
+        model = torch.nn.Sequential(
+            model_seq,
+            new_head
+        )
+        return model, None
+    else:
+        return {"error": "Invalid model name"}
+def analyze_dna(username, password, sequence, model_name):
     valid_usernames = os.getenv('USERNAME').split(',')
     env_password = os.getenv('PASSWORD')
         return {"error": "Invalid username or password"}, ""
     try:
         # Remove all whitespace characters
         sequence = sequence.replace(" ", "").replace("\n", "").replace("\t", "").replace("\r", "")
         if len(sequence) < 300:
             return {"error": "Sequence needs to be at least 300 nucleotides long"}, ""
+        model, tokenizer = load_model(model_name)
+        def get_logits(seq, model_name):
+            if model_name == 'gena-bert':
+                inputs = tokenizer(seq, truncation=True, padding='max_length', max_length=512, return_tensors="pt", return_token_type_ids=False)
+                with torch.no_grad():
+                    logits = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
+                return logits
+            elif model_name == 'CNN':
+                # Truncate sequence
+                SEQUENCE_LENGTH = 8000
+                seq = seq[:SEQUENCE_LENGTH]
+                # Pad sequences to the desired length
+                seq = seq.ljust(length, pad_char)[:SEQUENCE_LENGTH]
+                # Apply one-hot encoding to the 'sequence' column
+                input = seq.one_hot_encode()
+                with torch.no_grad():
+                    logits = model(input)
+                return logits
+        # if (len(sequence) > 3000 and model_name == 'gena-bert') or (len(sequence) > 10000 and model_name == 'CNN'):
+        #     num_shifts = len(sequence) // 1000
+        #     logits_sum = None
+        #     for i in range(num_shifts):
+        #         shifted_sequence = sequence[i*1000:] + sequence[:i*1000]
+        #         logits = get_logits(shifted_sequence)
+        #         if logits_sum is None:
+        #             logits_sum = logits
+        #         else:
+        #             logits_sum += logits
+        #     logits_avg = logits_sum / num_shifts
+        # else:
+        logits_avg = get_logits(sequence)
         probabilities = torch.nn.functional.softmax(logits_avg, dim=-1).squeeze().tolist()
         top_5_indices = sorted(range(len(probabilities)), key=lambda i: probabilities[i], reverse=True)[:5]
     inputs=[
         gr.Textbox(label="Username"),
         gr.Textbox(label="Password", type="password"),
+        gr.Textbox(label="DNA Sequence"),
+        gr.Dropdown(label="Model", choices=[
+            "gena-bert",
+            "CNN"
+        ])
     ],
+    outputs=["json", "HTML"]
 )
 # Launch the interface