Spaces:

abondrn
/

SVM

Sleeping

App Files Files Community

abondrn committed on Jun 4, 2023

Commit

4d84fae

1 Parent(s): bc4ccb5

Added msa.py utils

Browse files

Files changed (2) hide show

app.py +6 -1
msa.py +62 -0

app.py CHANGED Viewed

@@ -64,12 +64,17 @@ def msa_embed(msa):
     msa_transformer_batch_labels, msa_transformer_batch_strs, msa_transformer_batch_tokens = msa_transformer_batch_converter([inputs])
     msa_transformer_batch_tokens = msa_transformer_batch_tokens.to(next(msa_transformer.parameters()).device)
-    temp = msa_transformer(msa_transformer_batch_tokens,repr_layers=[12])['representations']
     temp = temp[12][:,:,0,:]
     temp = torch.mean(temp,(0,1))
     return temp
 def download_data_if_required():
     url_base = f"https://zenodo.org/record/{pg.zenodo_record}/files"
     fps = [pg.trained_model_fp]

     msa_transformer_batch_labels, msa_transformer_batch_strs, msa_transformer_batch_tokens = msa_transformer_batch_converter([inputs])
     msa_transformer_batch_tokens = msa_transformer_batch_tokens.to(next(msa_transformer.parameters()).device)
+    with torch.no_grad():
+        temp = msa_transformer(msa_transformer_batch_tokens,repr_layers=[12])['representations']
     temp = temp[12][:,:,0,:]
     temp = torch.mean(temp,(0,1))
     return temp
def go_embed(terms):
    """Placeholder for an embedding routine over ``terms``; not implemented.

    The function currently performs no work and returns ``None`` regardless
    of its argument.
    NOTE(review): ``terms`` presumably holds Gene Ontology terms -- confirm
    with the author before implementing.
    """
    return None
 def download_data_if_required():
     url_base = f"https://zenodo.org/record/{pg.zenodo_record}/files"
     fps = [pg.trained_model_fp]

msa.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import glob
+import itertools
+from pathlib import Path
+from typing import List, Tuple, Optional, Dict, NamedTuple, Union, Callable
+import string
+import numpy as np
+import torch
+from scipy.spatial.distance import squareform, pdist, cdist
+from Bio import SeqIO
+#import biotite.structure as bs
+#from biotite.structure.io.pdbx import PDBxFile, get_structure
+#from biotite.database import rcsb
+from tqdm import tqdm
+import pandas as pd
# Translation table for str.translate(): every lowercase ASCII letter, plus
# the "." and "*" characters, maps to None so that a single translate() call
# deletes them from a string.
deletekeys = {char: None for char in string.ascii_lowercase + ".*"}
translation = str.maketrans(deletekeys)
def read_sequence(filename: str) -> Tuple[str, str]:
    """Read the first (reference) sequence from a FASTA or MSA file.

    Args:
        filename: Location of the FASTA-formatted file, passed straight to
            ``SeqIO.parse``.

    Returns:
        A ``(description, sequence)`` tuple built from the first record.

    Raises:
        ValueError: If the file yields no FASTA records.
    """
    # Give next() a default so an empty file does not escape as a bare
    # StopIteration, which is easy to mistake for normal iterator exhaustion.
    record = next(SeqIO.parse(filename, "fasta"), None)
    if record is None:
        raise ValueError(f"No FASTA records found in {filename!r}")
    return record.description, str(record.seq)
def remove_insertions(sequence: str) -> str:
    """Return ``sequence`` with insertion characters deleted.

    The module-level ``translation`` table is applied via ``str.translate``;
    this is needed when loading aligned sequences from an MSA.
    """
    stripped = sequence.translate(translation)
    return stripped
def read_msa(filename: str) -> List[Tuple[str, str]]:
    """Load all records of an MSA file as ``(description, sequence)`` pairs.

    The file is parsed as FASTA and every sequence is passed through
    ``remove_insertions`` before being returned.
    """
    entries = SeqIO.parse(filename, "fasta")
    return [
        (entry.description, remove_insertions(str(entry.seq)))
        for entry in entries
    ]
def greedy_select(msa: List[Tuple[str, str]], num_seqs: int, mode: str = "max") -> List[Tuple[str, str]]:
    """Greedily pick ``num_seqs`` sequences from an MSA by Hamming distance.

    Selection starts from the first sequence. Each step adds the unselected
    sequence whose mean Hamming distance to the already-selected sequences is
    largest (``mode="max"``) or smallest (``mode="min"``). Alternatively,
    hhfilter can be used for this kind of subsampling.

    Args:
        msa: ``(description, sequence)`` pairs. The sequences are stacked into
            a 2-D character array, so they are expected to have equal length.
        num_seqs: Number of sequences to keep.
        mode: ``"max"`` to maximise or ``"min"`` to minimise the mean distance.

    Returns:
        The selected entries in their original MSA order. When the MSA has
        ``num_seqs`` entries or fewer, ``msa`` itself is returned unchanged.

    Raises:
        ValueError: If ``mode`` is not ``"max"``/``"min"`` or ``num_seqs`` is
            negative.
    """
    # Explicit exceptions instead of assert: asserts vanish under ``python -O``.
    if mode not in ("max", "min"):
        raise ValueError(f"mode must be 'max' or 'min', got {mode!r}")
    if num_seqs < 0:
        raise ValueError(f"num_seqs must be non-negative, got {num_seqs}")
    if len(msa) <= num_seqs:
        return msa
    if num_seqs == 0:
        # The loop below always seeds the selection with sequence 0, so a
        # request for zero sequences has to be answered before reaching it.
        return []

    # One row per sequence, one uint8 code per alignment column.
    array = np.array([list(seq) for _, seq in msa], dtype=np.bytes_).view(np.uint8)
    optfunc = np.argmax if mode == "max" else np.argmin
    all_indices = np.arange(len(msa))
    indices = [0]
    # Row i holds the distances from the i-th selected sequence to every sequence.
    pairwise_distances = np.zeros((0, len(msa)))
    for _ in range(num_seqs - 1):
        # Only the most recently selected sequence needs a new distance row.
        dist = cdist(array[indices[-1:]], array, "hamming")
        pairwise_distances = np.concatenate([pairwise_distances, dist])
        # Drop already-selected columns, then average over the selected rows.
        shifted_distance = np.delete(pairwise_distances, indices, axis=1).mean(0)
        shifted_index = optfunc(shifted_distance)
        # Map the position among remaining candidates back to an MSA index.
        index = np.delete(all_indices, indices)[shifted_index]
        indices.append(index)
    return [msa[idx] for idx in sorted(indices)]