Spaces:

simonduerr
/

diffdock

Runtime error

App Files Files Community

Simon Duerr committed on Oct 9, 2022

Commit

39fd018

1 Parent(s): 948f202

fix req, add esm

Browse files

Files changed (26) hide show

.gitignore +0 -2
esm/esm/__init__.py +12 -0
esm/esm/axial_attention.py +239 -0
esm/esm/constants.py +10 -0
esm/esm/data.py +493 -0
esm/esm/inverse_folding/__init__.py +8 -0
esm/esm/inverse_folding/features.py +352 -0
esm/esm/inverse_folding/gvp_encoder.py +56 -0
esm/esm/inverse_folding/gvp_modules.py +473 -0
esm/esm/inverse_folding/gvp_transformer.py +137 -0
esm/esm/inverse_folding/gvp_transformer_encoder.py +184 -0
esm/esm/inverse_folding/gvp_utils.py +68 -0
esm/esm/inverse_folding/multichain_util.py +151 -0
esm/esm/inverse_folding/transformer_decoder.py +228 -0
esm/esm/inverse_folding/transformer_layer.py +304 -0
esm/esm/inverse_folding/util.py +320 -0
esm/esm/model/esm1.py +200 -0
esm/esm/model/esm2.py +147 -0
esm/esm/model/msa_transformer.py +238 -0
esm/esm/modules.py +418 -0
esm/esm/multihead_attention.py +508 -0
esm/esm/pretrained.py +397 -0
esm/esm/rotary_embedding.py +69 -0
esm/esm/version.py +6 -0
esm/scripts/extract.py +136 -0
requirements.txt +2 -2

.gitignore CHANGED Viewed

@@ -162,5 +162,3 @@ temp4.py
 temp5.py
 temp6.py
 temp7.py
-esm

 temp5.py
 temp6.py
 temp7.py

esm/esm/__init__.py ADDED Viewed

	@@ -0,0 +1,12 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from .version import version as __version__  # noqa
+from .data import Alphabet, BatchConverter, FastaBatchedDataset  # noqa
+from .model.esm1 import ProteinBertModel  # noqa
+from .model.esm2 import ESM2  # noqa
+from .model.msa_transformer import MSATransformer  #noqa
+from . import pretrained  # noqa

esm/esm/axial_attention.py ADDED Viewed

	@@ -0,0 +1,239 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+import torch
+import torch.nn as nn
+class RowSelfAttention(nn.Module):
+    """Compute self-attention over rows of a 2D input."""
+    def __init__(
+        self,
+        embed_dim,
+        num_heads,
+        dropout=0.0,
+        max_tokens_per_msa: int = 2 ** 16,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        self.scaling = self.head_dim ** -0.5
+        self.max_tokens_per_msa = max_tokens_per_msa
+        self.attn_shape = "hnij"
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.out_proj = nn.Linear(embed_dim, embed_dim)
+        self.dropout_module = nn.Dropout(dropout)
+    def align_scaling(self, q):
+        num_rows = q.size(0)
+        return self.scaling / math.sqrt(num_rows)
+    def _batched_forward(
+        self,
+        x,
+        self_attn_mask=None,
+        self_attn_padding_mask=None,
+    ):
+        num_rows, num_cols, batch_size, embed_dim = x.size()
+        max_rows = max(1, self.max_tokens_per_msa // num_cols)
+        attns = 0
+        scaling = self.align_scaling(x)
+        for start in range(0, num_rows, max_rows):
+            attn_weights = self.compute_attention_weights(
+                x[start : start + max_rows],
+                scaling,
+                self_attn_mask=self_attn_mask,
+                self_attn_padding_mask=self_attn_padding_mask[:, start : start + max_rows]
+                if self_attn_padding_mask is not None
+                else None,
+            )
+            attns += attn_weights
+        attn_probs = attns.softmax(-1)
+        attn_probs = self.dropout_module(attn_probs)
+        outputs = []
+        for start in range(0, num_rows, max_rows):
+            output = self.compute_attention_update(x[start : start + max_rows], attn_probs)
+            outputs.append(output)
+        output = torch.cat(outputs, 0)
+        return output, attn_probs
+    def compute_attention_weights(
+        self,
+        x,
+        scaling: float,
+        self_attn_mask=None,
+        self_attn_padding_mask=None,
+    ):
+        num_rows, num_cols, batch_size, embed_dim = x.size()
+        q = self.q_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
+        k = self.k_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
+        q *= scaling
+        if self_attn_padding_mask is not None:
+            # Zero out any padded aligned positions - this is important since
+            # we take a sum across the alignment axis.
+            q *= 1 - self_attn_padding_mask.permute(1, 2, 0).unsqueeze(3).unsqueeze(4).to(q)
+        attn_weights = torch.einsum(f"rinhd,rjnhd->{self.attn_shape}", q, k)
+        if self_attn_mask is not None:
+            raise NotImplementedError
+            # Mask Size: [B x R x C], Weights Size: [H x B x C x C]
+        if self_attn_padding_mask is not None:
+            attn_weights = attn_weights.masked_fill(
+                self_attn_padding_mask[:, 0].unsqueeze(0).unsqueeze(2),
+                -10000,
+            )
+        return attn_weights
+    def compute_attention_update(
+        self,
+        x,
+        attn_probs,
+    ):
+        num_rows, num_cols, batch_size, embed_dim = x.size()
+        v = self.v_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
+        context = torch.einsum(f"{self.attn_shape},rjnhd->rinhd", attn_probs, v)
+        context = context.contiguous().view(num_rows, num_cols, batch_size, embed_dim)
+        output = self.out_proj(context)
+        return output
+    def forward(
+        self,
+        x,
+        self_attn_mask=None,
+        self_attn_padding_mask=None,
+    ):
+        num_rows, num_cols, batch_size, embed_dim = x.size()
+        if (num_rows * num_cols > self.max_tokens_per_msa) and not torch.is_grad_enabled():
+            return self._batched_forward(x, self_attn_mask, self_attn_padding_mask)
+        else:
+            scaling = self.align_scaling(x)
+            attn_weights = self.compute_attention_weights(
+                x, scaling, self_attn_mask, self_attn_padding_mask
+            )
+            attn_probs = attn_weights.softmax(-1)
+            attn_probs = self.dropout_module(attn_probs)
+            output = self.compute_attention_update(x, attn_probs)
+            return output, attn_probs
+class ColumnSelfAttention(nn.Module):
+    """Compute self-attention over columns of a 2D input."""
+    def __init__(
+        self,
+        embed_dim,
+        num_heads,
+        dropout=0.0,
+        max_tokens_per_msa: int = 2 ** 16,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        self.scaling = self.head_dim ** -0.5
+        self.max_tokens_per_msa = max_tokens_per_msa
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.out_proj = nn.Linear(embed_dim, embed_dim)
+        self.dropout_module = nn.Dropout(dropout)
+    def _batched_forward(
+        self,
+        x,
+        self_attn_mask=None,
+        self_attn_padding_mask=None,
+    ):
+        num_rows, num_cols, batch_size, embed_dim = x.size()
+        max_cols = max(1, self.max_tokens_per_msa // num_rows)
+        outputs = []
+        attns = []
+        for start in range(0, num_cols, max_cols):
+            output, attn = self(
+                x[:, start : start + max_cols],
+                self_attn_mask=self_attn_mask,
+                self_attn_padding_mask=self_attn_padding_mask[:, :, start : start + max_cols]
+                if self_attn_padding_mask is not None
+                else None,
+            )
+            outputs.append(output)
+            attns.append(attn)
+        output = torch.cat(outputs, 1)
+        attns = torch.cat(attns, 1)
+        return output, attns
+    def compute_attention_update(
+        self,
+        x,
+        self_attn_mask=None,
+        self_attn_padding_mask=None,
+    ):
+        num_rows, num_cols, batch_size, embed_dim = x.size()
+        if num_rows == 1:
+            # if there is only 1 position, this is equivalent and doesn't break with padding
+            attn_probs = torch.ones(
+                self.num_heads,
+                num_cols,
+                batch_size,
+                num_rows,
+                num_rows,
+                device=x.device,
+                dtype=x.dtype,
+            )
+            output = self.out_proj(self.v_proj(x))
+        else:
+            q = self.q_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
+            k = self.k_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
+            v = self.v_proj(x).view(num_rows, num_cols, batch_size, self.num_heads, self.head_dim)
+            q *= self.scaling
+            attn_weights = torch.einsum("icnhd,jcnhd->hcnij", q, k)
+            if self_attn_mask is not None:
+                raise NotImplementedError
+            if self_attn_padding_mask is not None:
+                attn_weights = attn_weights.masked_fill(
+                    self_attn_padding_mask.permute(2, 0, 1).unsqueeze(0).unsqueeze(3),
+                    -10000,
+                )
+            attn_probs = attn_weights.softmax(-1)
+            attn_probs = self.dropout_module(attn_probs)
+            context = torch.einsum("hcnij,jcnhd->icnhd", attn_probs, v)
+            context = context.contiguous().view(num_rows, num_cols, batch_size, embed_dim)
+            output = self.out_proj(context)
+        return output, attn_probs
+    def forward(
+        self,
+        x,
+        self_attn_mask=None,
+        self_attn_padding_mask=None,
+    ):
+        num_rows, num_cols, batch_size, embed_dim = x.size()
+        # if False and num_rows * num_cols > 2 ** 14 and not torch.is_grad_enabled():
+        if (num_rows * num_cols) > self.max_tokens_per_msa and not torch.is_grad_enabled():
+            return self._batched_forward(
+                x,
+                self_attn_mask,
+                self_attn_padding_mask,
+            )
+        else:
+            return self.compute_attention_update(x, self_attn_mask, self_attn_padding_mask)

esm/esm/constants.py ADDED Viewed

	@@ -0,0 +1,10 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# fmt: off
+proteinseq_toks = {
+    'toks': ['L', 'A', 'G', 'V', 'S', 'E', 'R', 'T', 'I', 'D', 'P', 'K', 'Q', 'N', 'F', 'Y', 'M', 'H', 'W', 'C', 'X', 'B', 'U', 'Z', 'O', '.', '-']
+}
+# fmt: on

esm/esm/data.py ADDED Viewed

	@@ -0,0 +1,493 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import itertools
+import os
+from typing import Sequence, Tuple, List, Union
+import pickle
+import re
+import shutil
+import torch
+from pathlib import Path
+from esm.constants import proteinseq_toks
+RawMSA = Sequence[Tuple[str, str]]
+class FastaBatchedDataset(object):
+    def __init__(self, sequence_labels, sequence_strs):
+        self.sequence_labels = list(sequence_labels)
+        self.sequence_strs = list(sequence_strs)
+    @classmethod
+    def from_file(cls, fasta_file):
+        sequence_labels, sequence_strs = [], []
+        cur_seq_label = None
+        buf = []
+        def _flush_current_seq():
+            nonlocal cur_seq_label, buf
+            if cur_seq_label is None:
+                return
+            sequence_labels.append(cur_seq_label)
+            sequence_strs.append("".join(buf))
+            cur_seq_label = None
+            buf = []
+        with open(fasta_file, "r") as infile:
+            for line_idx, line in enumerate(infile):
+                if line.startswith(">"):  # label line
+                    _flush_current_seq()
+                    line = line[1:].strip()
+                    if len(line) > 0:
+                        cur_seq_label = line
+                    else:
+                        cur_seq_label = f"seqnum{line_idx:09d}"
+                else:  # sequence line
+                    buf.append(line.strip())
+        _flush_current_seq()
+        assert len(set(sequence_labels)) == len(
+            sequence_labels
+        ), "Found duplicate sequence labels"
+        return cls(sequence_labels, sequence_strs)
+    def __len__(self):
+        return len(self.sequence_labels)
+    def __getitem__(self, idx):
+        return self.sequence_labels[idx], self.sequence_strs[idx]
+    def get_batch_indices(self, toks_per_batch, extra_toks_per_seq=0):
+        sizes = [(len(s), i) for i, s in enumerate(self.sequence_strs)]
+        sizes.sort()
+        batches = []
+        buf = []
+        max_len = 0
+        def _flush_current_buf():
+            nonlocal max_len, buf
+            if len(buf) == 0:
+                return
+            batches.append(buf)
+            buf = []
+            max_len = 0
+        for sz, i in sizes:
+            sz += extra_toks_per_seq
+            if max(sz, max_len) * (len(buf) + 1) > toks_per_batch:
+                _flush_current_buf()
+            max_len = max(max_len, sz)
+            buf.append(i)
+        _flush_current_buf()
+        return batches
+class Alphabet(object):
+    def __init__(
+        self,
+        standard_toks: Sequence[str],
+        prepend_toks: Sequence[str] = ("<null_0>", "<pad>", "<eos>", "<unk>"),
+        append_toks: Sequence[str] = ("<cls>", "<mask>", "<sep>"),
+        prepend_bos: bool = True,
+        append_eos: bool = False,
+        use_msa: bool = False,
+    ):
+        self.standard_toks = list(standard_toks)
+        self.prepend_toks = list(prepend_toks)
+        self.append_toks = list(append_toks)
+        self.prepend_bos = prepend_bos
+        self.append_eos = append_eos
+        self.use_msa = use_msa
+        self.all_toks = list(self.prepend_toks)
+        self.all_toks.extend(self.standard_toks)
+        for i in range((8 - (len(self.all_toks) % 8)) % 8):
+            self.all_toks.append(f"<null_{i  + 1}>")
+        self.all_toks.extend(self.append_toks)
+        self.tok_to_idx = {tok: i for i, tok in enumerate(self.all_toks)}
+        self.unk_idx = self.tok_to_idx["<unk>"]
+        self.padding_idx = self.get_idx("<pad>")
+        self.cls_idx = self.get_idx("<cls>")
+        self.mask_idx = self.get_idx("<mask>")
+        self.eos_idx = self.get_idx("<eos>")
+        self.all_special_tokens = ['<eos>', '<unk>', '<pad>', '<cls>', '<mask>']
+        self.unique_no_split_tokens = self.all_toks
+    def __len__(self):
+        return len(self.all_toks)
+    def get_idx(self, tok):
+        return self.tok_to_idx.get(tok, self.unk_idx)
+    def get_tok(self, ind):
+        return self.all_toks[ind]
+    def to_dict(self):
+        return self.tok_to_idx.copy()
+    def get_batch_converter(self, truncation_seq_length: int = None):
+        if self.use_msa:
+            return MSABatchConverter(self, truncation_seq_length)
+        else:
+            return BatchConverter(self, truncation_seq_length)
+    @classmethod
+    def from_architecture(cls, name: str) -> "Alphabet":
+        if name in ("ESM-1", "protein_bert_base"):
+            standard_toks = proteinseq_toks["toks"]
+            prepend_toks: Tuple[str, ...] = ("<null_0>", "<pad>", "<eos>", "<unk>")
+            append_toks: Tuple[str, ...] = ("<cls>", "<mask>", "<sep>")
+            prepend_bos = True
+            append_eos = False
+            use_msa = False
+        elif name in ("ESM-1b", "roberta_large"):
+            standard_toks = proteinseq_toks["toks"]
+            prepend_toks = ("<cls>", "<pad>", "<eos>", "<unk>")
+            append_toks = ("<mask>",)
+            prepend_bos = True
+            append_eos = True
+            use_msa = False
+        elif name in ("MSA Transformer", "msa_transformer"):
+            standard_toks = proteinseq_toks["toks"]
+            prepend_toks = ("<cls>", "<pad>", "<eos>", "<unk>")
+            append_toks = ("<mask>",)
+            prepend_bos = True
+            append_eos = False
+            use_msa = True
+        elif "invariant_gvp" in name.lower():
+            standard_toks = proteinseq_toks["toks"]
+            prepend_toks = ("<null_0>", "<pad>", "<eos>", "<unk>")
+            append_toks = ("<mask>", "<cath>", "<af2>")
+            prepend_bos = True
+            append_eos = False
+            use_msa = False
+        else:
+            raise ValueError("Unknown architecture selected")
+        return cls(standard_toks, prepend_toks, append_toks, prepend_bos, append_eos, use_msa)
+    def _tokenize(self, text) -> str:
+        return text.split()
+    def tokenize(self, text, **kwargs) -> List[str]:
+        """
+        Inspired by https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_utils.py
+        Converts a string in a sequence of tokens, using the tokenizer.
+        Args:
+            text (:obj:`str`):
+                The sequence to be encoded.
+        Returns:
+            :obj:`List[str]`: The list of tokens.
+        """
+        def split_on_token(tok, text):
+            result = []
+            split_text = text.split(tok)
+            for i, sub_text in enumerate(split_text):
+                # AddedToken can control whitespace stripping around them.
+                # We use them for GPT2 and Roberta to have different behavior depending on the special token
+                # Cf. https://github.com/huggingface/transformers/pull/2778
+                # and https://github.com/huggingface/transformers/issues/3788
+                # We strip left and right by default
+                if i < len(split_text) - 1:
+                    sub_text = sub_text.rstrip()
+                if i > 0:
+                    sub_text = sub_text.lstrip()
+                if i == 0 and not sub_text:
+                    result.append(tok)
+                elif i == len(split_text) - 1:
+                    if sub_text:
+                        result.append(sub_text)
+                    else:
+                        pass
+                else:
+                    if sub_text:
+                        result.append(sub_text)
+                    result.append(tok)
+            return result
+        def split_on_tokens(tok_list, text):
+            if not text.strip():
+                return []
+            tokenized_text = []
+            text_list = [text]
+            for tok in tok_list:
+                tokenized_text = []
+                for sub_text in text_list:
+                    if sub_text not in self.unique_no_split_tokens:
+                        tokenized_text.extend(split_on_token(tok, sub_text))
+                    else:
+                        tokenized_text.append(sub_text)
+                text_list = tokenized_text
+            return list(
+                itertools.chain.from_iterable(
+                    (
+                        self._tokenize(token)
+                        if token not in self.unique_no_split_tokens
+                        else [token]
+                        for token in tokenized_text
+                    )
+                )
+            )
+        no_split_token = self.unique_no_split_tokens
+        tokenized_text = split_on_tokens(no_split_token, text)
+        return tokenized_text
+    def encode(self, text):
+        return [self.tok_to_idx[tok] for tok in self.tokenize(text)]
+class BatchConverter(object):
+    """Callable to convert an unprocessed (labels + strings) batch to a
+    processed (labels + tensor) batch.
+    """
+    def __init__(self, alphabet, truncation_seq_length: int = None):
+        self.alphabet = alphabet
+        self.truncation_seq_length = truncation_seq_length
+    def __call__(self, raw_batch: Sequence[Tuple[str, str]]):
+        # RoBERTa uses an eos token, while ESM-1 does not.
+        batch_size = len(raw_batch)
+        batch_labels, seq_str_list = zip(*raw_batch)
+        seq_encoded_list = [self.alphabet.encode(seq_str) for seq_str in seq_str_list]
+        if self.truncation_seq_length:
+            seq_encoded_list = [seq_str[:self.truncation_seq_length] for seq_str in seq_encoded_list]
+        max_len = max(len(seq_encoded) for seq_encoded in seq_encoded_list)
+        tokens = torch.empty(
+            (
+                batch_size,
+                max_len + int(self.alphabet.prepend_bos) + int(self.alphabet.append_eos),
+            ),
+            dtype=torch.int64,
+        )
+        tokens.fill_(self.alphabet.padding_idx)
+        labels = []
+        strs = []
+        for i, (label, seq_str, seq_encoded) in enumerate(
+            zip(batch_labels, seq_str_list, seq_encoded_list)
+        ):
+            labels.append(label)
+            strs.append(seq_str)
+            if self.alphabet.prepend_bos:
+                tokens[i, 0] = self.alphabet.cls_idx
+            seq = torch.tensor(seq_encoded, dtype=torch.int64)
+            tokens[
+                i,
+                int(self.alphabet.prepend_bos) : len(seq_encoded)
+                + int(self.alphabet.prepend_bos),
+            ] = seq
+            if self.alphabet.append_eos:
+                tokens[i, len(seq_encoded) + int(self.alphabet.prepend_bos)] = self.alphabet.eos_idx
+        return labels, strs, tokens
+class MSABatchConverter(BatchConverter):
+    def __call__(self, inputs: Union[Sequence[RawMSA], RawMSA]):
+        if isinstance(inputs[0][0], str):
+            # Input is a single MSA
+            raw_batch: Sequence[RawMSA] = [inputs]  # type: ignore
+        else:
+            raw_batch = inputs  # type: ignore
+        batch_size = len(raw_batch)
+        max_alignments = max(len(msa) for msa in raw_batch)
+        max_seqlen = max(len(msa[0][1]) for msa in raw_batch)
+        tokens = torch.empty(
+            (
+                batch_size,
+                max_alignments,
+                max_seqlen + int(self.alphabet.prepend_bos) + int(self.alphabet.append_eos),
+            ),
+            dtype=torch.int64,
+        )
+        tokens.fill_(self.alphabet.padding_idx)
+        labels = []
+        strs = []
+        for i, msa in enumerate(raw_batch):
+            msa_seqlens = set(len(seq) for _, seq in msa)
+            if not len(msa_seqlens) == 1:
+                raise RuntimeError(
+                    "Received unaligned sequences for input to MSA, all sequence "
+                    "lengths must be equal."
+                )
+            msa_labels, msa_strs, msa_tokens = super().__call__(msa)
+            labels.append(msa_labels)
+            strs.append(msa_strs)
+            tokens[i, : msa_tokens.size(0), : msa_tokens.size(1)] = msa_tokens
+        return labels, strs, tokens
+def read_fasta(
+    path,
+    keep_gaps=True,
+    keep_insertions=True,
+    to_upper=False,
+):
+    with open(path, "r") as f:
+        for result in read_alignment_lines(
+            f, keep_gaps=keep_gaps, keep_insertions=keep_insertions, to_upper=to_upper
+        ):
+            yield result
+def read_alignment_lines(
+    lines,
+    keep_gaps=True,
+    keep_insertions=True,
+    to_upper=False,
+):
+    seq = desc = None
+    def parse(s):
+        if not keep_gaps:
+            s = re.sub("-", "", s)
+        if not keep_insertions:
+            s = re.sub("[a-z]", "", s)
+        return s.upper() if to_upper else s
+    for line in lines:
+        # Line may be empty if seq % file_line_width == 0
+        if len(line) > 0 and line[0] == ">":
+            if seq is not None:
+                yield desc, parse(seq)
+            desc = line.strip()
+            seq = ""
+        else:
+            assert isinstance(seq, str)
+            seq += line.strip()
+    assert isinstance(seq, str) and isinstance(desc, str)
+    yield desc, parse(seq)
+class ESMStructuralSplitDataset(torch.utils.data.Dataset):
+    """
+    Structural Split Dataset as described in section A.10 of the supplement of our paper.
+    https://doi.org/10.1101/622803
+    We use the full version of SCOPe 2.07, clustered at 90% sequence identity,
+    generated on January 23, 2020.
+    For each SCOPe domain:
+        - We extract the sequence from the corresponding PDB file
+        - We extract the 3D coordinates of the Carbon beta atoms, aligning them
+          to the sequence. We put NaN where Cb atoms are missing.
+        - From the 3D coordinates, we calculate a pairwise distance map, based
+          on L2 distance
+        - We use DSSP to generate secondary structure labels for the corresponding
+          PDB file. This is also aligned to the sequence. We put - where SSP
+          labels are missing.
+    For each SCOPe classification level of family/superfamily/fold (in order of difficulty),
+    we have split the data into 5 partitions for cross validation. These are provided
+    in a downloaded splits folder, in the format:
+            splits/{split_level}/{cv_partition}/{train|valid}.txt
+    where train is the partition and valid is the concatenation of the remaining 4.
+    For each SCOPe domain, we provide a pkl dump that contains:
+        - seq    : The domain sequence, stored as an L-length string
+        - ssp    : The secondary structure labels, stored as an L-length string
+        - dist   : The distance map, stored as an LxL numpy array
+        - coords : The 3D coordinates, stored as an Lx3 numpy array
+    """
+    base_folder = "structural-data"
+    file_list = [
+        #  url  tar filename   filename      MD5 Hash
+        (
+            "https://dl.fbaipublicfiles.com/fair-esm/structural-data/splits.tar.gz",
+            "splits.tar.gz",
+            "splits",
+            "456fe1c7f22c9d3d8dfe9735da52411d",
+        ),
+        (
+            "https://dl.fbaipublicfiles.com/fair-esm/structural-data/pkl.tar.gz",
+            "pkl.tar.gz",
+            "pkl",
+            "644ea91e56066c750cd50101d390f5db",
+        ),
+    ]
+    def __init__(
+        self,
+        split_level,
+        cv_partition,
+        split,
+        root_path=os.path.expanduser("~/.cache/torch/data/esm"),
+        download=False,
+    ):
+        super().__init__()
+        assert split in [
+            "train",
+            "valid",
+        ], "train_valid must be 'train' or 'valid'"
+        self.root_path = root_path
+        self.base_path = os.path.join(self.root_path, self.base_folder)
+        # check if root path has what you need or else download it
+        if download:
+            self.download()
+        self.split_file = os.path.join(
+            self.base_path, "splits", split_level, cv_partition, f"{split}.txt"
+        )
+        self.pkl_dir = os.path.join(self.base_path, "pkl")
+        self.names = []
+        with open(self.split_file) as f:
+            self.names = f.read().splitlines()
+    def __len__(self):
+        return len(self.names)
+    def _check_exists(self) -> bool:
+        for (_, _, filename, _) in self.file_list:
+            fpath = os.path.join(self.base_path, filename)
+            if not os.path.exists(fpath) or not os.path.isdir(fpath):
+                return False
+        return True
+    def download(self):
+        if self._check_exists():
+            print("Files already downloaded and verified")
+            return
+        from torchvision.datasets.utils import download_url
+        for url, tar_filename, filename, md5_hash in self.file_list:
+            download_path = os.path.join(self.base_path, tar_filename)
+            download_url(url=url, root=self.base_path, filename=tar_filename, md5=md5_hash)
+            shutil.unpack_archive(download_path, self.base_path)
+    def __getitem__(self, idx):
+        """
+        Returns a dict with the following entries
+         - seq : Str (domain sequence)
+         - ssp : Str (SSP labels)
+         - dist : np.array (distance map)
+         - coords : np.array (3D coordinates)
+        """
+        name = self.names[idx]
+        pkl_fname = os.path.join(self.pkl_dir, name[1:3], f"{name}.pkl")
+        with open(pkl_fname, "rb") as f:
+            obj = pickle.load(f)
+        return obj

esm/esm/inverse_folding/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from . import gvp_transformer
+from . import util
+from . import multichain_util

esm/esm/inverse_folding/features.py ADDED Viewed

	@@ -0,0 +1,352 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Portions of this file were adapted from the open source code for the following
+# two papers:
+#
+#   Ingraham, J., Garg, V., Barzilay, R., & Jaakkola, T. (2019). Generative
+#   models for graph-based protein design. Advances in Neural Information
+#   Processing Systems, 32.
+#
+#   Jing, B., Eismann, S., Suriana, P., Townshend, R. J. L., & Dror, R. (2020).
+#   Learning from Protein Structure with Geometric Vector Perceptrons. In
+#   International Conference on Learning Representations.
+#
+# MIT License
+#
+# Copyright (c) 2020 Bowen Jing, Stephan Eismann, Patricia Suriana, Raphael Townshend, Ron Dror
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# ================================================================
+# The below license applies to the portions of the code (parts of
+# src/datasets.py and src/models.py) adapted from Ingraham, et al.
+# ================================================================
+#
+# MIT License
+#
+# Copyright (c) 2019 John Ingraham, Vikas Garg, Regina Barzilay, Tommi Jaakkola
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import math
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .gvp_utils import flatten_graph
+from .gvp_modules import GVP, LayerNorm
+from .util import normalize, norm, nan_to_num, rbf
+class GVPInputFeaturizer(nn.Module):
+    @staticmethod
+    def get_node_features(coords, coord_mask, with_coord_mask=True):
+        # scalar features
+        node_scalar_features = GVPInputFeaturizer._dihedrals(coords)
+        if with_coord_mask:
+            node_scalar_features = torch.cat([
+                node_scalar_features,
+                coord_mask.float().unsqueeze(-1)
+            ], dim=-1)
+        # vector features
+        X_ca = coords[:, :, 1]
+        orientations = GVPInputFeaturizer._orientations(X_ca)
+        sidechains = GVPInputFeaturizer._sidechains(coords)
+        node_vector_features = torch.cat([orientations, sidechains.unsqueeze(-2)], dim=-2)
+        return node_scalar_features, node_vector_features
+    @staticmethod
+    def _orientations(X):
+        forward = normalize(X[:, 1:] - X[:, :-1])
+        backward = normalize(X[:, :-1] - X[:, 1:])
+        forward = F.pad(forward, [0, 0, 0, 1])
+        backward = F.pad(backward, [0, 0, 1, 0])
+        return torch.cat([forward.unsqueeze(-2), backward.unsqueeze(-2)], -2)
+    @staticmethod
+    def _sidechains(X):
+        n, origin, c = X[:, :, 0], X[:, :, 1], X[:, :, 2]
+        c, n = normalize(c - origin), normalize(n - origin)
+        bisector = normalize(c + n)
+        perp = normalize(torch.cross(c, n, dim=-1))
+        vec = -bisector * math.sqrt(1 / 3) - perp * math.sqrt(2 / 3)
+        return vec
+    @staticmethod
+    def _dihedrals(X, eps=1e-7):
+        X = torch.flatten(X[:, :, :3], 1, 2)
+        bsz = X.shape[0]
+        dX = X[:, 1:] - X[:, :-1]
+        U = normalize(dX, dim=-1)
+        u_2 = U[:, :-2]
+        u_1 = U[:, 1:-1]
+        u_0 = U[:, 2:]
+        # Backbone normals
+        n_2 = normalize(torch.cross(u_2, u_1, dim=-1), dim=-1)
+        n_1 = normalize(torch.cross(u_1, u_0, dim=-1), dim=-1)
+        # Angle between normals
+        cosD = torch.sum(n_2 * n_1, -1)
+        cosD = torch.clamp(cosD, -1 + eps, 1 - eps)
+        D = torch.sign(torch.sum(u_2 * n_1, -1)) * torch.acos(cosD)
+        # This scheme will remove phi[0], psi[-1], omega[-1]
+        D = F.pad(D, [1, 2])
+        D = torch.reshape(D, [bsz, -1, 3])
+        # Lift angle representations to the circle
+        D_features = torch.cat([torch.cos(D), torch.sin(D)], -1)
+        return D_features
+    @staticmethod
+    def _positional_embeddings(edge_index,
+                               num_embeddings=None,
+                               num_positional_embeddings=16,
+                               period_range=[2, 1000]):
+        # From https://github.com/jingraham/neurips19-graph-protein-design
+        num_embeddings = num_embeddings or num_positional_embeddings
+        d = edge_index[0] - edge_index[1]
+        frequency = torch.exp(
+            torch.arange(0, num_embeddings, 2, dtype=torch.float32,
+                device=edge_index.device)
+            * -(np.log(10000.0) / num_embeddings)
+        )
+        angles = d.unsqueeze(-1) * frequency
+        E = torch.cat((torch.cos(angles), torch.sin(angles)), -1)
+        return E
+    @staticmethod
+    def _dist(X, coord_mask, padding_mask, top_k_neighbors, eps=1e-8):
+        """ Pairwise euclidean distances """
+        bsz, maxlen = X.size(0), X.size(1)
+        coord_mask_2D = torch.unsqueeze(coord_mask,1) * torch.unsqueeze(coord_mask,2)
+        residue_mask = ~padding_mask
+        residue_mask_2D = torch.unsqueeze(residue_mask,1) * torch.unsqueeze(residue_mask,2)
+        dX = torch.unsqueeze(X,1) - torch.unsqueeze(X,2)
+        D = coord_mask_2D * norm(dX, dim=-1)
+        # sorting preference: first those with coords, then among the residues that
+        # exist but are masked use distance in sequence as tie breaker, and then the
+        # residues that came from padding are last
+        seqpos = torch.arange(maxlen, device=X.device)
+        Dseq = torch.abs(seqpos.unsqueeze(1) - seqpos.unsqueeze(0)).repeat(bsz, 1, 1)
+        D_adjust = nan_to_num(D) + (~coord_mask_2D) * (1e8 + Dseq*1e6) + (
+            ~residue_mask_2D) * (1e10)
+        if top_k_neighbors == -1:
+            D_neighbors = D_adjust
+            E_idx = seqpos.repeat(
+                    *D_neighbors.shape[:-1], 1)
+        else:
+            # Identify k nearest neighbors (including self)
+            k = min(top_k_neighbors, X.size(1))
+            D_neighbors, E_idx = torch.topk(D_adjust, k, dim=-1, largest=False)
+        coord_mask_neighbors = (D_neighbors < 5e7)
+        residue_mask_neighbors = (D_neighbors < 5e9)
+        return D_neighbors, E_idx, coord_mask_neighbors, residue_mask_neighbors
+class Normalize(nn.Module):
+    def __init__(self, features, epsilon=1e-6):
+        super(Normalize, self).__init__()
+        self.gain = nn.Parameter(torch.ones(features))
+        self.bias = nn.Parameter(torch.zeros(features))
+        self.epsilon = epsilon
+    def forward(self, x, dim=-1):
+        mu = x.mean(dim, keepdim=True)
+        sigma = torch.sqrt(x.var(dim, keepdim=True) + self.epsilon)
+        gain = self.gain
+        bias = self.bias
+        # Reshape
+        if dim != -1:
+            shape = [1] * len(mu.size())
+            shape[dim] = self.gain.size()[0]
+            gain = gain.view(shape)
+            bias = bias.view(shape)
+        return gain * (x - mu) / (sigma + self.epsilon) + bias
+class DihedralFeatures(nn.Module):
+    def __init__(self, node_embed_dim):
+        """ Embed dihedral angle features. """
+        super(DihedralFeatures, self).__init__()
+        # 3 dihedral angles; sin and cos of each angle
+        node_in = 6
+        # Normalization and embedding
+        self.node_embedding = nn.Linear(node_in,  node_embed_dim, bias=True)
+        self.norm_nodes = Normalize(node_embed_dim)
+    def forward(self, X):
+        """ Featurize coordinates as an attributed graph """
+        V = self._dihedrals(X)
+        V = self.node_embedding(V)
+        V = self.norm_nodes(V)
+        return V
+    @staticmethod
+    def _dihedrals(X, eps=1e-7, return_angles=False):
+        # First 3 coordinates are N, CA, C
+        X = X[:,:,:3,:].reshape(X.shape[0], 3*X.shape[1], 3)
+        # Shifted slices of unit vectors
+        dX = X[:,1:,:] - X[:,:-1,:]
+        U = F.normalize(dX, dim=-1)
+        u_2 = U[:,:-2,:]
+        u_1 = U[:,1:-1,:]
+        u_0 = U[:,2:,:]
+        # Backbone normals
+        n_2 = F.normalize(torch.cross(u_2, u_1, dim=-1), dim=-1)
+        n_1 = F.normalize(torch.cross(u_1, u_0, dim=-1), dim=-1)
+        # Angle between normals
+        cosD = (n_2 * n_1).sum(-1)
+        cosD = torch.clamp(cosD, -1+eps, 1-eps)
+        D = torch.sign((u_2 * n_1).sum(-1)) * torch.acos(cosD)
+        # This scheme will remove phi[0], psi[-1], omega[-1]
+        D = F.pad(D, (1,2), 'constant', 0)
+        D = D.view((D.size(0), int(D.size(1)/3), 3))
+        phi, psi, omega = torch.unbind(D,-1)
+        if return_angles:
+            return phi, psi, omega
+        # Lift angle representations to the circle
+        D_features = torch.cat((torch.cos(D), torch.sin(D)), 2)
+        return D_features
+class GVPGraphEmbedding(GVPInputFeaturizer):
+    """
+    Embeds per-residue node features and nearest-neighbour edge features
+    with GVP layers and returns them as a flattened graph.
+    """
+    def __init__(self, args):
+        """
+        :param args: namespace read for `top_k_neighbors`,
+            `node_hidden_dim_scalar`, `node_hidden_dim_vector`,
+            `edge_hidden_dim_scalar` and `edge_hidden_dim_vector`
+        """
+        super().__init__()
+        self.top_k_neighbors = args.top_k_neighbors
+        self.num_positional_embeddings = 16
+        self.remove_edges_without_coords = True
+        # (scalar, vector) channels from get_node_features: 6 dihedral cos/sin
+        # values + 1 coord-mask flag; 2 orientation vectors + 1 from _sidechains
+        node_input_dim = (7, 3)
+        # 34 scalars = distance RBF + 16 positional embeddings + 2
+        # missing-coordinate flags (so `rbf` presumably yields 16 values -
+        # TODO confirm in .util); 1 vector = normalized edge direction
+        edge_input_dim = (34, 1)
+        node_hidden_dim = (args.node_hidden_dim_scalar,
+                args.node_hidden_dim_vector)
+        edge_hidden_dim = (args.edge_hidden_dim_scalar,
+                args.edge_hidden_dim_vector)
+        self.embed_node = nn.Sequential(
+            GVP(node_input_dim, node_hidden_dim, activations=(None, None)),
+            LayerNorm(node_hidden_dim, eps=1e-4)
+        )
+        self.embed_edge = nn.Sequential(
+            GVP(edge_input_dim, edge_hidden_dim, activations=(None, None)),
+            LayerNorm(edge_hidden_dim, eps=1e-4)
+        )
+        self.embed_confidence = nn.Linear(16, args.node_hidden_dim_scalar)
+    def forward(self, coords, coord_mask, padding_mask, confidence):
+        """
+        :param coords: backbone coordinates, indexed `coords[:, :, atom]`
+        :param coord_mask: boolean mask of residues that have coordinates
+        :param padding_mask: boolean mask of padding positions
+        :param confidence: values expanded with `rbf(confidence, 0., 1.)` and
+            added to the scalar node embeddings (presumably per-residue
+            confidence in [0, 1] - TODO confirm)
+        :return: `(node_embeddings, edge_embeddings, edge_index)` as returned
+            by `flatten_graph`
+        """
+        # The raw geometric features are inputs, not learned: no gradients
+        with torch.no_grad():
+            node_features = self.get_node_features(coords, coord_mask)
+            edge_features, edge_index = self.get_edge_features(
+                coords, coord_mask, padding_mask)
+        node_embeddings_scalar, node_embeddings_vector = self.embed_node(node_features)
+        edge_embeddings = self.embed_edge(edge_features)
+        rbf_rep = rbf(confidence, 0., 1.)
+        node_embeddings = (
+            node_embeddings_scalar + self.embed_confidence(rbf_rep),
+            node_embeddings_vector
+        )
+        node_embeddings, edge_embeddings, edge_index = flatten_graph(
+            node_embeddings, edge_embeddings, edge_index)
+        return node_embeddings, edge_embeddings, edge_index
+    def get_edge_features(self, coords, coord_mask, padding_mask):
+        """
+        Build edge features between every residue and its selected neighbours.
+        :return: `((edge_s, edge_v), edge_index)` where `edge_index` has the
+            batch dim first and invalid edges have both endpoints set to -1
+        """
+        X_ca = coords[:, :, 1]
+        # Get distances to the top k neighbors
+        E_dist, E_idx, E_coord_mask, E_residue_mask = GVPInputFeaturizer._dist(
+                X_ca, coord_mask, padding_mask, self.top_k_neighbors)
+        # Flatten the graph to be batch size 1 for torch_geometric package
+        dest = E_idx
+        B, L, k = E_idx.shape[:3]
+        src = torch.arange(L, device=E_idx.device).view([1, L, 1]).expand(B, L, k)
+        # After flattening, [2, B, E]
+        edge_index = torch.stack([src, dest], dim=0).flatten(2, 3)
+        # After flattening, [B, E]
+        E_dist = E_dist.flatten(1, 2)
+        E_coord_mask = E_coord_mask.flatten(1, 2).unsqueeze(-1)
+        E_residue_mask = E_residue_mask.flatten(1, 2)
+        # Calculate relative positional embeddings and distance RBF
+        pos_embeddings = GVPInputFeaturizer._positional_embeddings(
+            edge_index,
+            num_positional_embeddings=self.num_positional_embeddings,
+        )
+        D_rbf = rbf(E_dist, 0., 20.)
+        # Calculate relative orientation
+        # Source coordinates are repeated k times; destinations are gathered
+        # with the neighbour indices
+        X_src = X_ca.unsqueeze(2).expand(-1, -1, k, -1).flatten(1, 2)
+        X_dest = torch.gather(
+            X_ca,
+            1,
+            edge_index[1, :, :].unsqueeze(-1).expand([B, L*k, 3])
+        )
+        coord_mask_src = coord_mask.unsqueeze(2).expand(-1, -1, k).flatten(1, 2)
+        coord_mask_dest = torch.gather(
+            coord_mask,
+            1,
+            edge_index[1, :, :].expand([B, L*k])
+        )
+        E_vectors = X_src - X_dest
+        # For the ones without coordinates, substitute in the average vector
+        # (a batch entry with no valid edge divides by zero here; the
+        # resulting NaNs are removed by nan_to_num below)
+        E_vector_mean = torch.sum(E_vectors * E_coord_mask, dim=1,
+                keepdims=True) / torch.sum(E_coord_mask, dim=1, keepdims=True)
+        E_vectors = E_vectors * E_coord_mask + E_vector_mean * ~(E_coord_mask)
+        # Normalize and remove nans
+        edge_s = torch.cat([D_rbf, pos_embeddings], dim=-1)
+        edge_v = normalize(E_vectors).unsqueeze(-2)
+        edge_s, edge_v = map(nan_to_num, (edge_s, edge_v))
+        # Also add indications of whether the coordinates are present
+        edge_s = torch.cat([
+            edge_s,
+            (~coord_mask_src).float().unsqueeze(-1),
+            (~coord_mask_dest).float().unsqueeze(-1),
+        ], dim=-1)
+        # Mark edges involving padding (and, optionally, edges lacking
+        # coordinates) by setting both endpoints to -1
+        edge_index[:, ~E_residue_mask] = -1
+        if self.remove_edges_without_coords:
+            edge_index[:, ~E_coord_mask.squeeze(-1)] = -1
+        # [2, B, E] -> [B, 2, E]
+        return (edge_s, edge_v), edge_index.transpose(0, 1)

esm/esm/inverse_folding/gvp_encoder.py ADDED Viewed

	@@ -0,0 +1,56 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from argparse import Namespace
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .features import GVPGraphEmbedding
+from .gvp_modules import GVPConvLayer, LayerNorm
+from .gvp_utils import unflatten_graph
+class GVPEncoder(nn.Module):
+    def __init__(self, args):
+        super().__init__()
+        self.args = args
+        self.embed_graph = GVPGraphEmbedding(args)
+        node_hidden_dim = (args.node_hidden_dim_scalar,
+                args.node_hidden_dim_vector)
+        edge_hidden_dim = (args.edge_hidden_dim_scalar,
+                args.edge_hidden_dim_vector)
+        conv_activations = (F.relu, torch.sigmoid)
+        self.encoder_layers = nn.ModuleList(
+                GVPConvLayer(
+                    node_hidden_dim,
+                    edge_hidden_dim,
+                    drop_rate=args.dropout,
+                    vector_gate=True,
+                    attention_heads=0,
+                    n_message=3,
+                    conv_activations=conv_activations,
+                    n_edge_gvps=0,
+                    eps=1e-4,
+                    layernorm=True,
+                )
+            for i in range(args.num_encoder_layers)
+        )
+    def forward(self, coords, coord_mask, padding_mask, confidence):
+        node_embeddings, edge_embeddings, edge_index = self.embed_graph(
+                coords, coord_mask, padding_mask, confidence)
+        for i, layer in enumerate(self.encoder_layers):
+            node_embeddings, edge_embeddings = layer(node_embeddings,
+                    edge_index, edge_embeddings)
+        node_embeddings = unflatten_graph(node_embeddings, coords.shape[0])
+        return node_embeddings

esm/esm/inverse_folding/gvp_modules.py ADDED Viewed

	@@ -0,0 +1,473 @@

+# Contents of this file are from the open source code for
+#
+#   Jing, B., Eismann, S., Suriana, P., Townshend, R. J. L., & Dror, R. (2020).
+#   Learning from Protein Structure with Geometric Vector Perceptrons. In
+#   International Conference on Learning Representations.
+#
+# MIT License
+#
+# Copyright (c) 2020 Bowen Jing, Stephan Eismann, Patricia Suriana, Raphael Townshend, Ron Dror
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import typing as T
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch_geometric.nn import MessagePassing
+from torch_scatter import scatter_add, scatter
+def tuple_size(tp):
+    return tuple([0 if a is None else a.size() for a in tp])
+def tuple_sum(tp1, tp2):
+    s1, v1 = tp1
+    s2, v2 = tp2
+    if v2 is None and v2 is None:
+        return (s1 + s2, None)
+    return (s1 + s2, v1 + v2)
+def tuple_cat(*args, dim=-1):
+    '''
+    Concatenates any number of tuples (s, V) elementwise.
+    :param dim: dimension along which to concatenate when viewed
+                as the `dim` index for the scalar-channel tensors.
+                This means that `dim=-1` will be applied as
+                `dim=-2` for the vector-channel tensors.
+    '''
+    dim %= len(args[0][0].shape)
+    s_args, v_args = list(zip(*args))
+    return torch.cat(s_args, dim=dim), torch.cat(v_args, dim=dim)
+def tuple_index(x, idx):
+    '''
+    Indexes into a tuple (s, V) along the first dimension.
+    :param idx: any object which can be used to index into a `torch.Tensor`
+    '''
+    return x[0][idx], x[1][idx]
+def randn(n, dims, device="cpu"):
+    '''
+    Returns random tuples (s, V) drawn elementwise from a normal distribution.
+    :param n: number of data points
+    :param dims: tuple of dimensions (n_scalar, n_vector)
+    :return: (s, V) with s.shape = (n, n_scalar) and
+             V.shape = (n, n_vector, 3)
+    '''
+    return torch.randn(n, dims[0], device=device), \
+            torch.randn(n, dims[1], 3, device=device)
+def _norm_no_nan(x, axis=-1, keepdims=False, eps=1e-8, sqrt=True):
+    '''
+    L2 norm of tensor clamped above a minimum value `eps`.
+    :param sqrt: if `False`, returns the square of the L2 norm
+    '''
+    # clamp is slow
+    # out = torch.clamp(torch.sum(torch.square(x), axis, keepdims), min=eps)
+    out = torch.sum(torch.square(x), axis, keepdims) + eps
+    return torch.sqrt(out) if sqrt else out
+def _split(x, nv):
+    '''
+    Splits a merged representation of (s, V) back into a tuple.
+    Should be used only with `_merge(s, V)` and only if the tuple
+    representation cannot be used.
+    :param x: the `torch.Tensor` returned from `_merge`
+    :param nv: the number of vector channels in the input to `_merge`
+    '''
+    v = torch.reshape(x[..., -3*nv:], x.shape[:-1] + (nv, 3))
+    s = x[..., :-3*nv]
+    return s, v
+def _merge(s, v):
+    '''
+    Merges a tuple (s, V) into a single `torch.Tensor`, where the
+    vector channels are flattened and appended to the scalar channels.
+    Should be used only if the tuple representation cannot be used.
+    Use `_split(x, nv)` to reverse.
+    '''
+    v = torch.reshape(v, v.shape[:-2] + (3*v.shape[-2],))
+    return torch.cat([s, v], -1)
+class GVP(nn.Module):
+    '''
+    Geometric Vector Perceptron. See manuscript and README.md
+    for more details.
+    :param in_dims: tuple (n_scalar, n_vector)
+    :param out_dims: tuple (n_scalar, n_vector)
+    :param h_dim: intermediate number of vector channels, optional
+    :param activations: tuple of functions (scalar_act, vector_act)
+    :param tuple_io: whether to keep accepting tuple inputs and outputs when vi
+    or vo = 0
+    '''
+    def __init__(self, in_dims, out_dims, h_dim=None, vector_gate=False,
+                 activations=(F.relu, torch.sigmoid), tuple_io=True,
+                 eps=1e-8):
+        super(GVP, self).__init__()
+        self.si, self.vi = in_dims
+        self.so, self.vo = out_dims
+        self.tuple_io = tuple_io
+        if self.vi:
+            self.h_dim = h_dim or max(self.vi, self.vo)
+            self.wh = nn.Linear(self.vi, self.h_dim, bias=False)
+            self.ws = nn.Linear(self.h_dim + self.si, self.so)
+            if self.vo:
+                self.wv = nn.Linear(self.h_dim, self.vo, bias=False)
+                if vector_gate:
+                    self.wg = nn.Linear(self.so, self.vo)
+        else:
+            self.ws = nn.Linear(self.si, self.so)
+        self.vector_gate = vector_gate
+        self.scalar_act, self.vector_act = activations
+        self.eps = eps
+    def forward(self, x):
+        '''
+        :param x: tuple (s, V) of `torch.Tensor`,
+                  or (if vectors_in is 0), a single `torch.Tensor`
+        :return: tuple (s, V) of `torch.Tensor`,
+                 or (if vectors_out is 0), a single `torch.Tensor`
+        '''
+        if self.vi:
+            s, v = x
+            v = torch.transpose(v, -1, -2)
+            vh = self.wh(v)
+            vn = _norm_no_nan(vh, axis=-2, eps=self.eps)
+            s = self.ws(torch.cat([s, vn], -1))
+            if self.scalar_act:
+                s = self.scalar_act(s)
+            if self.vo:
+                v = self.wv(vh)
+                v = torch.transpose(v, -1, -2)
+                if self.vector_gate:
+                    g = self.wg(s).unsqueeze(-1)
+                else:
+                    g = _norm_no_nan(v, axis=-1, keepdims=True, eps=self.eps)
+                if self.vector_act:
+                    g = self.vector_act(g)
+                    v = v * g
+        else:
+            if self.tuple_io:
+                assert x[1] is None
+                x = x[0]
+            s = self.ws(x)
+            if self.scalar_act:
+                s = self.scalar_act(s)
+            if self.vo:
+                v = torch.zeros(list(s.shape)[:-1] + [self.vo, 3],
+                        device=s.device)
+        if self.vo:
+            return (s, v)
+        elif self.tuple_io:
+            return (s, None)
+        else:
+            return s
+class _VDropout(nn.Module):
+    '''
+    Vector channel dropout where the elements of each
+    vector channel are dropped together.
+    '''
+    def __init__(self, drop_rate):
+        super(_VDropout, self).__init__()
+        self.drop_rate = drop_rate
+    def forward(self, x):
+        '''
+        :param x: `torch.Tensor` corresponding to vector channels
+        '''
+        if x is None:
+            return None
+        device = x.device
+        if not self.training:
+            return x
+        mask = torch.bernoulli(
+            (1 - self.drop_rate) * torch.ones(x.shape[:-1], device=device)
+        ).unsqueeze(-1)
+        x = mask * x / (1 - self.drop_rate)
+        return x
+class Dropout(nn.Module):
+    '''
+    Combined dropout for tuples (s, V).
+    Takes tuples (s, V) as input and as output.
+    '''
+    def __init__(self, drop_rate):
+        super(Dropout, self).__init__()
+        self.sdropout = nn.Dropout(drop_rate)
+        self.vdropout = _VDropout(drop_rate)
+    def forward(self, x):
+        '''
+        :param x: tuple (s, V) of `torch.Tensor`,
+                  or single `torch.Tensor`
+                  (will be assumed to be scalar channels)
+        '''
+        if type(x) is torch.Tensor:
+            return self.sdropout(x)
+        s, v = x
+        return self.sdropout(s), self.vdropout(v)
+class LayerNorm(nn.Module):
+    '''
+    Combined LayerNorm for tuples (s, V).
+    Takes tuples (s, V) as input and as output.
+    '''
+    def __init__(self, dims, tuple_io=True, eps=1e-8):
+        super(LayerNorm, self).__init__()
+        self.tuple_io = tuple_io
+        self.s, self.v = dims
+        self.scalar_norm = nn.LayerNorm(self.s)
+        self.eps = eps
+    def forward(self, x):
+        '''
+        :param x: tuple (s, V) of `torch.Tensor`,
+                  or single `torch.Tensor`
+                  (will be assumed to be scalar channels)
+        '''
+        if not self.v:
+            if self.tuple_io:
+                return self.scalar_norm(x[0]), None
+            return self.scalar_norm(x)
+        s, v = x
+        vn = _norm_no_nan(v, axis=-1, keepdims=True, sqrt=False, eps=self.eps)
+        nonzero_mask = (vn > 2 * self.eps)
+        vn = torch.sum(vn * nonzero_mask, dim=-2, keepdim=True
+            ) / (self.eps + torch.sum(nonzero_mask, dim=-2, keepdim=True))
+        vn = torch.sqrt(vn + self.eps)
+        v = nonzero_mask * (v / vn)
+        return self.scalar_norm(s), v
+class GVPConv(MessagePassing):
+    '''
+    Graph convolution / message passing with Geometric Vector Perceptrons.
+    Takes in a graph with node and edge embeddings,
+    and returns new node embeddings.
+    This does NOT do residual updates and pointwise feedforward layers
+    ---see `GVPConvLayer`.
+    :param in_dims: input node embedding dimensions (n_scalar, n_vector)
+    :param out_dims: output node embedding dimensions (n_scalar, n_vector)
+    :param edge_dims: input edge embedding dimensions (n_scalar, n_vector)
+    :param n_layers: number of GVPs in the message function
+    :param vector_gate: forwarded to the GVPs of the message function
+                 (except the last one, and not used when n_layers == 1)
+    :param module_list: preconstructed message function, overrides n_layers
+    :param aggr: should be "add" if some incoming edges are masked, as in
+                 a masked autoregressive decoder architecture
+    :param eps: stored on the module; not forwarded to the GVPs built here
+    :param activations: (scalar_act, vector_act) for the first GVP when
+                 n_layers > 1
+    '''
+    def __init__(self, in_dims, out_dims, edge_dims, n_layers=3,
+            vector_gate=False, module_list=None, aggr="mean", eps=1e-8,
+            activations=(F.relu, torch.sigmoid)):
+        super(GVPConv, self).__init__(aggr=aggr)
+        self.eps = eps
+        self.si, self.vi = in_dims
+        self.so, self.vo = out_dims
+        self.se, self.ve = edge_dims
+        module_list = module_list or []
+        if not module_list:
+            # The message input concatenates source node, edge and destination
+            # node embeddings, hence 2*node + edge channels
+            if n_layers == 1:
+                module_list.append(
+                    GVP((2*self.si + self.se, 2*self.vi + self.ve),
+                        (self.so, self.vo), activations=(None, None)))
+            else:
+                module_list.append(
+                    GVP((2*self.si + self.se, 2*self.vi + self.ve), out_dims,
+                        vector_gate=vector_gate, activations=activations)
+                )
+                # Intermediate GVPs use the GVP default activations
+                for i in range(n_layers - 2):
+                    module_list.append(GVP(out_dims, out_dims,
+                        vector_gate=vector_gate))
+                # Final GVP is linear (no activations)
+                module_list.append(GVP(out_dims, out_dims,
+                                       activations=(None, None)))
+        self.message_func = nn.Sequential(*module_list)
+    def forward(self, x, edge_index, edge_attr):
+        '''
+        :param x: tuple (s, V) of `torch.Tensor`
+        :param edge_index: array of shape [2, n_edges]
+        :param edge_attr: tuple (s, V) of `torch.Tensor`
+        :return: tuple (s, V) of aggregated messages per node
+        '''
+        x_s, x_v = x
+        # Vectors are flattened to 2-D for `propagate` and restored to
+        # [n, channels, 3] inside `message`
+        message = self.propagate(edge_index,
+                    s=x_s, v=x_v.reshape(x_v.shape[0], 3*x_v.shape[1]),
+                    edge_attr=edge_attr)
+        return _split(message, self.vo)
+    def message(self, s_i, v_i, s_j, v_j, edge_attr):
+        '''
+        Per-edge message: the `_j` node embedding, the edge embedding and the
+        `_i` node embedding are concatenated, passed through the message
+        function and merged into a single tensor for aggregation.
+        The argument names are required by `propagate` (the `_i` / `_j`
+        suffixes presumably select the two edge endpoints per torch_geometric
+        convention - see its MessagePassing docs).
+        '''
+        v_j = v_j.view(v_j.shape[0], v_j.shape[1]//3, 3)
+        v_i = v_i.view(v_i.shape[0], v_i.shape[1]//3, 3)
+        message = tuple_cat((s_j, v_j), edge_attr, (s_i, v_i))
+        message = self.message_func(message)
+        return _merge(*message)
+class GVPConvLayer(nn.Module):
+    '''
+    Full graph convolution / message passing layer with
+    Geometric Vector Perceptrons. Residually updates node embeddings with
+    aggregated incoming messages, applies a pointwise feedforward
+    network to node embeddings, and returns updated node embeddings.
+    To only compute the aggregated messages, see `GVPConv`.
+    :param node_dims: node embedding dimensions (n_scalar, n_vector)
+    :param edge_dims: input edge embedding dimensions (n_scalar, n_vector)
+    :param vector_gate: forwarded to the GVPs built by this layer
+    :param n_message: number of GVPs to use in message function
+    :param n_feedforward: number of GVPs to use in feedforward function
+    :param drop_rate: drop probability in all dropout layers
+    :param autoregressive: if `True`, this `GVPConvLayer` will be used
+           with a different set of input node embeddings for messages
+           where src >= dst
+    :param attention_heads: must be 0; any other value raises
+           `NotImplementedError`
+    :param conv_activations: activations passed to `GVPConv`
+    :param n_edge_gvps: number of GVPs used to update edge embeddings;
+           0 disables the edge update
+    :param layernorm: use `LayerNorm` if `True`, otherwise `nn.Identity`
+    :param eps: passed to `GVPConv` and `LayerNorm`
+    '''
+    def __init__(self, node_dims, edge_dims, vector_gate=False,
+                 n_message=3, n_feedforward=2, drop_rate=.1,
+                 autoregressive=False, attention_heads=0,
+                 conv_activations=(F.relu, torch.sigmoid),
+                 n_edge_gvps=0, layernorm=True, eps=1e-8):
+        super(GVPConvLayer, self).__init__()
+        if attention_heads == 0:
+            self.conv = GVPConv(
+                    node_dims, node_dims, edge_dims, n_layers=n_message,
+                    vector_gate=vector_gate,
+                    aggr="add" if autoregressive else "mean",
+                    activations=conv_activations,
+                    eps=eps,
+            )
+        else:
+            raise NotImplementedError
+        # Two norm / dropout pairs: one after the message update, one after
+        # the feedforward update
+        if layernorm:
+            self.norm = nn.ModuleList([LayerNorm(node_dims, eps=eps) for _ in range(2)])
+        else:
+            self.norm = nn.ModuleList([nn.Identity() for _ in range(2)])
+        self.dropout = nn.ModuleList([Dropout(drop_rate) for _ in range(2)])
+        ff_func = []
+        if n_feedforward == 1:
+            ff_func.append(GVP(node_dims, node_dims, activations=(None, None)))
+        else:
+            # Hidden width: 4x the scalar channels, 2x the vector channels
+            hid_dims = 4*node_dims[0], 2*node_dims[1]
+            ff_func.append(GVP(node_dims, hid_dims, vector_gate=vector_gate))
+            for i in range(n_feedforward-2):
+                ff_func.append(GVP(hid_dims, hid_dims, vector_gate=vector_gate))
+            ff_func.append(GVP(hid_dims, node_dims, activations=(None, None)))
+        self.ff_func = nn.Sequential(*ff_func)
+        self.edge_message_func = None
+        if n_edge_gvps > 0:
+            si, vi = node_dims
+            se, ve = edge_dims
+            module_list = [
+                GVP((2*si + se, 2*vi + ve), edge_dims, vector_gate=vector_gate)
+            ]
+            for i in range(n_edge_gvps - 2):
+                module_list.append(GVP(edge_dims, edge_dims,
+                    vector_gate=vector_gate))
+            if n_edge_gvps > 1:
+                module_list.append(GVP(edge_dims, edge_dims,
+                    activations=(None, None)))
+            self.edge_message_func = nn.Sequential(*module_list)
+            if layernorm:
+                self.edge_norm = LayerNorm(edge_dims, eps=eps)
+            else:
+                self.edge_norm = nn.Identity()
+            self.edge_dropout = Dropout(drop_rate)
+    def forward(self, x, edge_index, edge_attr,
+                autoregressive_x=None, node_mask=None):
+        '''
+        :param x: tuple (s, V) of `torch.Tensor`
+        :param edge_index: array of shape [2, n_edges]
+        :param edge_attr: tuple (s, V) of `torch.Tensor`
+        :param autoregressive_x: tuple (s, V) of `torch.Tensor`.
+                If not `None`, will be used as src node embeddings
+                for forming messages where src >= dst. The current node
+                embeddings `x` will still be the base of the update and the
+                pointwise feedforward.
+        :param node_mask: array of type `bool` to index into the first
+                dim of node embeddings (s, V). If not `None`, only
+                these nodes will be updated.
+        :return: tuple `(x, edge_attr)` of updated node embeddings and
+                (possibly updated) edge embeddings
+        '''
+        # Optional edge update: residual + dropout + norm on the edge embeddings
+        if self.edge_message_func:
+            src, dst = edge_index
+            if autoregressive_x is None:
+                x_src = x[0][src], x[1][src]
+            else:
+                mask = (src < dst).unsqueeze(-1)
+                x_src = (
+                    torch.where(mask, x[0][src], autoregressive_x[0][src]),
+                    torch.where(mask.unsqueeze(-1), x[1][src],
+                        autoregressive_x[1][src])
+                )
+            x_dst = x[0][dst], x[1][dst]
+            x_edge = (
+                torch.cat([x_src[0], edge_attr[0], x_dst[0]], dim=-1),
+                torch.cat([x_src[1], edge_attr[1], x_dst[1]], dim=-2)
+            )
+            edge_attr_dh = self.edge_message_func(x_edge)
+            edge_attr = self.edge_norm(tuple_sum(edge_attr,
+                self.edge_dropout(edge_attr_dh)))
+        if autoregressive_x is not None:
+            # Edges with src < dst read from `x`, the rest from
+            # `autoregressive_x`; the two aggregated sums are added and then
+            # divided by the number of incoming edges per node
+            src, dst = edge_index
+            mask = src < dst
+            edge_index_forward = edge_index[:, mask]
+            edge_index_backward = edge_index[:, ~mask]
+            edge_attr_forward = tuple_index(edge_attr, mask)
+            edge_attr_backward = tuple_index(edge_attr, ~mask)
+            dh = tuple_sum(
+                self.conv(x, edge_index_forward, edge_attr_forward),
+                self.conv(autoregressive_x, edge_index_backward, edge_attr_backward)
+            )
+            # clamp(min=1) avoids dividing by zero for nodes with no incoming edge
+            count = scatter_add(torch.ones_like(dst), dst,
+                        dim_size=dh[0].size(0)).clamp(min=1).unsqueeze(-1)
+            dh = dh[0] / count, dh[1] / count.unsqueeze(-1)
+        else:
+            dh = self.conv(x, edge_index, edge_attr)
+        if node_mask is not None:
+            # Keep the full embeddings; only the selected nodes are updated
+            x_ = x
+            x, dh = tuple_index(x, node_mask), tuple_index(dh, node_mask)
+        x = self.norm[0](tuple_sum(x, self.dropout[0](dh)))
+        dh = self.ff_func(x)
+        x = self.norm[1](tuple_sum(x, self.dropout[1](dh)))
+        if node_mask is not None:
+            # Write the updated rows back in place into the input tensors
+            x_[0][node_mask], x_[1][node_mask] = x[0], x[1]
+            x = x_
+        return x, edge_attr

esm/esm/inverse_folding/gvp_transformer.py ADDED Viewed

	@@ -0,0 +1,137 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import argparse
+from typing import Any, Dict, List, Optional, Tuple, NamedTuple
+import torch
+from torch import nn
+from torch import Tensor
+import torch.nn.functional as F
+from scipy.spatial import transform
+from esm.data import Alphabet
+from .features import DihedralFeatures
+from .gvp_encoder import GVPEncoder
+from .gvp_utils import unflatten_graph
+from .gvp_transformer_encoder import GVPTransformerEncoder
+from .transformer_decoder import TransformerDecoder
+from .util import rotate, CoordBatchConverter
+class GVPTransformerModel(nn.Module):
+    """
+    GVP-Transformer inverse folding model.
+    Architecture: Geometric GVP-GNN as initial layers, followed by
+    sequence-to-sequence Transformer encoder and decoder.
+    """
+    def __init__(self, args, alphabet):
+        super().__init__()
+        encoder_embed_tokens = self.build_embedding(
+            args, alphabet, args.encoder_embed_dim,
+        )
+        decoder_embed_tokens = self.build_embedding(
+            args, alphabet, args.decoder_embed_dim,
+        )
+        encoder = self.build_encoder(args, alphabet, encoder_embed_tokens)
+        decoder = self.build_decoder(args, alphabet, decoder_embed_tokens)
+        self.args = args
+        self.encoder = encoder
+        self.decoder = decoder
+    @classmethod
+    def build_encoder(cls, args, src_dict, embed_tokens):
+        encoder = GVPTransformerEncoder(args, src_dict, embed_tokens)
+        return encoder
+    @classmethod
+    def build_decoder(cls, args, tgt_dict, embed_tokens):
+        decoder = TransformerDecoder(
+            args,
+            tgt_dict,
+            embed_tokens,
+        )
+        return decoder
+    @classmethod
+    def build_embedding(cls, args, dictionary, embed_dim):
+        num_embeddings = len(dictionary)
+        padding_idx = dictionary.padding_idx
+        emb = nn.Embedding(num_embeddings, embed_dim, padding_idx)
+        nn.init.normal_(emb.weight, mean=0, std=embed_dim ** -0.5)
+        nn.init.constant_(emb.weight[padding_idx], 0)
+        return emb
+    def forward(
+        self,
+        coords,
+        padding_mask,
+        confidence,
+        prev_output_tokens,
+        return_all_hiddens: bool = False,
+        features_only: bool = False,
+    ):
+        encoder_out = self.encoder(coords, padding_mask, confidence,
+            return_all_hiddens=return_all_hiddens)
+        logits, extra = self.decoder(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            features_only=features_only,
+            return_all_hiddens=return_all_hiddens,
+        )
+        return logits, extra
+    def sample(self, coords, partial_seq=None, temperature=1.0, confidence=None):
+        """
+        Samples sequences based on multinomial sampling (no beam search).
+        Args:
+            coords: L x 3 x 3 list representing one backbone
+            partial_seq: Optional, partial sequence with mask tokens if part of
+                the sequence is known
+            temperature: sampling temperature, use low temperature for higher
+                sequence recovery and high temperature for higher diversity
+            confidence: optional length L list of confidence scores for coordinates
+        """
+        L = len(coords)
+        # Convert to batch format
+        batch_converter = CoordBatchConverter(self.decoder.dictionary)
+        batch_coords, confidence, _, _, padding_mask = (
+            batch_converter([(coords, confidence, None)])
+        )
+        # Start with prepend token
+        mask_idx = self.decoder.dictionary.get_idx('<mask>')
+        sampled_tokens = torch.full((1, 1+L), mask_idx, dtype=int)
+        sampled_tokens[0, 0] = self.decoder.dictionary.get_idx('<cath>')
+        if partial_seq is not None:
+            for i, c in enumerate(partial_seq):
+                sampled_tokens[0, i+1] = self.decoder.dictionary.get_idx(c)
+        # Save incremental states for faster sampling
+        incremental_state = dict()
+        # Run encoder only once
+        encoder_out = self.encoder(batch_coords, padding_mask, confidence)
+        # Decode one token at a time
+        for i in range(1, L+1):
+            if sampled_tokens[0, i] != mask_idx:
+                continue
+            logits, _ = self.decoder(
+                sampled_tokens[:, :i],
+                encoder_out,
+                incremental_state=incremental_state,
+            )
+            logits = logits[0].transpose(0, 1)
+            logits /= temperature
+            probs = F.softmax(logits, dim=-1)
+            sampled_tokens[:, i] = torch.multinomial(probs, 1).squeeze(-1)
+        sampled_seq = sampled_tokens[0, 1:]
+        # Convert back to string via lookup
+        return ''.join([self.decoder.dictionary.get_tok(a) for a in sampled_seq])

esm/esm/inverse_folding/gvp_transformer_encoder.py ADDED Viewed

	@@ -0,0 +1,184 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Contents of this file were adapted from the open source fairseq repository.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import argparse
+import math
+from typing import Dict, List, Optional
+import torch
+import torch.nn as nn
+from torch import Tensor
+from esm.modules import SinusoidalPositionalEmbedding
+from .features import GVPInputFeaturizer, DihedralFeatures
+from .gvp_encoder import GVPEncoder
+from .transformer_layer import TransformerEncoderLayer
+from .util import nan_to_num, get_rotation_frames, rotate, rbf
+class GVPTransformerEncoder(nn.Module):
+    """
+    Transformer encoder consisting of *args.encoder_layers* layers. Each layer
+    is a :class:`TransformerEncoderLayer`. The input of the first layer is the
+    sum of several embeddings of the backbone (see :meth:`forward_embedding`),
+    one of which is the output of a :class:`GVPEncoder`.
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        dictionary: encoding dictionary; `padding_idx` and `get_idx` are used
+        embed_tokens (torch.nn.Embedding): input embedding
+    """
+    def __init__(self, args, dictionary, embed_tokens):
+        super().__init__()
+        self.args = args
+        self.dictionary = dictionary
+        self.dropout_module = nn.Dropout(args.dropout)
+        embed_dim = embed_tokens.embedding_dim
+        self.padding_idx = embed_tokens.padding_idx
+        self.embed_tokens = embed_tokens
+        self.embed_scale = math.sqrt(embed_dim)
+        self.embed_positions = SinusoidalPositionalEmbedding(
+            embed_dim,
+            self.padding_idx,
+        )
+        # NOTE(review): the input widths 15 and 16 must match the feature sizes
+        # produced by GVPInputFeaturizer.get_node_features / rbf -- confirm
+        # against those helpers, which are defined in other modules.
+        self.embed_gvp_input_features = nn.Linear(15, embed_dim)
+        self.embed_confidence = nn.Linear(16, embed_dim)
+        self.embed_dihedrals = DihedralFeatures(embed_dim)
+        # Collect every `gvp_*` argument into its own namespace, with the
+        # `gvp_` prefix stripped, for the GVP encoder
+        gvp_args = argparse.Namespace()
+        for k, v in vars(args).items():
+            if k.startswith("gvp_"):
+                setattr(gvp_args, k[4:], v)
+        self.gvp_encoder = GVPEncoder(gvp_args)
+        # Scalar channels plus flattened 3-d vector channels
+        gvp_out_dim = gvp_args.node_hidden_dim_scalar + (3 *
+                gvp_args.node_hidden_dim_vector)
+        self.embed_gvp_output = nn.Linear(gvp_out_dim, embed_dim)
+        self.layers = nn.ModuleList([])
+        self.layers.extend(
+            [self.build_encoder_layer(args) for i in range(args.encoder_layers)]
+        )
+        self.num_layers = len(self.layers)
+        self.layer_norm = nn.LayerNorm(embed_dim)
+    def build_encoder_layer(self, args):
+        """Construct one Transformer encoder layer."""
+        return TransformerEncoderLayer(args)
+    def forward_embedding(self, coords, padding_mask, confidence):
+        """
+        Build the input embedding of the Transformer layers.
+        Args:
+            coords: N, CA, C backbone coordinates in shape length x 3 (atoms) x 3
+                (presumably with a leading batch dimension, as documented in
+                :meth:`forward` -- TODO confirm)
+            padding_mask: boolean Tensor (true for padding) of shape length
+            confidence: confidence scores between 0 and 1 of shape length
+        Returns:
+            Tuple (x, components)
+            - x: sum of all component embeddings plus positional embeddings,
+              after dropout
+            - components: dict of the individual embeddings, with keys
+              "tokens", "diherals", "gvp_out", "confidence" and
+              "gvp_input_features"
+        """
+        components = dict()
+        # A residue counts as having coordinates only if all of its values are
+        # finite; non-finite values are then replaced through nan_to_num
+        coord_mask = torch.all(torch.all(torch.isfinite(coords), dim=-1), dim=-1)
+        coords = nan_to_num(coords)
+        # Padding positions get the padding token, all others the <mask> token
+        mask_tokens = (
+            padding_mask * self.dictionary.padding_idx +
+            ~padding_mask * self.dictionary.get_idx("<mask>")
+        )
+        components["tokens"] = self.embed_tokens(mask_tokens) * self.embed_scale
+        components["diherals"] = self.embed_dihedrals(coords)
+        # GVP encoder
+        gvp_out_scalars, gvp_out_vectors = self.gvp_encoder(coords,
+                coord_mask, padding_mask, confidence)
+        R = get_rotation_frames(coords)
+        # Rotate to local rotation frame for rotation-invariance
+        gvp_out_features = torch.cat([
+            gvp_out_scalars,
+            rotate(gvp_out_vectors, R.transpose(-2, -1)).flatten(-2, -1),
+        ], dim=-1)
+        components["gvp_out"] = self.embed_gvp_output(gvp_out_features)
+        components["confidence"] = self.embed_confidence(
+             rbf(confidence, 0., 1.))
+        # In addition to GVP encoder outputs, also directly embed GVP input node
+        # features to the Transformer
+        scalar_features, vector_features = GVPInputFeaturizer.get_node_features(
+            coords, coord_mask, with_coord_mask=False)
+        features = torch.cat([
+            scalar_features,
+            rotate(vector_features, R.transpose(-2, -1)).flatten(-2, -1),
+        ], dim=-1)
+        components["gvp_input_features"] = self.embed_gvp_input_features(features)
+        embed = sum(components.values())
+        # for k, v in components.items():
+        #     print(k, torch.mean(v, dim=(0,1)), torch.std(v, dim=(0,1)))
+        x = embed
+        x = x + self.embed_positions(mask_tokens)
+        x = self.dropout_module(x)
+        return x, components
+    def forward(
+        self,
+        coords,
+        encoder_padding_mask,
+        confidence,
+        return_all_hiddens: bool = False,
+    ):
+        """
+        Args:
+            coords (Tensor): backbone coordinates
+                shape batch_size x num_residues x num_atoms (3 for N, CA, C) x 3
+            encoder_padding_mask (ByteTensor): the positions of
+                  padding elements of shape `(batch_size x num_residues)`
+            confidence (Tensor): the confidence score of shape (batch_size x
+                num_residues). The value is between 0. and 1. for each residue
+                coordinate, or -1. if no coordinate is given
+            return_all_hiddens (bool, optional): also return all of the
+                intermediate hidden states (default: False).
+        Returns:
+            dict (every value except **encoder_states** is wrapped in a
+            one-element list):
+                - **encoder_out** (Tensor): the last encoder layer's output of
+                  shape `(num_residues, batch_size, embed_dim)`
+                - **encoder_padding_mask** (ByteTensor): the positions of
+                  padding elements of shape `(batch_size, num_residues)`
+                - **encoder_embedding** (dict): the individual component
+                  embeddings returned by :meth:`forward_embedding`
+                - **encoder_states** (List[Tensor]): all intermediate
+                  hidden states of shape `(num_residues, batch_size, embed_dim)`.
+                  Only populated if *return_all_hiddens* is True.
+        """
+        x, encoder_embedding = self.forward_embedding(coords,
+                encoder_padding_mask, confidence)
+        # account for padding while computing the representation
+        x = x * (1 - encoder_padding_mask.unsqueeze(-1).type_as(x))
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+        encoder_states = []
+        if return_all_hiddens:
+            encoder_states.append(x)
+        # encoder layers
+        for layer in self.layers:
+            x = layer(
+                x, encoder_padding_mask=encoder_padding_mask
+            )
+            if return_all_hiddens:
+                assert encoder_states is not None
+                encoder_states.append(x)
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+        return {
+            "encoder_out": [x],  # T x B x C
+            "encoder_padding_mask": [encoder_padding_mask],  # B x T
+            "encoder_embedding": [encoder_embedding],  # dictionary
+            "encoder_states": encoder_states,  # List[T x B x C]
+        }

esm/esm/inverse_folding/gvp_utils.py ADDED Viewed

	@@ -0,0 +1,68 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+def flatten_graph(node_embeddings, edge_embeddings, edge_index):
+    """
+    Flattens the graph into a batch size one (with disconnected subgraphs for
+    each example) to be compatible with pytorch-geometric package.
+    Args:
+        node_embeddings: node embeddings in tuple form (scalar, vector)
+                - scalar: shape batch size x nodes x node_embed_dim
+                - vector: shape batch size x nodes x node_embed_dim x 3
+        edge_embeddings: edge embeddings of in tuple form (scalar, vector)
+                - scalar: shape batch size x edges x edge_embed_dim
+                - vector: shape batch size x edges x edge_embed_dim x 3
+        edge_index: shape batch_size x 2 (source node and target node) x edges
+    Returns:
+        node_embeddings: node embeddings in tuple form (scalar, vector)
+                - scalar: shape batch total_nodes x node_embed_dim
+                - vector: shape batch total_nodes x node_embed_dim x 3
+        edge_embeddings: edge embeddings of in tuple form (scalar, vector)
+                - scalar: shape batch total_edges x edge_embed_dim
+                - vector: shape batch total_edges x edge_embed_dim x 3
+        edge_index: shape 2 x total_edges
+    """
+    x_s, x_v = node_embeddings
+    e_s, e_v = edge_embeddings
+    batch_size, N = x_s.shape[0], x_s.shape[1]
+    node_embeddings = (torch.flatten(x_s, 0, 1), torch.flatten(x_v, 0, 1))
+    edge_embeddings = (torch.flatten(e_s, 0, 1), torch.flatten(e_v, 0, 1))
+    edge_mask = torch.any(edge_index != -1, dim=1)
+    # Re-number the nodes by adding batch_idx * N to each batch
+    edge_index = edge_index + (torch.arange(batch_size, device=edge_index.device) *
+            N).unsqueeze(-1).unsqueeze(-1)
+    edge_index = edge_index.permute(1, 0, 2).flatten(1, 2)
+    edge_mask = edge_mask.flatten()
+    edge_index = edge_index[:, edge_mask]
+    edge_embeddings = (
+        edge_embeddings[0][edge_mask, :],
+        edge_embeddings[1][edge_mask, :]
+    )
+    return node_embeddings, edge_embeddings, edge_index
+def unflatten_graph(node_embeddings, batch_size):
+    """
+    Unflattens node embeddings.
+    Args:
+        node_embeddings: node embeddings in tuple form (scalar, vector)
+                - scalar: shape batch total_nodes x node_embed_dim
+                - vector: shape batch total_nodes x node_embed_dim x 3
+        batch_size: int
+    Returns:
+        node_embeddings: node embeddings in tuple form (scalar, vector)
+                - scalar: shape batch size x nodes x node_embed_dim
+                - vector: shape batch size x nodes x node_embed_dim x 3
+    """
+    x_s, x_v = node_embeddings
+    x_s = x_s.reshape(batch_size, -1, x_s.shape[1])
+    x_v = x_v.reshape(batch_size, -1, x_v.shape[1], x_v.shape[2])
+    return (x_s, x_v)

esm/esm/inverse_folding/multichain_util.py ADDED Viewed

	@@ -0,0 +1,151 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import biotite.structure
+import numpy as np
+import torch
+from typing import Sequence, Tuple, List
+from esm.inverse_folding.util import (
+    load_structure,
+    extract_coords_from_structure,
+    load_coords,
+    get_sequence_loss,
+    get_encoder_output,
+)
+def extract_coords_from_complex(structure: biotite.structure.AtomArray):
+    """
+    Args:
+        structure: biotite AtomArray
+    Returns:
+        Tuple (coords_list, seq_list)
+        - coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
+          coordinates representing the backbone of each chain
+        - seqs: Dictionary mapping chain ids to native sequences of each chain
+    """
+    coords = {}
+    seqs = {}
+    all_chains = biotite.structure.get_chains(structure)
+    for chain_id in all_chains:
+        chain = structure[structure.chain_id == chain_id]
+        coords[chain_id], seqs[chain_id] = extract_coords_from_structure(chain)
+    return coords, seqs
+def load_complex_coords(fpath, chains):
+    """
+    Args:
+        fpath: filepath to either pdb or cif file
+        chains: the chain ids (the order matters for autoregressive model)
+    Returns:
+        Tuple (coords_list, seq_list)
+        - coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
+          coordinates representing the backbone of each chain
+        - seqs: Dictionary mapping chain ids to native sequences of each chain
+    """
+    structure = load_structure(fpath, chains)
+    return extract_coords_from_complex(structure)
+def _concatenate_coords(coords, target_chain_id, padding_length=10):
+    """
+    Args:
+        coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
+            coordinates representing the backbone of each chain
+        target_chain_id: The chain id to sample sequences for
+        padding_length: Length of padding between concatenated chains
+    Returns:
+        Tuple (coords, seq)
+            - coords is an L x 3 x 3 array for N, CA, C coordinates, a
+              concatenation of the chains with padding in between
+            - seq is the extracted sequence, with padding tokens inserted
+              between the concatenated chains
+    """
+    pad_coords = np.full((padding_length, 3, 3), np.nan, dtype=np.float32)
+    # For best performance, put the target chain first in concatenation.
+    coords_list = [coords[target_chain_id]]
+    for chain_id in coords:
+        if chain_id == target_chain_id:
+            continue
+        coords_list.append(pad_coords)
+        coords_list.append(coords[chain_id])
+    coords_concatenated = np.concatenate(coords_list, axis=0)
+    return coords_concatenated
+def sample_sequence_in_complex(model, coords, target_chain_id, temperature=1.,
+        padding_length=10):
+    """
+    Samples sequence for one chain in a complex.
+    Args:
+        model: An instance of the GVPTransformer model
+        coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
+            coordinates representing the backbone of each chain
+        target_chain_id: The chain id to sample sequences for
+        padding_length: padding length in between chains
+    Returns:
+        Sampled sequence for the target chain
+    """
+    target_chain_len = coords[target_chain_id].shape[0]
+    all_coords = _concatenate_coords(coords, target_chain_id)
+    # Supply padding tokens for other chains to avoid unused sampling for speed
+    padding_pattern = ['<pad>'] * all_coords.shape[0]
+    for i in range(target_chain_len):
+        padding_pattern[i] = '<mask>'
+    sampled = model.sample(all_coords, partial_seq=padding_pattern,
+            temperature=temperature)
+    sampled = sampled[:target_chain_len]
+    return sampled
+def score_sequence_in_complex(model, alphabet, coords, target_chain_id,
+        target_seq, padding_length=10):
+    """
+    Scores sequence for one chain in a complex.
+    Args:
+        model: An instance of the GVPTransformer model
+        alphabet: Alphabet for the model
+        coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
+            coordinates representing the backbone of each chain
+        target_chain_id: The chain id to sample sequences for
+        target_seq: Target sequence for the target chain for scoring.
+        padding_length: padding length in between chains
+    Returns:
+        Tuple (ll_fullseq, ll_withcoord)
+        - ll_fullseq: Average log-likelihood over the full target chain
+        - ll_withcoord: Average log-likelihood in target chain excluding those
+            residues without coordinates
+    """
+    all_coords = _concatenate_coords(coords, target_chain_id)
+    loss, target_padding_mask = get_sequence_loss(model, alphabet, all_coords,
+            target_seq)
+    ll_fullseq = -np.sum(loss * ~target_padding_mask) / np.sum(
+            ~target_padding_mask)
+    # Also calculate average when excluding masked portions
+    coord_mask = np.all(np.isfinite(coords[target_chain_id]), axis=(-1, -2))
+    ll_withcoord = -np.sum(loss * coord_mask) / np.sum(coord_mask)
+    return ll_fullseq, ll_withcoord
+def get_encoder_output_for_complex(model, alphabet, coords, target_chain_id):
+    """
+    Args:
+        model: An instance of the GVPTransformer model
+        alphabet: Alphabet for the model
+        coords: Dictionary mapping chain ids to L x 3 x 3 array for N, CA, C
+            coordinates representing the backbone of each chain
+        target_chain_id: The chain id to sample sequences for
+    Returns:
+        Dictionary mapping chain id to encoder output for each chain
+    """
+    all_coords = _concatenate_coords(coords, target_chain_id)
+    all_rep = get_encoder_output(model, alphabet, all_coords)
+    target_chain_len = coords[target_chain_id].shape[0]
+    return all_rep[:target_chain_len]

esm/esm/inverse_folding/transformer_decoder.py ADDED Viewed

	@@ -0,0 +1,228 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Contents of this file were adapted from the open source fairseq repository.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+from typing import Any, Dict, List, Optional
+import torch
+import torch.nn as nn
+from torch import Tensor
+from esm.modules import SinusoidalPositionalEmbedding
+from .transformer_layer import TransformerDecoderLayer
+def fill_with_neg_inf(t):
+    """FP16-compatible function that fills a tensor with -inf."""
+    return t.float().fill_(float("-inf")).type_as(t)
+class TransformerDecoder(nn.Module):
+    """
+    Transformer decoder consisting of *args.decoder_layers* layers. Each layer
+    is a :class:`TransformerDecoderLayer`.
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        dictionary: decoding dictionary; its length sets the size of the
+            output projection
+        embed_tokens (torch.nn.Embedding): output embedding
+    """
+    def __init__(
+        self,
+        args,
+        dictionary,
+        embed_tokens,
+    ):
+        super().__init__()
+        self.args = args
+        self.dictionary = dictionary
+        # Cache for the causal mask, (re)built on demand by buffered_future_mask
+        self._future_mask = torch.empty(0)
+        self.dropout_module = nn.Dropout(args.dropout)
+        input_embed_dim = embed_tokens.embedding_dim
+        embed_dim = args.decoder_embed_dim
+        self.embed_dim = embed_dim
+        self.padding_idx = embed_tokens.padding_idx
+        self.embed_tokens = embed_tokens
+        self.embed_scale = math.sqrt(embed_dim)
+        # Only project when the token embedding width differs from the
+        # decoder width
+        self.project_in_dim = (
+            nn.Linear(input_embed_dim, embed_dim, bias=False)
+            if embed_dim != input_embed_dim
+            else None
+        )
+        self.embed_positions = SinusoidalPositionalEmbedding(
+            embed_dim,
+            self.padding_idx,
+        )
+        self.layers = nn.ModuleList([])
+        self.layers.extend(
+            [
+                self.build_decoder_layer(args)
+                for _ in range(args.decoder_layers)
+            ]
+        )
+        self.num_layers = len(self.layers)
+        self.layer_norm = nn.LayerNorm(embed_dim)
+        self.build_output_projection(args, dictionary)
+    def build_output_projection(self, args, dictionary):
+        """Create the bias-free linear map from features to vocabulary logits."""
+        self.output_projection = nn.Linear(
+            args.decoder_embed_dim, len(dictionary), bias=False
+        )
+        nn.init.normal_(
+            self.output_projection.weight, mean=0, std=args.decoder_embed_dim ** -0.5
+        )
+    def build_decoder_layer(self, args):
+        """Construct one Transformer decoder layer."""
+        return TransformerDecoderLayer(args)
+    def forward(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        features_only: bool = False,
+        return_all_hiddens: bool = False,
+    ):
+        """
+        Args:
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for teacher forcing
+            encoder_out (optional): output from the encoder, used for
+                encoder-side attention, should be of size T x B x C
+            incremental_state (dict): dictionary used for storing state during
+                :ref:`Incremental decoding`
+            features_only (bool, optional): only return features without
+                applying output layer (default: False).
+            return_all_hiddens (bool, optional): accepted for interface
+                compatibility; not used by this method.
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, vocab, tgt_len)`, or
+                  `(batch, embed_dim, tgt_len)` if *features_only* is set;
+                  `tgt_len` is 1 when *incremental_state* is given
+                - a dictionary with any model-specific outputs
+        """
+        x, extra = self.extract_features(
+            prev_output_tokens,
+            encoder_out=encoder_out,
+            incremental_state=incremental_state,
+        )
+        if not features_only:
+            x = self.output_layer(x)
+        x = x.transpose(1, 2) # B x T x C -> B x C x T
+        return x, extra
+    def extract_features(
+        self,
+        prev_output_tokens,
+        encoder_out: Optional[Dict[str, List[Tensor]]],
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+    ):
+        """
+        Similar to *forward* but only return features.
+        Includes several features from "Jointly Learning to Align and
+        Translate with Transformer Models" (Garg et al., EMNLP 2019).
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dictionary with key "inner_states": the layer input
+                  followed by the output of every decoder layer
+        """
+        bs, slen = prev_output_tokens.size()
+        enc: Optional[Tensor] = None
+        padding_mask: Optional[Tensor] = None
+        if encoder_out is not None and len(encoder_out["encoder_out"]) > 0:
+            enc = encoder_out["encoder_out"][0]
+            assert (
+                enc.size()[1] == bs
+            ), f"Expected enc.shape == (t, {bs}, c) got {enc.shape}"
+        if encoder_out is not None and len(encoder_out["encoder_padding_mask"]) > 0:
+            padding_mask = encoder_out["encoder_padding_mask"][0]
+        # embed positions
+        positions = self.embed_positions(
+            prev_output_tokens
+        )
+        # In incremental mode only the newest token (and its position) is fed
+        # through the layers
+        if incremental_state is not None:
+            prev_output_tokens = prev_output_tokens[:, -1:]
+            positions = positions[:, -1:]
+        # embed tokens and positions
+        x = self.embed_scale * self.embed_tokens(prev_output_tokens)
+        if self.project_in_dim is not None:
+            x = self.project_in_dim(x)
+        x += positions
+        x = self.dropout_module(x)
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+        self_attn_padding_mask: Optional[Tensor] = None
+        if prev_output_tokens.eq(self.padding_idx).any():
+            self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)
+        # decoder layers
+        attn: Optional[Tensor] = None
+        inner_states: List[Optional[Tensor]] = [x]
+        for idx, layer in enumerate(self.layers):
+            # A causal mask is only built when the full prefix is processed at
+            # once; in incremental mode none is passed
+            if incremental_state is None:
+                self_attn_mask = self.buffered_future_mask(x)
+            else:
+                self_attn_mask = None
+            x, layer_attn, _ = layer(
+                x,
+                enc,
+                padding_mask,
+                incremental_state,
+                self_attn_mask=self_attn_mask,
+                self_attn_padding_mask=self_attn_padding_mask,
+                need_attn=False,
+                need_head_weights=False,
+            )
+            inner_states.append(x)
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+        # T x B x C -> B x T x C
+        x = x.transpose(0, 1)
+        return x, {"inner_states": inner_states}
+    def output_layer(self, features):
+        """Project features to the vocabulary size."""
+        return self.output_projection(features)
+    def buffered_future_mask(self, tensor):
+        """
+        Return a `dim x dim` causal mask (-inf strictly above the diagonal,
+        0 elsewhere), where `dim` is the first dimension of `tensor`. The mask
+        is cached on the module and rebuilt when it is empty, on a different
+        device, or too small.
+        """
+        dim = tensor.size(0)
+        # self._future_mask.device != tensor.device is not working in TorchScript. This is a workaround.
+        if (
+            self._future_mask.size(0) == 0
+            or (not self._future_mask.device == tensor.device)
+            or self._future_mask.size(0) < dim
+        ):
+            self._future_mask = torch.triu(
+                fill_with_neg_inf(torch.zeros([dim, dim])), 1
+            )
+        # Match the dtype and device of the input tensor
+        self._future_mask = self._future_mask.to(tensor)
+        return self._future_mask[:dim, :dim]

esm/esm/inverse_folding/transformer_layer.py ADDED Viewed

	@@ -0,0 +1,304 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# Contents of this file were adapted from the open source fairseq repository.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Dict, List, Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from esm.multihead_attention import MultiheadAttention
+from torch import Tensor
+class TransformerEncoderLayer(nn.Module):
+    """Encoder layer block.
+    `layernorm -> dropout -> add residual`
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+    """
+    def __init__(self, args):
+        super().__init__()
+        self.args = args
+        self.embed_dim = args.encoder_embed_dim
+        self.self_attn = self.build_self_attention(self.embed_dim, args)
+        self.self_attn_layer_norm = torch.nn.LayerNorm(self.embed_dim)
+        self.dropout_module = nn.Dropout(args.dropout)
+        self.activation_fn = F.relu
+        self.fc1 = self.build_fc1(
+            self.embed_dim,
+            args.encoder_ffn_embed_dim,
+        )
+        self.fc2 = self.build_fc2(
+            args.encoder_ffn_embed_dim,
+            self.embed_dim,
+        )
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+    def build_fc1(self, input_dim, output_dim):
+        return nn.Linear(input_dim, output_dim)
+    def build_fc2(self, input_dim, output_dim):
+        return nn.Linear(input_dim, output_dim)
+    def build_self_attention(self, embed_dim, args):
+        return MultiheadAttention(
+            embed_dim,
+            args.encoder_attention_heads,
+            dropout=args.attention_dropout,
+            self_attention=True,
+        )
+    def residual_connection(self, x, residual):
+        return residual + x
+    def forward(
+        self,
+        x,
+        encoder_padding_mask: Optional[Tensor],
+        attn_mask: Optional[Tensor] = None,
+    ):
+        """
+        Args:
+            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
+            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
+                `(batch, seq_len)` where padding elements are indicated by ``1``.
+            attn_mask (ByteTensor): binary tensor of shape `(tgt_len, src_len)`,
+                where `tgt_len` is the length of output and `src_len` is the
+                length of input, though here both are equal to `seq_len`.
+                `attn_mask[tgt_i, src_j] = 1` means that when calculating the
+                embedding for `tgt_i`, we exclude (mask out) `src_j`. This is
+                useful for strided self-attention.
+        Returns:
+            encoded output of shape `(seq_len, batch, embed_dim)`
+        """
+        # anything in original attn_mask = 1, becomes -1e8
+        # anything in original attn_mask = 0, becomes 0
+        # Note that we cannot use -inf here, because at some edge cases,
+        # the attention weight (before softmax) for some padded element in query
+        # will become -inf, which results in NaN in model parameters
+        if attn_mask is not None:
+            attn_mask = attn_mask.masked_fill(
+                attn_mask.to(torch.bool), -1e8 if x.dtype == torch.float32 else -1e4
+            )
+        residual = x
+        x = self.self_attn_layer_norm(x)
+        x, _ = self.self_attn(
+            query=x,
+            key=x,
+            value=x,
+            key_padding_mask=encoder_padding_mask,
+            need_weights=False,
+            attn_mask=attn_mask,
+        )
+        x = self.dropout_module(x)
+        x = self.residual_connection(x, residual)
+        residual = x
+        x = self.final_layer_norm(x)
+        x = self.activation_fn(self.fc1(x))
+        x = self.fc2(x)
+        x = self.dropout_module(x)
+        x = self.residual_connection(x, residual)
+        return x
+class TransformerDecoderLayer(nn.Module):
+    """Decoder layer block.
+    `layernorm -> dropout -> add residual`
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        no_encoder_attn (bool, optional): whether to attend to encoder outputs
+            (default: False).
+    """
+    def __init__(
+        self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False
+    ):
+        super().__init__()
+        self.embed_dim = args.decoder_embed_dim
+        self.dropout_module = nn.Dropout(args.dropout)
+        self.self_attn = self.build_self_attention(
+            self.embed_dim,
+            args,
+            add_bias_kv=add_bias_kv,
+            add_zero_attn=add_zero_attn,
+        )
+        self.nh = self.self_attn.num_heads
+        self.head_dim = self.self_attn.head_dim
+        self.activation_fn = F.relu
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        if no_encoder_attn:
+            self.encoder_attn = None
+            self.encoder_attn_layer_norm = None
+        else:
+            self.encoder_attn = self.build_encoder_attention(self.embed_dim, args)
+            self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.ffn_layernorm = (
+            LayerNorm(args.decoder_ffn_embed_dim)
+            if getattr(args, "scale_fc", False)
+            else None
+        )
+        self.w_resid = (
+            nn.Parameter(
+                torch.ones(
+                    self.embed_dim,
+                ),
+                requires_grad=True,
+            )
+            if getattr(args, "scale_resids", False)
+            else None
+        )
+        self.fc1 = self.build_fc1(
+            self.embed_dim,
+            args.decoder_ffn_embed_dim,
+        )
+        self.fc2 = self.build_fc2(
+            args.decoder_ffn_embed_dim,
+            self.embed_dim,
+        )
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.need_attn = True
+    def build_fc1(self, input_dim, output_dim):
+        return nn.Linear(input_dim, output_dim)
+    def build_fc2(self, input_dim, output_dim):
+        return nn.Linear(input_dim, output_dim)
+    def build_self_attention(
+        self, embed_dim, args, add_bias_kv=False, add_zero_attn=False
+    ):
+        return MultiheadAttention(
+            embed_dim,
+            args.decoder_attention_heads,
+            dropout=args.attention_dropout,
+            add_bias_kv=add_bias_kv,
+            add_zero_attn=add_zero_attn,
+            self_attention=True,
+        )
+    def build_encoder_attention(self, embed_dim, args):
+        return MultiheadAttention(
+            embed_dim,
+            args.decoder_attention_heads,
+            kdim=args.encoder_embed_dim,
+            vdim=args.encoder_embed_dim,
+            dropout=args.attention_dropout,
+            encoder_decoder_attention=True,
+        )
+    def residual_connection(self, x, residual):
+        return residual + x
+    def forward(
+        self,
+        x,
+        encoder_out: Optional[torch.Tensor] = None,
+        encoder_padding_mask: Optional[torch.Tensor] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        prev_self_attn_state: Optional[List[torch.Tensor]] = None,
+        prev_attn_state: Optional[List[torch.Tensor]] = None,
+        self_attn_mask: Optional[torch.Tensor] = None,
+        self_attn_padding_mask: Optional[torch.Tensor] = None,
+        need_attn: bool = False,
+        need_head_weights: bool = False,
+    ):
+        """
+        Args:
+            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
+            encoder_padding_mask (ByteTensor, optional): binary
+                ByteTensor of shape `(batch, src_len)` where padding
+                elements are indicated by ``1``.
+            need_attn (bool, optional): return attention weights
+            need_head_weights (bool, optional): return attention weights
+                for each head (default: return average over heads).
+        Returns:
+            encoded output of shape `(seq_len, batch, embed_dim)`
+        """
+        if need_head_weights:
+            need_attn = True
+        residual = x
+        x = self.self_attn_layer_norm(x)
+        if prev_self_attn_state is not None:
+            prev_key, prev_value = prev_self_attn_state[:2]
+            saved_state: Dict[str, Optional[Tensor]] = {
+                "prev_key": prev_key,
+                "prev_value": prev_value,
+            }
+            if len(prev_self_attn_state) >= 3:
+                saved_state["prev_key_padding_mask"] = prev_self_attn_state[2]
+            assert incremental_state is not None
+            self.self_attn._set_input_buffer(incremental_state, saved_state)
+        _self_attn_input_buffer = self.self_attn._get_input_buffer(incremental_state)
+        y = x
+        x, attn = self.self_attn(
+            query=x,
+            key=y,
+            value=y,
+            key_padding_mask=self_attn_padding_mask,
+            incremental_state=incremental_state,
+            need_weights=False,
+            attn_mask=self_attn_mask,
+        )
+        x = self.dropout_module(x)
+        x = self.residual_connection(x, residual)
+        if self.encoder_attn is not None and encoder_out is not None:
+            residual = x
+            x = self.encoder_attn_layer_norm(x)
+            if prev_attn_state is not None:
+                prev_key, prev_value = prev_attn_state[:2]
+                saved_state: Dict[str, Optional[Tensor]] = {
+                    "prev_key": prev_key,
+                    "prev_value": prev_value,
+                }
+                if len(prev_attn_state) >= 3:
+                    saved_state["prev_key_padding_mask"] = prev_attn_state[2]
+                assert incremental_state is not None
+                self.encoder_attn._set_input_buffer(incremental_state, saved_state)
+            x, attn = self.encoder_attn(
+                query=x,
+                key=encoder_out,
+                value=encoder_out,
+                key_padding_mask=encoder_padding_mask,
+                incremental_state=incremental_state,
+                static_kv=True,
+                need_weights=need_attn or (not self.training and self.need_attn),
+                need_head_weights=need_head_weights,
+            )
+            x = self.dropout_module(x)
+            x = self.residual_connection(x, residual)
+        residual = x
+        x = self.final_layer_norm(x)
+        x = self.activation_fn(self.fc1(x))
+        if self.ffn_layernorm is not None:
+            x = self.ffn_layernorm(x)
+        x = self.fc2(x)
+        x = self.dropout_module(x)
+        if self.w_resid is not None:
+            residual = torch.mul(self.w_resid, residual)
+        x = self.residual_connection(x, residual)
+        return x, attn, None

esm/esm/inverse_folding/util.py ADDED Viewed

	@@ -0,0 +1,320 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import json
+import math
+import biotite.structure
+from biotite.structure.io import pdbx, pdb
+from biotite.structure.residues import get_residues
+from biotite.structure import filter_backbone
+from biotite.structure import get_chains
+from biotite.sequence import ProteinSequence
+import numpy as np
+from scipy.spatial import transform
+from scipy.stats import special_ortho_group
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.data as data
+from typing import Sequence, Tuple, List
+from esm.data import BatchConverter
+def load_structure(fpath, chain=None):
+    """
+    Args:
+        fpath: filepath to either pdb or cif file
+        chain: the chain id or list of chain ids to load
+    Returns:
+        biotite.structure.AtomArray
+    """
+    if fpath.endswith('cif'):
+        with open(fpath) as fin:
+            pdbxf = pdbx.PDBxFile.read(fin)
+        structure = pdbx.get_structure(pdbxf, model=1)
+    elif fpath.endswith('pdb'):
+        with open(fpath) as fin:
+            pdbf = pdb.PDBFile.read(fin)
+        structure = pdb.get_structure(pdbf, model=1)
+    bbmask = filter_backbone(structure)
+    structure = structure[bbmask]
+    all_chains = get_chains(structure)
+    if len(all_chains) == 0:
+        raise ValueError('No chains found in the input file.')
+    if chain is None:
+        chain_ids = all_chains
+    elif isinstance(chain, list):
+        chain_ids = chain
+    else:
+        chain_ids = [chain]
+    for chain in chain_ids:
+        if chain not in all_chains:
+            raise ValueError(f'Chain {chain} not found in input file')
+    chain_filter = [a.chain_id in chain_ids for a in structure]
+    structure = structure[chain_filter]
+    return structure
+def extract_coords_from_structure(structure: biotite.structure.AtomArray):
+    """
+    Args:
+        structure: An instance of biotite AtomArray
+    Returns:
+        Tuple (coords, seq)
+            - coords is an L x 3 x 3 array for N, CA, C coordinates
+            - seq is the extracted sequence
+    """
+    coords = get_atom_coords_residuewise(["N", "CA", "C"], structure)
+    residue_identities = get_residues(structure)[1]
+    seq = ''.join([ProteinSequence.convert_letter_3to1(r) for r in residue_identities])
+    return coords, seq
+def load_coords(fpath, chain):
+    """
+    Args:
+        fpath: filepath to either pdb or cif file
+        chain: the chain id
+    Returns:
+        Tuple (coords, seq)
+            - coords is an L x 3 x 3 array for N, CA, C coordinates
+            - seq is the extracted sequence
+    """
+    structure = load_structure(fpath, chain)
+    return extract_coords_from_structure(structure)
+def get_atom_coords_residuewise(atoms: List[str], struct: biotite.structure.AtomArray):
+    """
+    Example for atoms argument: ["N", "CA", "C"]
+    """
+    def filterfn(s, axis=None):
+        filters = np.stack([s.atom_name == name for name in atoms], axis=1)
+        sum = filters.sum(0)
+        if not np.all(sum <= np.ones(filters.shape[1])):
+            raise RuntimeError("structure has multiple atoms with same name")
+        index = filters.argmax(0)
+        coords = s[index].coord
+        coords[sum == 0] = float("nan")
+        return coords
+    return biotite.structure.apply_residue_wise(struct, struct, filterfn)
+def get_sequence_loss(model, alphabet, coords, seq):
+    batch_converter = CoordBatchConverter(alphabet)
+    batch = [(coords, None, seq)]
+    coords, confidence, strs, tokens, padding_mask = batch_converter(batch)
+    prev_output_tokens = tokens[:, :-1]
+    target = tokens[:, 1:]
+    target_padding_mask = (target == alphabet.padding_idx)
+    logits, _ = model.forward(coords, padding_mask, confidence, prev_output_tokens)
+    loss = F.cross_entropy(logits, target, reduction='none')
+    loss = loss[0].detach().numpy()
+    target_padding_mask = target_padding_mask[0].numpy()
+    return loss, target_padding_mask
+def score_sequence(model, alphabet, coords, seq):
+    loss, target_padding_mask = get_sequence_loss(model, alphabet, coords, seq)
+    ll_fullseq = -np.sum(loss * ~target_padding_mask) / np.sum(~target_padding_mask)
+    # Also calculate average when excluding masked portions
+    coord_mask = np.all(np.isfinite(coords), axis=(-1, -2))
+    ll_withcoord = -np.sum(loss * coord_mask) / np.sum(coord_mask)
+    return ll_fullseq, ll_withcoord
+def get_encoder_output(model, alphabet, coords):
+    batch_converter = CoordBatchConverter(alphabet)
+    # the batch_converter is essential for forming the correct input format
+    batch = [(coords, None, None)]
+    coords, confidence, _, _, padding_mask = batch_converter(batch)
+    encoder_out = model.encoder.forward(coords, padding_mask, confidence,
+            return_all_hiddens=False)
+    # remove beginning and end (bos and eos tokens)
+    return encoder_out['encoder_out'][0][1:-1, 0]
+def rotate(v, R):
+    """
+    Rotates a vector by a rotation matrix.
+    Args:
+        v: 3D vector, tensor of shape (length x batch_size x channels x 3)
+        R: rotation matrix, tensor of shape (length x batch_size x 3 x 3)
+    Returns:
+        Rotated version of v by rotation matrix R.
+    """
+    R = R.unsqueeze(-3)
+    v = v.unsqueeze(-1)
+    return torch.sum(v * R, dim=-2)
+def get_rotation_frames(coords):
+    """
+    Returns a local rotation frame defined by N, CA, C positions.
+    Args:
+        coords: coordinates, tensor of shape (batch_size x length x 3 x 3)
+        where the third dimension is in order of N, CA, C
+    Returns:
+        Local relative rotation frames in shape (batch_size x length x 3 x 3)
+    """
+    v1 = coords[:, :, 2] - coords[:, :, 1]
+    v2 = coords[:, :, 0] - coords[:, :, 1]
+    e1 = normalize(v1, dim=-1)
+    u2 = v2 - e1 * torch.sum(e1 * v2, dim=-1, keepdim=True)
+    e2 = normalize(u2, dim=-1)
+    e3 = torch.cross(e1, e2, dim=-1)
+    R = torch.stack([e1, e2, e3], dim=-2)
+    return R
+def nan_to_num(ts, val=0.0):
+    """
+    Replaces nans in tensor with a fixed value.
+    """
+    val = torch.tensor(val, dtype=ts.dtype, device=ts.device)
+    return torch.where(~torch.isfinite(ts), val, ts)
+def rbf(values, v_min, v_max, n_bins=16):
+    """
+    Returns RBF encodings in a new dimension at the end.
+    """
+    rbf_centers = torch.linspace(v_min, v_max, n_bins, device=values.device)
+    rbf_centers = rbf_centers.view([1] * len(values.shape) + [-1])
+    rbf_std = (v_max - v_min) / n_bins
+    v_expand = torch.unsqueeze(values, -1)
+    z = (values.unsqueeze(-1) - rbf_centers) / rbf_std
+    return torch.exp(-z ** 2)
+def norm(tensor, dim, eps=1e-8, keepdim=False):
+    """
+    Returns L2 norm along a dimension.
+    """
+    return torch.sqrt(
+            torch.sum(torch.square(tensor), dim=dim, keepdim=keepdim) + eps)
+def normalize(tensor, dim=-1):
+    """
+    Normalizes a tensor along a dimension after removing nans.
+    """
+    return nan_to_num(
+        torch.div(tensor, norm(tensor, dim=dim, keepdim=True))
+    )
+class CoordBatchConverter(BatchConverter):
+    def __call__(self, raw_batch: Sequence[Tuple[Sequence, str]], device=None):
+        """
+        Args:
+            raw_batch: List of tuples (coords, confidence, seq)
+            In each tuple,
+                coords: list of floats, shape L x 3 x 3
+                confidence: list of floats, shape L; or scalar float; or None
+                seq: string of length L
+        Returns:
+            coords: Tensor of shape batch_size x L x 3 x 3
+            confidence: Tensor of shape batch_size x L
+            strs: list of strings
+            tokens: LongTensor of shape batch_size x L
+            padding_mask: ByteTensor of shape batch_size x L
+        """
+        self.alphabet.cls_idx = self.alphabet.get_idx("<cath>")
+        batch = []
+        for coords, confidence, seq in raw_batch:
+            if confidence is None:
+                confidence = 1.
+            if isinstance(confidence, float) or isinstance(confidence, int):
+                confidence = [float(confidence)] * len(coords)
+            if seq is None:
+                seq = 'X' * len(coords)
+            batch.append(((coords, confidence), seq))
+        coords_and_confidence, strs, tokens = super().__call__(batch)
+        # pad beginning and end of each protein due to legacy reasons
+        coords = [
+            F.pad(torch.tensor(cd), (0, 0, 0, 0, 1, 1), value=np.inf)
+            for cd, _ in coords_and_confidence
+        ]
+        confidence = [
+            F.pad(torch.tensor(cf), (1, 1), value=-1.)
+            for _, cf in coords_and_confidence
+        ]
+        coords = self.collate_dense_tensors(coords, pad_v=np.nan)
+        confidence = self.collate_dense_tensors(confidence, pad_v=-1.)
+        if device is not None:
+            coords = coords.to(device)
+            confidence = confidence.to(device)
+            tokens = tokens.to(device)
+        padding_mask = torch.isnan(coords[:,:,0,0])
+        coord_mask = torch.isfinite(coords.sum(-2).sum(-1))
+        confidence = confidence * coord_mask + (-1.) * padding_mask
+        return coords, confidence, strs, tokens, padding_mask
+    def from_lists(self, coords_list, confidence_list=None, seq_list=None, device=None):
+        """
+        Args:
+            coords_list: list of length batch_size, each item is a list of
+            floats in shape L x 3 x 3 to describe a backbone
+            confidence_list: one of
+                - None, default to highest confidence
+                - list of length batch_size, each item is a scalar
+                - list of length batch_size, each item is a list of floats of
+                    length L to describe the confidence scores for the backbone
+                    with values between 0. and 1.
+            seq_list: either None or a list of strings
+        Returns:
+            coords: Tensor of shape batch_size x L x 3 x 3
+            confidence: Tensor of shape batch_size x L
+            strs: list of strings
+            tokens: LongTensor of shape batch_size x L
+            padding_mask: ByteTensor of shape batch_size x L
+        """
+        batch_size = len(coords_list)
+        if confidence_list is None:
+            confidence_list = [None] * batch_size
+        if seq_list is None:
+            seq_list = [None] * batch_size
+        raw_batch = zip(coords_list, confidence_list, seq_list)
+        return self.__call__(raw_batch, device)
+    @staticmethod
+    def collate_dense_tensors(samples, pad_v):
+        """
+        Takes a list of tensors with the following dimensions:
+            [(d_11,       ...,           d_1K),
+             (d_21,       ...,           d_2K),
+             ...,
+             (d_N1,       ...,           d_NK)]
+        and stack + pads them into a single tensor of:
+        (N, max_i=1,N { d_i1 }, ..., max_i=1,N {diK})
+        """
+        if len(samples) == 0:
+            return torch.Tensor()
+        if len(set(x.dim() for x in samples)) != 1:
+            raise RuntimeError(
+                f"Samples has varying dimensions: {[x.dim() for x in samples]}"
+            )
+        (device,) = tuple(set(x.device for x in samples))  # assumes all on same device
+        max_shape = [max(lst) for lst in zip(*[x.shape for x in samples])]
+        result = torch.empty(
+            len(samples), *max_shape, dtype=samples[0].dtype, device=device
+        )
+        result.fill_(pad_v)
+        for i in range(len(samples)):
+            result_i = result[i]
+            t = samples[i]
+            result_i[tuple(slice(0, k) for k in t.shape)] = t
+        return result

esm/esm/model/esm1.py ADDED Viewed

	@@ -0,0 +1,200 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from ..modules import (
+    TransformerLayer,
+    LearnedPositionalEmbedding,
+    SinusoidalPositionalEmbedding,
+    RobertaLMHead,
+    ESM1bLayerNorm,
+    ContactPredictionHead,
+)
+class ProteinBertModel(nn.Module):
+    """Transformer protein language model covering the ESM-1 and ESM-1b variants.
+    The variant is chosen in `__init__`: `args.arch == "roberta_large"` builds
+    the ESM-1b sub-modules, anything else builds the ESM-1 sub-modules.
+    """
+    @classmethod
+    def add_args(cls, parser):
+        """Register the model hyper-parameter options on an argparse parser."""
+        parser.add_argument(
+            "--num_layers", default=36, type=int, metavar="N", help="number of layers"
+        )
+        parser.add_argument(
+            "--embed_dim", default=1280, type=int, metavar="N", help="embedding dimension"
+        )
+        parser.add_argument(
+            "--logit_bias", action="store_true", help="whether to apply bias to logits"
+        )
+        parser.add_argument(
+            "--ffn_embed_dim",
+            default=5120,
+            type=int,
+            metavar="N",
+            help="embedding dimension for FFN",
+        )
+        parser.add_argument(
+            "--attention_heads",
+            default=20,
+            type=int,
+            metavar="N",
+            help="number of attention heads",
+        )
+    def __init__(self, args, alphabet):
+        """
+        Args:
+            args: namespace of hyper-parameters; this class reads `arch`,
+                `embed_dim`, `ffn_embed_dim`, `attention_heads`, `layers`, and
+                depending on the variant `max_positions` (ESM-1b) or
+                `final_bias` (ESM-1), plus the optional
+                `emb_layer_norm_before` and `token_dropout` flags.
+            alphabet: token alphabet providing its length and the
+                `padding_idx`, `mask_idx`, `cls_idx`, `eos_idx`,
+                `prepend_bos` and `append_eos` attributes.
+        """
+        super().__init__()
+        self.args = args
+        self.alphabet_size = len(alphabet)
+        self.padding_idx = alphabet.padding_idx
+        self.mask_idx = alphabet.mask_idx
+        self.cls_idx = alphabet.cls_idx
+        self.eos_idx = alphabet.eos_idx
+        self.prepend_bos = alphabet.prepend_bos
+        self.append_eos = alphabet.append_eos
+        # Starts as a bool flag; the ESM-1b initialiser replaces it with a
+        # layer-norm module (or None).
+        self.emb_layer_norm_before = getattr(self.args, "emb_layer_norm_before", False)
+        if self.args.arch == "roberta_large":
+            self.model_version = "ESM-1b"
+            self._init_submodules_esm1b()
+        else:
+            self.model_version = "ESM-1"
+            self._init_submodules_esm1()
+    def _init_submodules_common(self):
+        """Build the parts shared by both variants: token embedding, layer stack, contact head."""
+        self.embed_tokens = nn.Embedding(
+            self.alphabet_size, self.args.embed_dim, padding_idx=self.padding_idx
+        )
+        self.layers = nn.ModuleList(
+            [
+                TransformerLayer(
+                    self.args.embed_dim,
+                    self.args.ffn_embed_dim,
+                    self.args.attention_heads,
+                    add_bias_kv=(self.model_version != "ESM-1b"),
+                    use_esm1b_layer_norm=(self.model_version == "ESM-1b"),
+                )
+                for _ in range(self.args.layers)
+            ]
+        )
+        # The contact head is sized by layers * attention_heads.
+        self.contact_head = ContactPredictionHead(
+            self.args.layers * self.args.attention_heads,
+            self.prepend_bos,
+            self.append_eos,
+            eos_idx=self.eos_idx,
+        )
+    def _init_submodules_esm1b(self):
+        """Build ESM-1b specifics: learned positions, embedding layer norms, LM head tied to the token embedding."""
+        self._init_submodules_common()
+        self.embed_scale = 1
+        self.embed_positions = LearnedPositionalEmbedding(
+            self.args.max_positions, self.args.embed_dim, self.padding_idx
+        )
+        self.emb_layer_norm_before = (
+            ESM1bLayerNorm(self.args.embed_dim) if self.emb_layer_norm_before else None
+        )
+        self.emb_layer_norm_after = ESM1bLayerNorm(self.args.embed_dim)
+        self.lm_head = RobertaLMHead(
+            embed_dim=self.args.embed_dim,
+            output_dim=self.alphabet_size,
+            weight=self.embed_tokens.weight,
+        )
+    def _init_submodules_esm1(self):
+        """Build ESM-1 specifics: sinusoidal positions, sqrt(embed_dim) scaling, separate output projection."""
+        self._init_submodules_common()
+        self.embed_scale = math.sqrt(self.args.embed_dim)
+        self.embed_positions = SinusoidalPositionalEmbedding(self.args.embed_dim, self.padding_idx)
+        self.embed_out = nn.Parameter(torch.zeros((self.alphabet_size, self.args.embed_dim)))
+        self.embed_out_bias = None
+        if self.args.final_bias:
+            self.embed_out_bias = nn.Parameter(torch.zeros(self.alphabet_size))
+    def forward(self, tokens, repr_layers=[], need_head_weights=False, return_contacts=False):
+        """
+        Args:
+            tokens: 2-D tensor of token indices, (batch, seq_len)
+            repr_layers: layer indices whose hidden states should be returned;
+                0 is the embedding output, i is the output of the i-th layer.
+                The default list is only read, never mutated.
+            need_head_weights: if True, also return attention weights
+            return_contacts: if True, also return contact predictions
+                (implies need_head_weights)
+        Returns:
+            dict with "logits" and "representations" (layer index -> tensor),
+            plus "attentions" when head weights are requested and "contacts"
+            when return_contacts is set.
+        """
+        if return_contacts:
+            need_head_weights = True
+        assert tokens.ndim == 2
+        padding_mask = tokens.eq(self.padding_idx)  # B, T
+        x = self.embed_scale * self.embed_tokens(tokens)
+        if getattr(self.args, "token_dropout", False):
+            # Zero the embeddings of mask tokens in place, then rescale by
+            # (1 - train mask ratio) / (1 - mask ratio seen in this input).
+            x.masked_fill_((tokens == self.mask_idx).unsqueeze(-1), 0.0)
+            # x: B x T x C
+            mask_ratio_train = 0.15 * 0.8
+            src_lengths = (~padding_mask).sum(-1)
+            mask_ratio_observed = (tokens == self.mask_idx).sum(-1).float() / src_lengths
+            x = x * (1 - mask_ratio_train) / (1 - mask_ratio_observed)[:, None, None]
+        x = x + self.embed_positions(tokens)
+        if self.model_version == "ESM-1b":
+            if self.emb_layer_norm_before:
+                x = self.emb_layer_norm_before(x)
+            if padding_mask is not None:
+                # Zero out the embeddings at padded positions.
+                x = x * (1 - padding_mask.unsqueeze(-1).type_as(x))
+        repr_layers = set(repr_layers)
+        hidden_representations = {}
+        if 0 in repr_layers:
+            hidden_representations[0] = x
+        if need_head_weights:
+            attn_weights = []
+        # (B, T, E) => (T, B, E)
+        x = x.transpose(0, 1)
+        # Pass no mask at all to the layers when nothing is padded.
+        if not padding_mask.any():
+            padding_mask = None
+        for layer_idx, layer in enumerate(self.layers):
+            x, attn = layer(
+                x, self_attn_padding_mask=padding_mask, need_head_weights=need_head_weights
+            )
+            if (layer_idx + 1) in repr_layers:
+                hidden_representations[layer_idx + 1] = x.transpose(0, 1)
+            if need_head_weights:
+                # (H, B, T, T) => (B, H, T, T)
+                attn_weights.append(attn.transpose(1, 0))
+        if self.model_version == "ESM-1b":
+            x = self.emb_layer_norm_after(x)
+            x = x.transpose(0, 1)  # (T, B, E) => (B, T, E)
+            # last hidden representation should have layer norm applied
+            if (layer_idx + 1) in repr_layers:
+                hidden_representations[layer_idx + 1] = x
+            x = self.lm_head(x)
+        else:
+            x = F.linear(x, self.embed_out, bias=self.embed_out_bias)
+            x = x.transpose(0, 1)  # (T, B, E) => (B, T, E)
+        result = {"logits": x, "representations": hidden_representations}
+        if need_head_weights:
+            # attentions: B x L x H x T x T
+            attentions = torch.stack(attn_weights, 1)
+            if self.model_version == "ESM-1":
+                # ESM-1 models have an additional null-token for attention, which we remove
+                attentions = attentions[..., :-1]
+            if padding_mask is not None:
+                # Zero attention entries whose query or key position is padding.
+                attention_mask = 1 - padding_mask.type_as(attentions)
+                attention_mask = attention_mask.unsqueeze(1) * attention_mask.unsqueeze(2)
+                attentions = attentions * attention_mask[:, None, None, :, :]
+            result["attentions"] = attentions
+            if return_contacts:
+                contacts = self.contact_head(tokens, attentions)
+                result["contacts"] = contacts
+        return result
+    def predict_contacts(self, tokens):
+        """Run the model with contact prediction enabled and return only the contacts."""
+        return self(tokens, return_contacts=True)["contacts"]
+    @property
+    def num_layers(self):
+        """Number of transformer layers, read from `args.layers`."""
+        return self.args.layers

esm/esm/model/esm2.py ADDED Viewed

	@@ -0,0 +1,147 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Union
+import torch
+import torch.nn as nn
+import esm
+from esm.modules import ContactPredictionHead, ESM1bLayerNorm, RobertaLMHead, TransformerLayer
+class ESM2(nn.Module):
+    """Transformer protein language model whose layers use rotary embeddings
+    and the ESM-1b layer norm, with an LM head tied to the token embedding.
+    """
+    def __init__(
+        self,
+        num_layers: int = 33,
+        embed_dim: int = 1280,
+        attention_heads: int = 20,
+        alphabet: Union[esm.data.Alphabet, str] = "ESM-1b",
+        token_dropout: bool = True,
+    ):
+        """
+        Args:
+            num_layers: number of transformer layers
+            embed_dim: embedding width; the feed-forward width is 4x this
+            attention_heads: attention heads per layer
+            alphabet: an Alphabet instance, or an architecture name that is
+                resolved through `Alphabet.from_architecture`
+            token_dropout: whether `forward` zeroes mask-token embeddings and
+                rescales the remaining ones
+        """
+        super().__init__()
+        self.num_layers = num_layers
+        self.embed_dim = embed_dim
+        self.attention_heads = attention_heads
+        if not isinstance(alphabet, esm.data.Alphabet):
+            alphabet = esm.data.Alphabet.from_architecture(alphabet)
+        self.alphabet = alphabet
+        self.alphabet_size = len(alphabet)
+        self.padding_idx = alphabet.padding_idx
+        self.mask_idx = alphabet.mask_idx
+        self.cls_idx = alphabet.cls_idx
+        self.eos_idx = alphabet.eos_idx
+        self.prepend_bos = alphabet.prepend_bos
+        self.append_eos = alphabet.append_eos
+        self.token_dropout = token_dropout
+        self._init_submodules()
+    def _init_submodules(self):
+        """Build the token embedding, layer stack, contact head, final layer norm and LM head."""
+        self.embed_scale = 1
+        self.embed_tokens = nn.Embedding(
+            self.alphabet_size,
+            self.embed_dim,
+            padding_idx=self.padding_idx,
+        )
+        self.layers = nn.ModuleList(
+            [
+                TransformerLayer(
+                    self.embed_dim,
+                    4 * self.embed_dim,
+                    self.attention_heads,
+                    add_bias_kv=False,
+                    use_esm1b_layer_norm=True,
+                    use_rotary_embeddings=True,
+                )
+                for _ in range(self.num_layers)
+            ]
+        )
+        # The contact head is sized by num_layers * attention_heads.
+        self.contact_head = ContactPredictionHead(
+            self.num_layers * self.attention_heads,
+            self.prepend_bos,
+            self.append_eos,
+            eos_idx=self.eos_idx,
+        )
+        self.emb_layer_norm_after = ESM1bLayerNorm(self.embed_dim)
+        self.lm_head = RobertaLMHead(
+            embed_dim=self.embed_dim,
+            output_dim=self.alphabet_size,
+            weight=self.embed_tokens.weight,
+        )
+    def forward(self, tokens, repr_layers=[], need_head_weights=False, return_contacts=False):
+        """
+        Args:
+            tokens: 2-D tensor of token indices, (batch, seq_len)
+            repr_layers: layer indices whose hidden states should be returned;
+                0 is the embedding output, i is the output of the i-th layer.
+                The default list is only read, never mutated.
+            need_head_weights: if True, also return attention weights
+            return_contacts: if True, also return contact predictions
+                (implies need_head_weights)
+        Returns:
+            dict with "logits" and "representations" (layer index -> tensor),
+            plus "attentions" when head weights are requested and "contacts"
+            when return_contacts is set.
+        """
+        if return_contacts:
+            need_head_weights = True
+        assert tokens.ndim == 2
+        padding_mask = tokens.eq(self.padding_idx)  # B, T
+        x = self.embed_scale * self.embed_tokens(tokens)
+        if self.token_dropout:
+            # Zero the embeddings of mask tokens in place, then rescale by
+            # (1 - train mask ratio) / (1 - mask ratio seen in this input).
+            x.masked_fill_((tokens == self.mask_idx).unsqueeze(-1), 0.0)
+            # x: B x T x C
+            mask_ratio_train = 0.15 * 0.8
+            src_lengths = (~padding_mask).sum(-1)
+            mask_ratio_observed = (tokens == self.mask_idx).sum(-1).to(x.dtype) / src_lengths
+            x = x * (1 - mask_ratio_train) / (1 - mask_ratio_observed)[:, None, None]
+        if padding_mask is not None:
+            # Zero out the embeddings at padded positions.
+            x = x * (1 - padding_mask.unsqueeze(-1).type_as(x))
+        repr_layers = set(repr_layers)
+        hidden_representations = {}
+        if 0 in repr_layers:
+            hidden_representations[0] = x
+        if need_head_weights:
+            attn_weights = []
+        # (B, T, E) => (T, B, E)
+        x = x.transpose(0, 1)
+        # Pass no mask at all to the layers when nothing is padded.
+        if not padding_mask.any():
+            padding_mask = None
+        for layer_idx, layer in enumerate(self.layers):
+            x, attn = layer(
+                x,
+                self_attn_padding_mask=padding_mask,
+                need_head_weights=need_head_weights,
+            )
+            if (layer_idx + 1) in repr_layers:
+                hidden_representations[layer_idx + 1] = x.transpose(0, 1)
+            if need_head_weights:
+                # (H, B, T, T) => (B, H, T, T)
+                attn_weights.append(attn.transpose(1, 0))
+        x = self.emb_layer_norm_after(x)
+        x = x.transpose(0, 1)  # (T, B, E) => (B, T, E)
+        # last hidden representation should have layer norm applied
+        if (layer_idx + 1) in repr_layers:
+            hidden_representations[layer_idx + 1] = x
+        x = self.lm_head(x)
+        result = {"logits": x, "representations": hidden_representations}
+        if need_head_weights:
+            # attentions: B x L x H x T x T
+            attentions = torch.stack(attn_weights, 1)
+            if padding_mask is not None:
+                # Zero attention entries whose query or key position is padding.
+                attention_mask = 1 - padding_mask.type_as(attentions)
+                attention_mask = attention_mask.unsqueeze(1) * attention_mask.unsqueeze(2)
+                attentions = attentions * attention_mask[:, None, None, :, :]
+            result["attentions"] = attentions
+            if return_contacts:
+                contacts = self.contact_head(tokens, attentions)
+                result["contacts"] = contacts
+        return result
+    def predict_contacts(self, tokens):
+        """Run the model with contact prediction enabled and return only the contacts."""
+        return self(tokens, return_contacts=True)["contacts"]

esm/esm/model/msa_transformer.py ADDED Viewed

	@@ -0,0 +1,238 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+import torch.nn as nn
+from ..modules import (
+    AxialTransformerLayer,
+    LearnedPositionalEmbedding,
+    RobertaLMHead,
+    ESM1bLayerNorm,
+    ContactPredictionHead,
+)
+from ..axial_attention import RowSelfAttention, ColumnSelfAttention
+class MSATransformer(nn.Module):
+    @classmethod
+    def add_args(cls, parser):
+        # fmt: off
+        parser.add_argument(
+            "--num_layers",
+            default=12,
+            type=int,
+            metavar="N",
+            help="number of layers"
+        )
+        parser.add_argument(
+            "--embed_dim",
+            default=768,
+            type=int,
+            metavar="N",
+            help="embedding dimension"
+        )
+        parser.add_argument(
+            "--logit_bias",
+            action="store_true",
+            help="whether to apply bias to logits"
+        )
+        parser.add_argument(
+            "--ffn_embed_dim",
+            default=3072,
+            type=int,
+            metavar="N",
+            help="embedding dimension for FFN",
+        )
+        parser.add_argument(
+            "--attention_heads",
+            default=12,
+            type=int,
+            metavar="N",
+            help="number of attention heads",
+        )
+        parser.add_argument(
+            "--dropout",
+            default=0.1,
+            type=float,
+            help="Dropout to apply."
+        )
+        parser.add_argument(
+            "--attention_dropout",
+            default=0.1,
+            type=float,
+            help="Dropout to apply."
+        )
+        parser.add_argument(
+            "--activation_dropout",
+            default=0.1,
+            type=float,
+            help="Dropout to apply."
+        )
+        parser.add_argument(
+            "--max_tokens_per_msa",
+            default=2 ** 14,
+            type=int,
+            help=(
+                "Used during inference to batch attention computations in a single "
+                "forward pass. This allows increased input sizes with less memory."
+            ),
+        )
+        # fmt: on
+    def __init__(self, args, alphabet):
+        super().__init__()
+        self.args = args
+        self.alphabet_size = len(alphabet)
+        self.padding_idx = alphabet.padding_idx
+        self.mask_idx = alphabet.mask_idx
+        self.cls_idx = alphabet.cls_idx
+        self.eos_idx = alphabet.eos_idx
+        self.prepend_bos = alphabet.prepend_bos
+        self.append_eos = alphabet.append_eos
+        self.embed_tokens = nn.Embedding(
+            self.alphabet_size, self.args.embed_dim, padding_idx=self.padding_idx
+        )
+        if getattr(self.args, "embed_positions_msa", False):
+            emb_dim = getattr(self.args, "embed_positions_msa_dim", self.args.embed_dim)
+            self.msa_position_embedding = nn.Parameter(
+                0.01 * torch.randn(1, 1024, 1, emb_dim),
+                requires_grad=True,
+            )
+        else:
+            self.register_parameter("msa_position_embedding", None)
+        self.dropout_module = nn.Dropout(self.args.dropout)
+        self.layers = nn.ModuleList(
+            [
+                AxialTransformerLayer(
+                    self.args.embed_dim,
+                    self.args.ffn_embed_dim,
+                    self.args.attention_heads,
+                    self.args.dropout,
+                    self.args.attention_dropout,
+                    self.args.activation_dropout,
+                    getattr(self.args, "max_tokens_per_msa", self.args.max_tokens),
+                )
+                for _ in range(self.args.layers)
+            ]
+        )
+        self.contact_head = ContactPredictionHead(
+            self.args.layers * self.args.attention_heads,
+            self.prepend_bos,
+            self.append_eos,
+            eos_idx=self.eos_idx,
+        )
+        self.embed_positions = LearnedPositionalEmbedding(
+            self.args.max_positions,
+            self.args.embed_dim,
+            self.padding_idx,
+        )
+        self.emb_layer_norm_before = ESM1bLayerNorm(self.args.embed_dim)
+        self.emb_layer_norm_after = ESM1bLayerNorm(self.args.embed_dim)
+        self.lm_head = RobertaLMHead(
+            embed_dim=self.args.embed_dim,
+            output_dim=self.alphabet_size,
+            weight=self.embed_tokens.weight,
+        )
+    def forward(self, tokens, repr_layers=[], need_head_weights=False, return_contacts=False):
+        if return_contacts:
+            need_head_weights = True
+        assert tokens.ndim == 3
+        batch_size, num_alignments, seqlen = tokens.size()
+        padding_mask = tokens.eq(self.padding_idx)  # B, R, C
+        if not padding_mask.any():
+            padding_mask = None
+        x = self.embed_tokens(tokens)
+        x += self.embed_positions(tokens.view(batch_size * num_alignments, seqlen)).view(x.size())
+        if self.msa_position_embedding is not None:
+            if x.size(1) > 1024:
+                raise RuntimeError(
+                    "Using model with MSA position embedding trained on maximum MSA "
+                    f"depth of 1024, but received {x.size(1)} alignments."
+                )
+            x += self.msa_position_embedding[:, :num_alignments]
+        x = self.emb_layer_norm_before(x)
+        x = self.dropout_module(x)
+        if padding_mask is not None:
+            x = x * (1 - padding_mask.unsqueeze(-1).type_as(x))
+        repr_layers = set(repr_layers)
+        hidden_representations = {}
+        if 0 in repr_layers:
+            hidden_representations[0] = x
+        if need_head_weights:
+            row_attn_weights = []
+            col_attn_weights = []
+        # B x R x C x D -> R x C x B x D
+        x = x.permute(1, 2, 0, 3)
+        for layer_idx, layer in enumerate(self.layers):
+            x = layer(
+                x,
+                self_attn_padding_mask=padding_mask,
+                need_head_weights=need_head_weights,
+            )
+            if need_head_weights:
+                x, col_attn, row_attn = x
+                # H x C x B x R x R -> B x H x C x R x R
+                col_attn_weights.append(col_attn.permute(2, 0, 1, 3, 4))
+                # H x B x C x C -> B x H x C x C
+                row_attn_weights.append(row_attn.permute(1, 0, 2, 3))
+            if (layer_idx + 1) in repr_layers:
+                hidden_representations[layer_idx + 1] = x.permute(2, 0, 1, 3)
+        x = self.emb_layer_norm_after(x)
+        x = x.permute(2, 0, 1, 3)  # R x C x B x D -> B x R x C x D
+        # last hidden representation should have layer norm applied
+        if (layer_idx + 1) in repr_layers:
+            hidden_representations[layer_idx + 1] = x
+        x = self.lm_head(x)
+        result = {"logits": x, "representations": hidden_representations}
+        if need_head_weights:
+            # col_attentions: B x L x H x C x R x R
+            col_attentions = torch.stack(col_attn_weights, 1)
+            # row_attentions: B x L x H x C x C
+            row_attentions = torch.stack(row_attn_weights, 1)
+            result["col_attentions"] = col_attentions
+            result["row_attentions"] = row_attentions
+            if return_contacts:
+                contacts = self.contact_head(tokens, row_attentions)
+                result["contacts"] = contacts
+        return result
+    def predict_contacts(self, tokens):
+        return self(tokens, return_contacts=True)["contacts"]
+    @property
+    def num_layers(self):
+        return self.args.layers
+    def max_tokens_per_msa_(self, value: int) -> None:
+        """The MSA Transformer automatically batches attention computations when
+        gradients are disabled to allow you to pass in larger MSAs at test time than
+        you can fit in GPU memory. By default this occurs when more than 2^14 tokens
+        are passed in the input MSA. You can set this value to infinity to disable
+        this behavior.
+        """
+        for module in self.modules():
+            if isinstance(module, (RowSelfAttention, ColumnSelfAttention)):
+                module.max_tokens_per_msa = value

esm/esm/modules.py ADDED Viewed

	@@ -0,0 +1,418 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .multihead_attention import MultiheadAttention  # noqa
+from .axial_attention import ColumnSelfAttention, RowSelfAttention
+def gelu(x):
+    """Implementation of the gelu activation function.
+    For information: OpenAI GPT's gelu is slightly different
+    (and gives slightly different results):
+    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+    """
+    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+def symmetrize(x):
+    "Make layer symmetric in final two dimensions, used for contact prediction."
+    return x + x.transpose(-1, -2)
+def apc(x):
+    "Perform average product correct, used for contact prediction."
+    a1 = x.sum(-1, keepdims=True)
+    a2 = x.sum(-2, keepdims=True)
+    a12 = x.sum((-1, -2), keepdims=True)
+    avg = a1 * a2
+    avg.div_(a12)  # in-place to reduce memory
+    normalized = x - avg
+    return normalized
+class ESM1LayerNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-12, affine=True):
+        """Construct a layernorm layer in the TF style (eps inside the sqrt)."""
+        super().__init__()
+        self.hidden_size = (hidden_size,) if isinstance(hidden_size, int) else tuple(hidden_size)
+        self.eps = eps
+        self.affine = bool(affine)
+        if self.affine:
+            self.weight = nn.Parameter(torch.ones(hidden_size))
+            self.bias = nn.Parameter(torch.zeros(hidden_size))
+        else:
+            self.weight, self.bias = None, None
+    def forward(self, x):
+        dims = tuple(-(i + 1) for i in range(len(self.hidden_size)))
+        means = x.mean(dims, keepdim=True)
+        x_zeromean = x - means
+        variances = x_zeromean.pow(2).mean(dims, keepdim=True)
+        x = x_zeromean / torch.sqrt(variances + self.eps)
+        if self.affine:
+            x = (self.weight * x) + self.bias
+        return x
+# ESM1bLayerNorm is apex's FusedLayerNorm when apex can be imported, and
+# torch.nn.LayerNorm under the same name otherwise.
+try:
+    from apex.normalization import FusedLayerNorm as _FusedLayerNorm
+    class ESM1bLayerNorm(_FusedLayerNorm):
+        @torch.jit.unused
+        def forward(self, x):
+            # CPU tensors go straight to the parent implementation; for CUDA
+            # tensors the parent call runs with x's device selected as the
+            # current CUDA device.
+            if not x.is_cuda:
+                return super().forward(x)
+            else:
+                with torch.cuda.device(x.device):
+                    return super().forward(x)
+except ImportError:
+    from torch.nn import LayerNorm as ESM1bLayerNorm
+class TransformerLayer(nn.Module):
+    """Transformer layer block."""
+    def __init__(
+        self,
+        embed_dim,
+        ffn_embed_dim,
+        attention_heads,
+        add_bias_kv=True,
+        use_esm1b_layer_norm=False,
+        use_rotary_embeddings: bool = False,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.ffn_embed_dim = ffn_embed_dim
+        self.attention_heads = attention_heads
+        self.use_rotary_embeddings = use_rotary_embeddings
+        self._init_submodules(add_bias_kv, use_esm1b_layer_norm)
+    def _init_submodules(self, add_bias_kv, use_esm1b_layer_norm):
+        BertLayerNorm = ESM1bLayerNorm if use_esm1b_layer_norm else ESM1LayerNorm
+        self.self_attn = MultiheadAttention(
+            self.embed_dim,
+            self.attention_heads,
+            add_bias_kv=add_bias_kv,
+            add_zero_attn=False,
+            use_rotary_embeddings=self.use_rotary_embeddings,
+        )
+        self.self_attn_layer_norm = BertLayerNorm(self.embed_dim)
+        self.fc1 = nn.Linear(self.embed_dim, self.ffn_embed_dim)
+        self.fc2 = nn.Linear(self.ffn_embed_dim, self.embed_dim)
+        self.final_layer_norm = BertLayerNorm(self.embed_dim)
+    def forward(
+        self, x, self_attn_mask=None, self_attn_padding_mask=None, need_head_weights=False
+    ):
+        residual = x
+        x = self.self_attn_layer_norm(x)
+        x, attn = self.self_attn(
+            query=x,
+            key=x,
+            value=x,
+            key_padding_mask=self_attn_padding_mask,
+            need_weights=True,
+            need_head_weights=need_head_weights,
+            attn_mask=self_attn_mask,
+        )
+        x = residual + x
+        residual = x
+        x = self.final_layer_norm(x)
+        x = gelu(self.fc1(x))
+        x = self.fc2(x)
+        x = residual + x
+        return x, attn
+class AxialTransformerLayer(nn.Module):
+    """Implements an Axial MSA Transformer block."""
+    def __init__(
+        self,
+        embedding_dim: int = 768,
+        ffn_embedding_dim: int = 3072,
+        num_attention_heads: int = 8,
+        dropout: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        max_tokens_per_msa: int = 2**14,
+    ) -> None:
+        super().__init__()
+        # Initialize parameters
+        self.embedding_dim = embedding_dim
+        self.dropout_prob = dropout
+        row_self_attention = RowSelfAttention(
+            embedding_dim,
+            num_attention_heads,
+            dropout=dropout,
+            max_tokens_per_msa=max_tokens_per_msa,
+        )
+        column_self_attention = ColumnSelfAttention(
+            embedding_dim,
+            num_attention_heads,
+            dropout=dropout,
+            max_tokens_per_msa=max_tokens_per_msa,
+        )
+        feed_forward_layer = FeedForwardNetwork(
+            embedding_dim,
+            ffn_embedding_dim,
+            activation_dropout=activation_dropout,
+            max_tokens_per_msa=max_tokens_per_msa,
+        )
+        self.row_self_attention = self.build_residual(row_self_attention)
+        self.column_self_attention = self.build_residual(column_self_attention)
+        self.feed_forward_layer = self.build_residual(feed_forward_layer)
+    def build_residual(self, layer: nn.Module):
+        return NormalizedResidualBlock(
+            layer,
+            self.embedding_dim,
+            self.dropout_prob,
+        )
+    def forward(
+        self,
+        x: torch.Tensor,
+        self_attn_mask: Optional[torch.Tensor] = None,
+        self_attn_padding_mask: Optional[torch.Tensor] = None,
+        need_head_weights: bool = False,
+    ):
+        """
+        LayerNorm is applied either before or after the self-attention/ffn
+        modules similar to the original Transformer implementation.
+        """
+        x, row_attn = self.row_self_attention(
+            x,
+            self_attn_mask=self_attn_mask,
+            self_attn_padding_mask=self_attn_padding_mask,
+        )
+        x, column_attn = self.column_self_attention(
+            x,
+            self_attn_mask=self_attn_mask,
+            self_attn_padding_mask=self_attn_padding_mask,
+        )
+        x = self.feed_forward_layer(x)
+        if need_head_weights:
+            return x, column_attn, row_attn
+        else:
+            return x
+class LearnedPositionalEmbedding(nn.Embedding):
+    """
+    This module learns positional embeddings up to a fixed maximum size.
+    Padding ids are ignored by either offsetting based on padding_idx
+    or by setting padding_idx to None and ensuring that the appropriate
+    position ids are passed to the forward function.
+    """
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
+        if padding_idx is not None:
+            num_embeddings_ = num_embeddings + padding_idx + 1
+        else:
+            num_embeddings_ = num_embeddings
+        super().__init__(num_embeddings_, embedding_dim, padding_idx)
+        self.max_positions = num_embeddings
+    def forward(self, input: torch.Tensor):
+        """Input is expected to be of size [bsz x seqlen]."""
+        if input.size(1) > self.max_positions:
+            raise ValueError(
+                f"Sequence length {input.size(1)} above maximum "
+                f" sequence length of {self.max_positions}"
+            )
+        mask = input.ne(self.padding_idx).int()
+        positions = (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + self.padding_idx
+        return F.embedding(
+            positions,
+            self.weight,
+            self.padding_idx,
+            self.max_norm,
+            self.norm_type,
+            self.scale_grad_by_freq,
+            self.sparse,
+        )
+class SinusoidalPositionalEmbedding(nn.Module):
+    def __init__(self, embed_dim, padding_idx, learned=False):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.padding_idx = padding_idx
+        self.register_buffer("_float_tensor", torch.FloatTensor(1))
+        self.weights = None
+    def forward(self, x):
+        bsz, seq_len = x.shape
+        max_pos = self.padding_idx + 1 + seq_len
+        if self.weights is None or max_pos > self.weights.size(0):
+            self.weights = self.get_embedding(max_pos)
+        self.weights = self.weights.type_as(self._float_tensor)
+        positions = self.make_positions(x)
+        return self.weights.index_select(0, positions.view(-1)).view(bsz, seq_len, -1).detach()
+    def make_positions(self, x):
+        mask = x.ne(self.padding_idx)
+        range_buf = torch.arange(x.size(1), device=x.device).expand_as(x) + self.padding_idx + 1
+        positions = range_buf.expand_as(x)
+        return positions * mask.long() + self.padding_idx * (1 - mask.long())
+    def get_embedding(self, num_embeddings):
+        half_dim = self.embed_dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
+        emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
+        if self.embed_dim % 2 == 1:
+            # zero pad
+            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
+        if self.padding_idx is not None:
+            emb[self.padding_idx, :] = 0
+        return emb
+class RobertaLMHead(nn.Module):
+    """Head for masked language modeling."""
+    def __init__(self, embed_dim, output_dim, weight):
+        super().__init__()
+        self.dense = nn.Linear(embed_dim, embed_dim)
+        self.layer_norm = ESM1bLayerNorm(embed_dim)
+        self.weight = weight
+        self.bias = nn.Parameter(torch.zeros(output_dim))
+    def forward(self, features):
+        x = self.dense(features)
+        x = gelu(x)
+        x = self.layer_norm(x)
+        # project back to size of vocabulary with bias
+        x = F.linear(x, self.weight) + self.bias
+        return x
+class ContactPredictionHead(nn.Module):
+    """Performs symmetrization, apc, and computes a logistic regression on the output features"""
+    def __init__(
+        self,
+        in_features: int,
+        prepend_bos: bool,
+        append_eos: bool,
+        bias=True,
+        eos_idx: Optional[int] = None,
+    ):
+        super().__init__()
+        self.in_features = in_features
+        self.prepend_bos = prepend_bos
+        self.append_eos = append_eos
+        if append_eos and eos_idx is None:
+            raise ValueError("Using an alphabet with eos token, but no eos token was passed in.")
+        self.eos_idx = eos_idx
+        self.regression = nn.Linear(in_features, 1, bias)
+        self.activation = nn.Sigmoid()
+    def forward(self, tokens, attentions):
+        # remove eos token attentions
+        if self.append_eos:
+            eos_mask = tokens.ne(self.eos_idx).to(attentions)
+            eos_mask = eos_mask.unsqueeze(1) * eos_mask.unsqueeze(2)
+            attentions = attentions * eos_mask[:, None, None, :, :]
+            attentions = attentions[..., :-1, :-1]
+        # remove cls token attentions
+        if self.prepend_bos:
+            attentions = attentions[..., 1:, 1:]
+        batch_size, layers, heads, seqlen, _ = attentions.size()
+        attentions = attentions.view(batch_size, layers * heads, seqlen, seqlen)
+        # features: B x C x T x T
+        attentions = attentions.to(
+            self.regression.weight.device
+        )  # attentions always float32, may need to convert to float16
+        attentions = apc(symmetrize(attentions))
+        attentions = attentions.permute(0, 2, 3, 1)
+        return self.activation(self.regression(attentions).squeeze(3))
+class NormalizedResidualBlock(nn.Module):
+    def __init__(
+        self,
+        layer: nn.Module,
+        embedding_dim: int,
+        dropout: float = 0.1,
+    ):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.layer = layer
+        self.dropout_module = nn.Dropout(
+            dropout,
+        )
+        self.layer_norm = ESM1bLayerNorm(self.embedding_dim)
+    def forward(self, x, *args, **kwargs):
+        residual = x
+        x = self.layer_norm(x)
+        outputs = self.layer(x, *args, **kwargs)
+        if isinstance(outputs, tuple):
+            x, *out = outputs
+        else:
+            x = outputs
+            out = None
+        x = self.dropout_module(x)
+        x = residual + x
+        if out is not None:
+            return (x,) + tuple(out)
+        else:
+            return x
+class FeedForwardNetwork(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        ffn_embedding_dim: int,
+        activation_dropout: float = 0.1,
+        max_tokens_per_msa: int = 2**14,
+    ):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.ffn_embedding_dim = ffn_embedding_dim
+        self.max_tokens_per_msa = max_tokens_per_msa
+        self.activation_fn = nn.GELU()
+        self.activation_dropout_module = nn.Dropout(
+            activation_dropout,
+        )
+        self.fc1 = nn.Linear(embedding_dim, ffn_embedding_dim)
+        self.fc2 = nn.Linear(ffn_embedding_dim, embedding_dim)
+    def forward(self, x):
+        x = self.activation_fn(self.fc1(x))
+        x = self.activation_dropout_module(x)
+        x = self.fc2(x)
+        return x

esm/esm/multihead_attention.py ADDED Viewed

	@@ -0,0 +1,508 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+from typing import Dict, Optional, Tuple
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from torch.nn import Parameter
+from esm.rotary_embedding import RotaryEmbedding
+import uuid
+def utils_softmax(x, dim: int, onnx_trace: bool = False):
+    if onnx_trace:
+        return F.softmax(x.float(), dim=dim)
+    else:
+        return F.softmax(x, dim=dim, dtype=torch.float32)
+class FairseqIncrementalState(object):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.init_incremental_state()
+    def init_incremental_state(self):
+        self._incremental_state_id = str(uuid.uuid4())
+    def _get_full_incremental_state_key(self, key: str) -> str:
+        return "{}.{}".format(self._incremental_state_id, key)
+    def get_incremental_state(
+        self,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+        key: str,
+    ) -> Optional[Dict[str, Optional[Tensor]]]:
+        """Helper for getting incremental state for an nn.Module."""
+        full_key = self._get_full_incremental_state_key(key)
+        if incremental_state is None or full_key not in incremental_state:
+            return None
+        return incremental_state[full_key]
+    def set_incremental_state(
+        self,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+        key: str,
+        value: Dict[str, Optional[Tensor]],
+    ) -> Optional[Dict[str, Dict[str, Optional[Tensor]]]]:
+        """Helper for setting incremental state for an nn.Module."""
+        if incremental_state is not None:
+            full_key = self._get_full_incremental_state_key(key)
+            incremental_state[full_key] = value
+        return incremental_state
+def with_incremental_state(cls):
+    cls.__bases__ = (FairseqIncrementalState,) + tuple(
+        b for b in cls.__bases__ if b != FairseqIncrementalState
+    )
+    return cls
+@with_incremental_state
+class MultiheadAttention(nn.Module):
+    """Multi-headed attention.
+    See "Attention Is All You Need" for more details.
+    """
+    def __init__(
+        self,
+        embed_dim,
+        num_heads,
+        kdim=None,
+        vdim=None,
+        dropout=0.0,
+        bias=True,
+        add_bias_kv: bool = False,
+        add_zero_attn: bool = False,
+        self_attention: bool = False,
+        encoder_decoder_attention: bool = False,
+        use_rotary_embeddings: bool = False,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.kdim = kdim if kdim is not None else embed_dim
+        self.vdim = vdim if vdim is not None else embed_dim
+        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        assert (
+            self.head_dim * num_heads == self.embed_dim
+        ), "embed_dim must be divisible by num_heads"
+        self.scaling = self.head_dim**-0.5
+        self.self_attention = self_attention
+        self.encoder_decoder_attention = encoder_decoder_attention
+        assert not self.self_attention or self.qkv_same_dim, (
+            "Self-attention requires query, key and " "value to be of the same size"
+        )
+        self.k_proj = nn.Linear(self.kdim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(self.vdim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        if add_bias_kv:
+            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
+            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
+        else:
+            self.bias_k = self.bias_v = None
+        self.add_zero_attn = add_zero_attn
+        self.reset_parameters()
+        self.onnx_trace = False
+        self.rot_emb = None
+        if use_rotary_embeddings:
+            self.rot_emb = RotaryEmbedding(dim=self.head_dim)
+        self.enable_torch_version = False
+        if hasattr(F, "multi_head_attention_forward"):
+            self.enable_torch_version = True
+        else:
+            self.enable_torch_version = False
+    def prepare_for_onnx_export_(self):
+        self.onnx_trace = True
+    def reset_parameters(self):
+        if self.qkv_same_dim:
+            # Empirically observed the convergence to be much better with
+            # the scaled initialization
+            nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
+            nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
+            nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
+        else:
+            nn.init.xavier_uniform_(self.k_proj.weight)
+            nn.init.xavier_uniform_(self.v_proj.weight)
+            nn.init.xavier_uniform_(self.q_proj.weight)
+        nn.init.xavier_uniform_(self.out_proj.weight)
+        if self.out_proj.bias is not None:
+            nn.init.constant_(self.out_proj.bias, 0.0)
+        if self.bias_k is not None:
+            nn.init.xavier_normal_(self.bias_k)
+        if self.bias_v is not None:
+            nn.init.xavier_normal_(self.bias_v)
+    def forward(
+        self,
+        query,
+        key: Optional[Tensor],
+        value: Optional[Tensor],
+        key_padding_mask: Optional[Tensor] = None,
+        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        need_weights: bool = True,
+        static_kv: bool = False,
+        attn_mask: Optional[Tensor] = None,
+        before_softmax: bool = False,
+        need_head_weights: bool = False,
+    ) -> Tuple[Tensor, Optional[Tensor]]:
+        """Input shape: Time x Batch x Channel
+        Args:
+            query: tensor of shape `(tgt_len, bsz, embed_dim)`; `embed_dim`
+                must equal `self.embed_dim`.
+            key, value (Tensor, optional): inputs for the key/value
+                projections. In self-attention mode both are ignored and
+                `query` is projected instead; in encoder-decoder mode
+                `value` is ignored and `key` is projected for both. They may
+                be None only when the projections are served from
+                `incremental_state`.
+            key_padding_mask (ByteTensor, optional): mask to exclude
+                keys that are pads, of shape `(batch, src_len)`, where
+                padding elements are indicated by 1s.
+            incremental_state (dict, optional): cache of previously
+                projected keys/values ("prev_key", "prev_value",
+                "prev_key_padding_mask"); it is read and updated in place.
+            need_weights (bool, optional): return the attention weights,
+                averaged over heads (default: True).
+            static_kv (bool, optional): reuse the cached keys/values as-is
+                instead of appending the current step to them.
+            attn_mask (ByteTensor, optional): typically used to
+                implement causal attention, where the mask prevents the
+                attention from looking forward in time (default: None).
+            before_softmax (bool, optional): return the raw attention
+                weights and values before the attention softmax.
+            need_head_weights (bool, optional): return the attention
+                weights for each head. Implies *need_weights*. Default:
+                return the average attention weights over all heads.
+        Returns:
+            `(attn, attn_weights)` where `attn` has shape
+            `(tgt_len, bsz, embed_dim)`. `attn_weights` is None unless
+            *need_weights* is set; it then has shape `(bsz, tgt_len, src_len)`
+            (head average) or `(num_heads, bsz, tgt_len, src_len)` when
+            *need_head_weights* is set. With *before_softmax* the tuple is
+            `(raw_attn_weights, v)` instead.
+        """
+        if need_head_weights:
+            need_weights = True
+        tgt_len, bsz, embed_dim = query.size()
+        assert embed_dim == self.embed_dim
+        assert list(query.size()) == [tgt_len, bsz, embed_dim]
+        # Fast path: delegate to PyTorch's fused implementation whenever none of
+        # the features handled only by the manual path below are requested.
+        if (
+            not self.rot_emb
+            and self.enable_torch_version
+            and not self.onnx_trace
+            and incremental_state is None
+            and not static_kv
+            # A workaround for quantization to work. Otherwise JIT compilation
+            # treats bias in linear module as method.
+            and not torch.jit.is_scripting()
+            and not need_head_weights
+        ):
+            assert key is not None and value is not None
+            return F.multi_head_attention_forward(
+                query,
+                key,
+                value,
+                self.embed_dim,
+                self.num_heads,
+                torch.empty([0]),
+                torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
+                self.bias_k,
+                self.bias_v,
+                self.add_zero_attn,
+                self.dropout,
+                self.out_proj.weight,
+                self.out_proj.bias,
+                self.training,
+                key_padding_mask,
+                need_weights,
+                attn_mask,
+                use_separate_proj_weight=True,
+                q_proj_weight=self.q_proj.weight,
+                k_proj_weight=self.k_proj.weight,
+                v_proj_weight=self.v_proj.weight,
+            )
+        # Manual path. First look up any cached keys/values.
+        if incremental_state is not None:
+            saved_state = self._get_input_buffer(incremental_state)
+            if saved_state is not None and "prev_key" in saved_state:
+                # previous time steps are cached - no need to recompute
+                # key and value if they are static
+                if static_kv:
+                    assert self.encoder_decoder_attention and not self.self_attention
+                    key = value = None
+        else:
+            saved_state = None
+        # Project the inputs; which tensor feeds k/v depends on the attention mode.
+        if self.self_attention:
+            q = self.q_proj(query)
+            k = self.k_proj(query)
+            v = self.v_proj(query)
+        elif self.encoder_decoder_attention:
+            # encoder-decoder attention
+            q = self.q_proj(query)
+            if key is None:
+                assert value is None
+                k = v = None
+            else:
+                k = self.k_proj(key)
+                v = self.v_proj(key)
+        else:
+            assert key is not None and value is not None
+            q = self.q_proj(query)
+            k = self.k_proj(key)
+            v = self.v_proj(value)
+        q *= self.scaling
+        # Optional learned bias vectors are appended as one extra key/value
+        # position; both masks grow by one zero column to stay aligned.
+        if self.bias_k is not None:
+            assert self.bias_v is not None
+            k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
+            v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
+            if attn_mask is not None:
+                attn_mask = torch.cat(
+                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
+                )
+            if key_padding_mask is not None:
+                key_padding_mask = torch.cat(
+                    [
+                        key_padding_mask,
+                        key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
+                    ],
+                    dim=1,
+                )
+        # Fold the heads into the batch dimension: (bsz * num_heads, len, head_dim).
+        q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+        if k is not None:
+            k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+        if v is not None:
+            v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+        if saved_state is not None:
+            # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
+            if "prev_key" in saved_state:
+                _prev_key = saved_state["prev_key"]
+                assert _prev_key is not None
+                prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
+                if static_kv:
+                    k = prev_key
+                else:
+                    assert k is not None
+                    k = torch.cat([prev_key, k], dim=1)
+            if "prev_value" in saved_state:
+                _prev_value = saved_state["prev_value"]
+                assert _prev_value is not None
+                prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
+                if static_kv:
+                    v = prev_value
+                else:
+                    assert v is not None
+                    v = torch.cat([prev_value, v], dim=1)
+            prev_key_padding_mask: Optional[Tensor] = None
+            if "prev_key_padding_mask" in saved_state:
+                prev_key_padding_mask = saved_state["prev_key_padding_mask"]
+            assert k is not None and v is not None
+            key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
+                key_padding_mask=key_padding_mask,
+                prev_key_padding_mask=prev_key_padding_mask,
+                batch_size=bsz,
+                src_len=k.size(1),
+                static_kv=static_kv,
+            )
+            # Write the extended keys/values/mask back into the cache.
+            saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
+            saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
+            saved_state["prev_key_padding_mask"] = key_padding_mask
+            # In this branch incremental_state is never None
+            assert incremental_state is not None
+            incremental_state = self._set_input_buffer(incremental_state, saved_state)
+        assert k is not None
+        src_len = k.size(1)
+        # This is part of a workaround to get around fork/join parallelism
+        # not supporting Optional types.
+        if key_padding_mask is not None and key_padding_mask.dim() == 0:
+            key_padding_mask = None
+        if key_padding_mask is not None:
+            assert key_padding_mask.size(0) == bsz
+            assert key_padding_mask.size(1) == src_len
+        # Optionally append one all-zero key/value position (and matching mask columns).
+        if self.add_zero_attn:
+            assert v is not None
+            src_len += 1
+            k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
+            v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
+            if attn_mask is not None:
+                attn_mask = torch.cat(
+                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
+                )
+            if key_padding_mask is not None:
+                key_padding_mask = torch.cat(
+                    [
+                        key_padding_mask,
+                        torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask),
+                    ],
+                    dim=1,
+                )
+        # Rotary embeddings, when configured, are applied to q and k only.
+        if self.rot_emb:
+            q, k = self.rot_emb(q, k)
+        # Raw attention scores; q was already multiplied by self.scaling above.
+        attn_weights = torch.bmm(q, k.transpose(1, 2))
+        attn_weights = MultiheadAttention.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
+        assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
+        if attn_mask is not None:
+            # The mask is additive and broadcast over the (bsz * num_heads) dimension.
+            attn_mask = attn_mask.unsqueeze(0)
+            if self.onnx_trace:
+                attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
+            attn_weights += attn_mask
+        if key_padding_mask is not None:
+            # don't attend to padding symbols
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights.masked_fill(
+                key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), float("-inf")
+            )
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+        if before_softmax:
+            return attn_weights, v
+        attn_weights_float = utils_softmax(attn_weights, dim=-1, onnx_trace=self.onnx_trace)
+        attn_weights = attn_weights_float.type_as(attn_weights)
+        # Dropout is applied to the probabilities used for the weighted sum only;
+        # the weights returned to the caller come from attn_weights_float.
+        attn_probs = F.dropout(
+            attn_weights_float.type_as(attn_weights),
+            p=self.dropout,
+            training=self.training,
+        )
+        assert v is not None
+        attn = torch.bmm(attn_probs, v)
+        assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
+        if self.onnx_trace and attn.size(1) == 1:
+            # when ONNX tracing a single decoder step (sequence length == 1)
+            # the transpose is a no-op copy before view, thus unnecessary
+            attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
+        else:
+            attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
+        attn = self.out_proj(attn)
+        attn_weights: Optional[Tensor] = None
+        if need_weights:
+            # (bsz, num_heads, tgt_len, src_len) -> (num_heads, bsz, tgt_len, src_len)
+            attn_weights = attn_weights_float.view(
+                bsz, self.num_heads, tgt_len, src_len
+            ).type_as(attn).transpose(1, 0)
+            if not need_head_weights:
+                # average attention weights over heads
+                attn_weights = attn_weights.mean(dim=0)
+        return attn, attn_weights
+    @staticmethod
+    def _append_prev_key_padding_mask(
+        key_padding_mask: Optional[Tensor],
+        prev_key_padding_mask: Optional[Tensor],
+        batch_size: int,
+        src_len: int,
+        static_kv: bool,
+    ) -> Optional[Tensor]:
+        """Combine the cached and the current key padding masks.
+        Saved key padding masks have shape (bsz, seq_len). When only one of the
+        two masks is available the missing part is filled with zeros so that
+        the result spans `src_len` columns; when neither is available the
+        result is None. With `static_kv` a cached mask is returned unchanged.
+        """
+        if prev_key_padding_mask is None:
+            if key_padding_mask is None:
+                return None
+            # Only the current step carries a mask: zero-fill the cached positions.
+            leading = torch.zeros(
+                (batch_size, src_len - key_padding_mask.size(1)),
+                device=key_padding_mask.device,
+            )
+            return torch.cat([leading.float(), key_padding_mask.float()], dim=1)
+        if static_kv:
+            return prev_key_padding_mask
+        if key_padding_mask is not None:
+            return torch.cat(
+                [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
+            )
+        # During incremental decoding, as the padding token enters and
+        # leaves the frame, there will be a time when prev or current
+        # is None
+        trailing = torch.zeros(
+            (batch_size, src_len - prev_key_padding_mask.size(1)),
+            device=prev_key_padding_mask.device,
+        )
+        return torch.cat([prev_key_padding_mask.float(), trailing.float()], dim=1)
+    @torch.jit.export
+    def reorder_incremental_state(
+        self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]], new_order: Tensor
+    ):
+        """Reorder buffered internal state (for incremental generation).
+        Every cached tensor is re-indexed along dimension 0 with `new_order`.
+        For encoder-decoder attention the loop stops at the first cached tensor
+        whose dimension 0 already has the size of `new_order`.
+        """
+        cached = self._get_input_buffer(incremental_state)
+        if cached is None:
+            return incremental_state
+        for name in cached.keys():
+            entry = cached[name]
+            if entry is None:
+                continue
+            if self.encoder_decoder_attention and entry.size(0) == new_order.size(0):
+                break
+            cached[name] = entry.index_select(0, new_order)
+        return self._set_input_buffer(incremental_state, cached)
+    def _get_input_buffer(
+        self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
+    ) -> Dict[str, Optional[Tensor]]:
+        """Return the "attn_state" buffer from `incremental_state`, or a new empty dict if there is none."""
+        cached = self.get_incremental_state(incremental_state, "attn_state")
+        if cached is None:
+            fresh: Dict[str, Optional[Tensor]] = {}
+            return fresh
+        return cached
+    def _set_input_buffer(
+        self,
+        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+        buffer: Dict[str, Optional[Tensor]],
+    ):
+        """Store `buffer` under the "attn_state" key and return the result of `set_incremental_state`."""
+        updated_state = self.set_incremental_state(incremental_state, "attn_state", buffer)
+        return updated_state
+    @staticmethod
+    def apply_sparse_mask(attn_weights, tgt_len: int, src_len: int, bsz: int):
+        """Return `attn_weights` unchanged.
+        The function takes no `self`; it is declared a static method so that it
+        can be called through the class (as `forward` does) as well as through
+        an instance. `tgt_len`, `src_len` and `bsz` are accepted but unused.
+        """
+        return attn_weights
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Split legacy fused `in_proj_weight` / `in_proj_bias` entries into q/k/v entries.
+        `state_dict` is modified in place: the fused entries are removed and
+        `q_proj`, `k_proj` and `v_proj` entries (each one third of the rows)
+        are added under the prefix derived from `name`.
+        """
+        prefix = "" if name == "" else name + "."
+        bias_key = prefix + "in_proj_bias"
+        projections = ("q_proj", "k_proj", "v_proj")
+        replacements = {}
+        stale_keys = []
+        for key in state_dict.keys():
+            if not key.endswith(prefix + "in_proj_weight"):
+                continue
+            # in_proj_weight used to be q + k + v with same dimensions
+            fused_weight = state_dict[key]
+            dim = fused_weight.shape[0] // 3
+            row_ranges = (slice(None, dim), slice(dim, 2 * dim), slice(2 * dim, None))
+            for proj, rows in zip(projections, row_ranges):
+                replacements[prefix + proj + ".weight"] = fused_weight[rows]
+            stale_keys.append(key)
+            if bias_key in state_dict:
+                fused_bias = state_dict[bias_key]
+                for proj, rows in zip(projections, row_ranges):
+                    replacements[prefix + proj + ".bias"] = fused_bias[rows]
+                stale_keys.append(bias_key)
+        for key in stale_keys:
+            del state_dict[key]
+        state_dict.update(replacements)

esm/esm/pretrained.py ADDED Viewed

	@@ -0,0 +1,397 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import re
+import urllib
+import warnings
+from argparse import Namespace
+from pathlib import Path
+import torch
+import esm
+from esm.model.esm2 import ESM2
+def _has_regression_weights(model_name):
+    """Tell whether contact-regression weights are expected for `model_name`.
+    They are expected for every model whose name contains neither "esm1v"
+    (ESM-1v) nor "esm_if" (ESM-IF)."""
+    families_without_regression = ("esm1v", "esm_if")
+    return all(tag not in model_name for tag in families_without_regression)
+def load_model_and_alphabet(model_name):
+    """Load a model and its alphabet from a local ".pt" file or, for any other name, from the hub."""
+    # A name ending in ".pt" is treated as a filepath.
+    if model_name.endswith(".pt"):
+        loader = load_model_and_alphabet_local
+    else:
+        loader = load_model_and_alphabet_hub
+    return loader(model_name)
+def load_hub_workaround(url):
+    """Fetch the checkpoint at `url` through torch.hub and return it, mapped to CPU.
+    If `torch.hub.load_state_dict_from_url` raises RuntimeError, the file of
+    the same name in the torch hub checkpoint directory is loaded with
+    `torch.load` instead.
+    Raises:
+        Exception: when the download fails with an HTTP error; the original
+            `urllib.error.HTTPError` is attached as the cause.
+    """
+    # `import urllib` alone does not guarantee that the `urllib.error`
+    # submodule has been loaded, so import it explicitly before naming it.
+    import urllib.error
+    try:
+        data = torch.hub.load_state_dict_from_url(url, progress=False, map_location="cpu")
+    except RuntimeError:
+        # Pytorch version issue - see https://github.com/pytorch/pytorch/issues/43106
+        fn = Path(url).name
+        data = torch.load(
+            f"{torch.hub.get_dir()}/checkpoints/{fn}",
+            map_location="cpu",
+        )
+    except urllib.error.HTTPError as e:
+        raise Exception(
+            f"Could not load {url}, check if you specified a correct model name?"
+        ) from e
+    return data
+def load_regression_hub(model_name):
+    """Download the contact-regression checkpoint published for `model_name`."""
+    return load_hub_workaround(
+        f"https://dl.fbaipublicfiles.com/fair-esm/regression/{model_name}-contact-regression.pt"
+    )
+def _download_model_and_regression_data(model_name):
+    """Download the model checkpoint and, when one is expected, its regression checkpoint.
+    Returns `(model_data, regression_data)`; `regression_data` is None for
+    models without regression weights."""
+    model_data = load_hub_workaround(
+        f"https://dl.fbaipublicfiles.com/fair-esm/models/{model_name}.pt"
+    )
+    regression_data = (
+        load_regression_hub(model_name) if _has_regression_weights(model_name) else None
+    )
+    return model_data, regression_data
+def load_model_and_alphabet_hub(model_name):
+    """Download the checkpoints for `model_name` and build `(model, alphabet)` from them."""
+    downloaded = _download_model_and_regression_data(model_name)
+    # `downloaded` is the pair (model_data, regression_data).
+    return load_model_and_alphabet_core(model_name, *downloaded)
+def load_model_and_alphabet_local(model_location):
+    """Load from local path. The regression weights need to be co-located
+    They are read from "<path without suffix>-contact-regression.pt" when the
+    model name (the file stem) is one that expects regression weights."""
+    checkpoint_path = Path(model_location)
+    model_data = torch.load(str(checkpoint_path), map_location="cpu")
+    model_name = checkpoint_path.stem
+    regression_data = None
+    if _has_regression_weights(model_name):
+        regression_path = str(checkpoint_path.with_suffix("")) + "-contact-regression.pt"
+        regression_data = torch.load(regression_path, map_location="cpu")
+    return load_model_and_alphabet_core(model_name, model_data, regression_data)
+def has_emb_layer_norm_before(model_state):
+    """Determine whether layer norm needs to be applied before the encoder
+    i.e. whether any key of `model_state` starts with "emb_layer_norm_before"."""
+    for key in model_state:
+        if key.startswith("emb_layer_norm_before"):
+            return True
+    return False
+def _load_model_and_alphabet_core_v1(model_data):
+    """Build an (untrained) model, its alphabet and a renamed state dict from a checkpoint.
+    `model_data["args"].arch` selects the model class and the key-renaming
+    rules applied to `model_data["args"]` and `model_data["model"]`.
+    Returns:
+        `(model, alphabet, model_state)`. The weights in `model_state` are
+        not loaded into `model` here.
+    Raises:
+        ValueError: if the architecture is not one of the handled ones.
+    """
+    import esm  # since esm.inverse_folding is imported below, you actually have to re-import esm here
+    alphabet = esm.Alphabet.from_architecture(model_data["args"].arch)
+    if model_data["args"].arch == "roberta_large":
+        # upgrade state dict
+        # pra: keep only what follows "encoder_" in argument names.
+        # prs1 / prs2: keep only what follows "encoder." / "sentence_encoder." in weight names.
+        pra = lambda s: "".join(s.split("encoder_")[1:] if "encoder" in s else s)
+        prs1 = lambda s: "".join(s.split("encoder.")[1:] if "encoder" in s else s)
+        prs2 = lambda s: "".join(
+            s.split("sentence_encoder.")[1:] if "sentence_encoder" in s else s
+        )
+        model_args = {pra(arg[0]): arg[1] for arg in vars(model_data["args"]).items()}
+        model_state = {prs1(prs2(arg[0])): arg[1] for arg in model_data["model"].items()}
+        model_state["embed_tokens.weight"][alphabet.mask_idx].zero_()  # For token drop
+        model_args["emb_layer_norm_before"] = has_emb_layer_norm_before(model_state)
+        model_type = esm.ProteinBertModel
+    elif model_data["args"].arch == "protein_bert_base":
+        # upgrade state dict
+        # Same renaming idea as above, but with "decoder_" / "decoder." markers.
+        pra = lambda s: "".join(s.split("decoder_")[1:] if "decoder" in s else s)
+        prs = lambda s: "".join(s.split("decoder.")[1:] if "decoder" in s else s)
+        model_args = {pra(arg[0]): arg[1] for arg in vars(model_data["args"]).items()}
+        model_state = {prs(arg[0]): arg[1] for arg in model_data["model"].items()}
+        model_type = esm.ProteinBertModel
+    elif model_data["args"].arch == "msa_transformer":
+        # upgrade state dict
+        pra = lambda s: "".join(s.split("encoder_")[1:] if "encoder" in s else s)
+        prs1 = lambda s: "".join(s.split("encoder.")[1:] if "encoder" in s else s)
+        prs2 = lambda s: "".join(
+            s.split("sentence_encoder.")[1:] if "sentence_encoder" in s else s
+        )
+        # prs3 swaps "row" and "column" in weight names (names containing "row"
+        # get "column", all others get "column" replaced by "row").
+        prs3 = lambda s: s.replace("row", "column") if "row" in s else s.replace("column", "row")
+        model_args = {pra(arg[0]): arg[1] for arg in vars(model_data["args"]).items()}
+        model_state = {prs1(prs2(prs3(arg[0]))): arg[1] for arg in model_data["model"].items()}
+        if model_args.get("embed_positions_msa", False):
+            emb_dim = model_state["msa_position_embedding"].size(-1)
+            model_args["embed_positions_msa_dim"] = emb_dim  # initial release, bug: emb_dim==1
+        model_type = esm.MSATransformer
+    elif "invariant_gvp" in model_data["args"].arch:
+        import esm.inverse_folding
+        model_type = esm.inverse_folding.gvp_transformer.GVPTransformerModel
+        model_args = vars(model_data["args"])  # convert Namespace -> dict
+        def update_name(s):
+            # Map the module names in checkpoints trained with internal code to
+            # the updated module names in open source code
+            s = s.replace("W_v", "embed_graph.embed_node")
+            s = s.replace("W_e", "embed_graph.embed_edge")
+            s = s.replace("embed_scores.0", "embed_confidence")
+            s = s.replace("embed_score.", "embed_graph.embed_confidence.")
+            s = s.replace("seq_logits_projection.", "")
+            s = s.replace("embed_ingraham_features", "embed_dihedrals")
+            s = s.replace("embed_gvp_in_local_frame.0", "embed_gvp_output")
+            s = s.replace("embed_features_in_local_frame.0", "embed_gvp_input_features")
+            return s
+        # Entries whose name contains "version" are dropped from the state dict.
+        model_state = {
+            update_name(sname): svalue
+            for sname, svalue in model_data["model"].items()
+            if "version" not in sname
+        }
+    else:
+        raise ValueError("Unknown architecture selected")
+    # Instantiate the selected class from the (renamed) checkpoint arguments.
+    model = model_type(
+        Namespace(**model_args),
+        alphabet,
+    )
+    return model, alphabet, model_state
+def _load_model_and_alphabet_core_v2(model_data):
+    """Build an (untrained) ESM-2 model, its alphabet and a renamed state dict.
+    The model hyper-parameters are read from `model_data["cfg"]["model"]` and
+    the weights from `model_data["model"]`.
+    Returns:
+        `(model, alphabet, state_dict)`. The weights in `state_dict` are not
+        loaded into `model` here.
+    """
+    def upgrade_state_dict(state_dict):
+        """Removes a leading 'encoder.sentence_encoder.' or 'encoder.' from every key."""
+        prefixes = ["encoder.sentence_encoder.", "encoder."]
+        # Group the alternatives so that "^" anchors every one of them (not just
+        # the first) and escape them so that "." only matches a literal dot.
+        pattern = re.compile("^(?:" + "|".join(re.escape(p) for p in prefixes) + ")")
+        state_dict = {pattern.sub("", name): param for name, param in state_dict.items()}
+        return state_dict
+    cfg = model_data["cfg"]["model"]
+    state_dict = model_data["model"]
+    state_dict = upgrade_state_dict(state_dict)
+    alphabet = esm.data.Alphabet.from_architecture("ESM-1b")
+    model = ESM2(
+        num_layers=cfg.encoder_layers,
+        embed_dim=cfg.encoder_embed_dim,
+        attention_heads=cfg.encoder_attention_heads,
+        alphabet=alphabet,
+        token_dropout=cfg.token_dropout,
+    )
+    return model, alphabet, state_dict
+def load_model_and_alphabet_core(model_name, model_data, regression_data=None):
+    """Instantiate the model for `model_name`, load its weights and return `(model, alphabet)`.
+    When `regression_data` is given, its weights are merged into
+    `model_data["model"]` and the state dict is loaded strictly. Otherwise the
+    keys are checked here, tolerating only missing contact-regression weights
+    (a warning is emitted for those), and the load is non-strict.
+    Raises:
+        RuntimeError: without regression data, if keys other than the
+            contact-regression ones are missing, or unexpected keys are found.
+    """
+    regression_loaded = regression_data is not None
+    if regression_loaded:
+        model_data["model"].update(regression_data["model"])
+    if model_name.startswith("esm2"):
+        build = _load_model_and_alphabet_core_v2
+    else:
+        build = _load_model_and_alphabet_core_v1
+    model, alphabet, model_state = build(model_data)
+    expected_keys = set(model.state_dict().keys())
+    found_keys = set(model_state.keys())
+    if not regression_loaded:
+        optional_keys = {"contact_head.regression.weight", "contact_head.regression.bias"}
+        absent = (expected_keys - found_keys) - optional_keys
+        surplus = found_keys - expected_keys
+        problems = []
+        if absent:
+            problems.append(f"Missing key(s) in state_dict: {absent}.")
+        if surplus:
+            problems.append(f"Unexpected key(s) in state_dict: {surplus}.")
+        if problems:
+            details = "\n\t".join(problems)
+            raise RuntimeError(
+                "Error(s) in loading state_dict for {}:\n\t{}".format(
+                    model.__class__.__name__, details
+                )
+            )
+        if optional_keys - found_keys:
+            warnings.warn(
+                "Regression weights not found, predicting contacts will not produce correct results."
+            )
+    model.load_state_dict(model_state, strict=regression_loaded)
+    return model, alphabet
+def esm1_t34_670M_UR50S():
+    """34 layer transformer model with 670M params, trained on Uniref50 Sparse.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm1_t34_670M_UR50S")
+def esm1_t34_670M_UR50D():
+    """34 layer transformer model with 670M params, trained on Uniref50 Dense.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm1_t34_670M_UR50D")
+def esm1_t34_670M_UR100():
+    """34 layer transformer model with 670M params, trained on Uniref100.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm1_t34_670M_UR100")
+def esm1_t12_85M_UR50S():
+    """12 layer transformer model with 85M params, trained on Uniref50 Sparse.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm1_t12_85M_UR50S")
+def esm1_t6_43M_UR50S():
+    """6 layer transformer model with 43M params, trained on Uniref50 Sparse.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm1_t6_43M_UR50S")
+def esm1b_t33_650M_UR50S():
+    """33 layer transformer model with 650M params, trained on Uniref50 Sparse.
+    This is our best performing model, which will be described in a future publication.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm1b_t33_650M_UR50S")
+def esm_msa1_t12_100M_UR50S():
+    """Load the "esm_msa1_t12_100M_UR50S" checkpoint (superseded by ESM-MSA-1b).
+    Emits a warning about a minor bug in this model's positional embeddings
+    that points to `esm_msa1b_t12_100M_UR50S`, then loads the checkpoint
+    through `load_model_and_alphabet_hub`.
+    Returns a tuple of (Model, Alphabet).
+    """
+    warnings.warn(
+        "This model had a minor bug in the positional embeddings, "
+        "please use ESM-MSA-1b: esm.pretrained.esm_msa1b_t12_100M_UR50S()",
+    )
+    return load_model_and_alphabet_hub("esm_msa1_t12_100M_UR50S")
+def esm_msa1b_t12_100M_UR50S():
+    """Load the "esm_msa1b_t12_100M_UR50S" (ESM-MSA-1b) checkpoint.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm_msa1b_t12_100M_UR50S")
+def esm1v_t33_650M_UR90S():
+    """33 layer transformer model with 650M params, trained on Uniref90.
+    This is model 1 of a 5 model ensemble.
+    It loads the same checkpoint name as `esm1v_t33_650M_UR90S_1`.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm1v_t33_650M_UR90S_1")
+def esm1v_t33_650M_UR90S_1():
+    """33 layer transformer model with 650M params, trained on Uniref90.
+    This is model 1 of a 5 model ensemble.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`;
+    no contact-regression weights are expected for "esm1v" names.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm1v_t33_650M_UR90S_1")
+def esm1v_t33_650M_UR90S_2():
+    """33 layer transformer model with 650M params, trained on Uniref90.
+    This is model 2 of a 5 model ensemble.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`;
+    no contact-regression weights are expected for "esm1v" names.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm1v_t33_650M_UR90S_2")
+def esm1v_t33_650M_UR90S_3():
+    """33 layer transformer model with 650M params, trained on Uniref90.
+    This is model 3 of a 5 model ensemble.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`;
+    no contact-regression weights are expected for "esm1v" names.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm1v_t33_650M_UR90S_3")
+def esm1v_t33_650M_UR90S_4():
+    """33 layer transformer model with 650M params, trained on Uniref90.
+    This is model 4 of a 5 model ensemble.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`;
+    no contact-regression weights are expected for "esm1v" names.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm1v_t33_650M_UR90S_4")
+def esm1v_t33_650M_UR90S_5():
+    """33 layer transformer model with 650M params, trained on Uniref90.
+    This is model 5 of a 5 model ensemble.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`;
+    no contact-regression weights are expected for "esm1v" names.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm1v_t33_650M_UR90S_5")
+def esm_if1_gvp4_t16_142M_UR50():
+    """Inverse folding model with 142M params, with 4 GVP-GNN layers, 8
+    Transformer encoder layers, and 8 Transformer decoder layers, trained on
+    CATH structures and 12 million alphafold2 predicted structures from UniRef50
+    sequences.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`;
+    no contact-regression weights are expected for "esm_if" names.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm_if1_gvp4_t16_142M_UR50")
+def esm2_t6_8M_UR50D():
+    """6 layer ESM-2 model with 8M params, trained on UniRef50.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm2_t6_8M_UR50D")
+def esm2_t12_35M_UR50D():
+    """12 layer ESM-2 model with 35M params, trained on UniRef50.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm2_t12_35M_UR50D")
+def esm2_t30_150M_UR50D():
+    """30 layer ESM-2 model with 150M params, trained on UniRef50.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm2_t30_150M_UR50D")
+def esm2_t33_650M_UR50D():
+    """33 layer ESM-2 model with 650M params, trained on UniRef50.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm2_t33_650M_UR50D")
+def esm2_t36_3B_UR50D():
+    """36 layer ESM-2 model with 3B params, trained on UniRef50.
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm2_t36_3B_UR50D")
+def esm2_t48_15B_UR50D():
+    """48 layer ESM-2 model with 15B params, trained on UniRef50.
+    If you have OOM while loading this model, please refer to README
+    on how to employ FSDP and ZeRO CPU offloading
+    The checkpoint is fetched by name through `load_model_and_alphabet_hub`.
+    Returns a tuple of (Model, Alphabet).
+    """
+    return load_model_and_alphabet_hub("esm2_t48_15B_UR50D")

esm/esm/rotary_embedding.py ADDED Viewed

	@@ -0,0 +1,69 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Tuple
+import torch
+def rotate_half(x):
+    """Split the last dimension of `x` in two halves (a, b) and return (-b, a) concatenated."""
+    first_half, second_half = x.chunk(2, dim=-1)
+    rotated = (-second_half, first_half)
+    return torch.cat(rotated, dim=-1)
+def apply_rotary_pos_emb(x, cos, sin):
+    """Rotate `x` with the `cos` / `sin` tables, which are first cut to the length of `x`'s second-to-last dimension."""
+    seq_len = x.shape[-2]
+    cos_table = cos[:, :seq_len, :]
+    sin_table = sin[:, :seq_len, :]
+    return (x * cos_table) + (rotate_half(x) * sin_table)
+class RotaryEmbedding(torch.nn.Module):
+    """
+    Rotary position embeddings as introduced by RoFormer_ (Su et al.).
+    The key idea of the method is that queries and keys are transformed by
+    rotation matrices which depend on the relative positions.
+    Other implementations can be found in the Rotary Transformer repo_ and in
+    GPT-NeoX_; GPT-NeoX was an inspiration for this one.
+    .. _RoFormer: https://arxiv.org/abs/2104.09864
+    .. _repo: https://github.com/ZhuiyiTechnology/roformer
+    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
+    .. warning: Please note that this embedding is not registered on purpose, as it is transformative
+        (it does not create the embedding dimension) and will likely be picked up (imported) on an ad-hoc basis
+    """
+    def __init__(self, dim: int, *_, **__):
+        super().__init__()
+        # Inverse frequencies are stored as a (non trainable) buffer.
+        exponents = torch.arange(0, dim, 2).float() / dim
+        self.register_buffer("inv_freq", 1.0 / (10000 ** exponents))
+        # The cos/sin tables are built lazily on the first forward pass.
+        self._seq_len_cached = None
+        self._cos_cached = None
+        self._sin_cached = None
+    def _update_cos_sin_tables(self, x, seq_dimension=1):
+        """Return the cached (cos, sin) tables, rebuilding them when the
+        sequence length of `x` or its device differs from the cached ones
+        (the device may change due to tracing, for instance)."""
+        seq_len = x.shape[seq_dimension]
+        stale = seq_len != self._seq_len_cached or self._cos_cached.device != x.device
+        if stale:
+            self._seq_len_cached = seq_len
+            positions = torch.arange(x.shape[seq_dimension], device=x.device).type_as(
+                self.inv_freq
+            )
+            angles = torch.einsum("i,j->ij", positions, self.inv_freq)
+            table = torch.cat((angles, angles), dim=-1).to(x.device)
+            self._cos_cached = table.cos()[None, :, :]
+            self._sin_cached = table.sin()[None, :, :]
+        return self._cos_cached, self._sin_cached
+    def forward(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Apply the rotary embedding to `q` and `k`; the tables are sized from `k`."""
+        cos, sin = self._update_cos_sin_tables(k, seq_dimension=-2)
+        self._cos_cached, self._sin_cached = cos, sin
+        rotated_q = apply_rotary_pos_emb(q, self._cos_cached, self._sin_cached)
+        rotated_k = apply_rotary_pos_emb(k, self._cos_cached, self._sin_cached)
+        return (rotated_q, rotated_k)

esm/esm/version.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+# Package version string; `esm/__init__.py` re-exports it as `__version__`.
+version = "1.0.2"

esm/scripts/extract.py ADDED Viewed

	@@ -0,0 +1,136 @@

+#!/usr/bin/env python3 -u
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import argparse
+import pathlib
+import torch
+from esm import Alphabet, FastaBatchedDataset, ProteinBertModel, pretrained, MSATransformer
+def create_parser():
+    """Build the command-line parser of the extraction script.
+    Positional arguments: model location, FASTA file and output directory.
+    Options select the batch size, the layers and representations to export,
+    the truncation length and whether to avoid the GPU.
+    """
+    parser = argparse.ArgumentParser(
+        description="Extract per-token representations and model outputs for sequences in a FASTA file"  # noqa
+    )
+    add_arg = parser.add_argument
+    # Positional arguments
+    add_arg(
+        "model_location",
+        type=str,
+        help="PyTorch model file OR name of pretrained model to download (see README for models)",
+    )
+    add_arg("fasta_file", type=pathlib.Path, help="FASTA file on which to extract representations")
+    add_arg("output_dir", type=pathlib.Path, help="output directory for extracted representations")
+    # Options
+    add_arg("--toks_per_batch", type=int, default=4096, help="maximum batch size")
+    add_arg(
+        "--repr_layers",
+        type=int,
+        default=[-1],
+        nargs="+",
+        help="layers indices from which to extract representations (0 to num_layers, inclusive)",
+    )
+    add_arg(
+        "--include",
+        type=str,
+        nargs="+",
+        choices=["mean", "per_tok", "bos", "contacts"],
+        help="specify which representations to return",
+        required=True,
+    )
+    add_arg(
+        "--truncation_seq_length",
+        type=int,
+        default=1022,
+        help="truncate sequences longer than the given value",
+    )
+    add_arg("--nogpu", action="store_true", help="Do not use GPU even if available")
+    return parser
+def main(args):
+    """Run the model over the FASTA file and save one ".pt" file per sequence.
+    `args` is the namespace produced by `create_parser()`. For every sequence
+    label a dict with the requested representations (and contacts) is written
+    to `args.output_dir / f"{label}.pt"`.
+    Raises:
+        ValueError: if the loaded model is an MSA Transformer.
+    """
+    model, alphabet = pretrained.load_model_and_alphabet(args.model_location)
+    model.eval()
+    if isinstance(model, MSATransformer):
+        raise ValueError(
+            "This script currently does not handle models with MSA input (MSA Transformer)."
+        )
+    if torch.cuda.is_available() and not args.nogpu:
+        model = model.cuda()
+        print("Transferred model to GPU")
+    dataset = FastaBatchedDataset.from_file(args.fasta_file)
+    batches = dataset.get_batch_indices(args.toks_per_batch, extra_toks_per_seq=1)
+    collate = alphabet.get_batch_converter(args.truncation_seq_length)
+    data_loader = torch.utils.data.DataLoader(dataset, collate_fn=collate, batch_sampler=batches)
+    print(f"Read {args.fasta_file} with {len(dataset)} sequences")
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+    return_contacts = "contacts" in args.include
+    # Negative layer indices count from the end; map them to 0..num_layers.
+    assert all(-(model.num_layers + 1) <= i <= model.num_layers for i in args.repr_layers)
+    layer_count = model.num_layers + 1
+    repr_layers = [(i + layer_count) % layer_count for i in args.repr_layers]
+    with torch.no_grad():
+        for batch_idx, (labels, strs, toks) in enumerate(data_loader):
+            print(
+                f"Processing {batch_idx + 1} of {len(batches)} batches ({toks.size(0)} sequences)"
+            )
+            if torch.cuda.is_available() and not args.nogpu:
+                toks = toks.to(device="cuda", non_blocking=True)
+            out = model(toks, repr_layers=repr_layers, return_contacts=return_contacts)
+            logits = out["logits"].to(device="cpu")
+            representations = {}
+            for layer, t in out["representations"].items():
+                representations[layer] = t.to(device="cpu")
+            if return_contacts:
+                contacts = out["contacts"].to(device="cpu")
+            for i, label in enumerate(labels):
+                args.output_file = args.output_dir / f"{label}.pt"
+                args.output_file.parent.mkdir(parents=True, exist_ok=True)
+                seq_len = len(strs[i])
+                result = {"label": label}
+                # Call clone on tensors to ensure tensors are not views into a larger representation
+                # See https://github.com/pytorch/pytorch/issues/1995
+                if "per_tok" in args.include:
+                    per_tok = {}
+                    for layer, t in representations.items():
+                        per_tok[layer] = t[i, 1 : seq_len + 1].clone()
+                    result["representations"] = per_tok
+                if "mean" in args.include:
+                    averaged = {}
+                    for layer, t in representations.items():
+                        averaged[layer] = t[i, 1 : seq_len + 1].mean(0).clone()
+                    result["mean_representations"] = averaged
+                if "bos" in args.include:
+                    first_token = {}
+                    for layer, t in representations.items():
+                        first_token[layer] = t[i, 0].clone()
+                    result["bos_representations"] = first_token
+                if return_contacts:
+                    result["contacts"] = contacts[i, :seq_len, :seq_len].clone()
+                torch.save(result, args.output_file)
+# Script entry point: parse the command line and run the extraction.
+if __name__ == "__main__":
+    parser = create_parser()
+    args = parser.parse_args()
+    main(args)

requirements.txt CHANGED Viewed

@@ -18,8 +18,8 @@ spyrmsd==0.5.2
 sympy==1.11.1
 pytorch==1.12.1
 numpy==1.23.1
-torchaudio=0.12.1
-torchvision=0.13.1
 rdkit-pypi==2022.3.5
 torch-scatter
 torch-sparse

 sympy==1.11.1
 pytorch==1.12.1
 numpy==1.23.1
+torchaudio==0.12.1
+torchvision==0.13.1
 rdkit-pypi==2022.3.5
 torch-scatter
 torch-sparse