Upload codebase

Browse files

Files changed (12) hide show

config.json +75 -0
configuration_transnormer.py +73 -0
generation_config.json +10 -0
lightning_attention.py +540 -0
lightning_attention2.py +540 -0
modeling_transnormer.py +943 -0
norm.py +44 -0
srmsnorm_triton.py +202 -0
tokenization_transnormerllm.py +240 -0
tokenizer_config.json +10 -0
transnormer_100k.tiktoken +0 -0
utils.py +166 -0

config.json ADDED Viewed

	@@ -0,0 +1,75 @@

+{
+  "_name_or_path": "15b-50B",
+  "add_bos_token": false,
+  "architectures": [
+    "TransnormerForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_transnormer.TransnormerConfig",
+    "AutoModelForCausalLM": "modeling_transnormer.TransnormerForCausalLM"
+  },
+  "bias": false,
+  "bos_token_id": 100261,
+  "decoder_attention_heads": 40,
+  "decoder_embed_dim": 5120,
+  "decoder_layers": 42,
+  "eos_token_id": 100257,
+  "gate_dim": 16,
+  "glu_dim": 15360,
+  "hidden_dim": 5120,
+  "init_std": 0.02,
+  "linear_act_fun": "swish",
+  "linear_use_lrpe": 0,
+  "linear_use_lrpe_list": [
+    1,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0
+  ],
+  "model_type": "transnormer",
+  "no_scale_embedding": false,
+  "norm_type": "simplermsnorm",
+  "pad_token_id": 100262,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.33.1",
+  "use_cache": true,
+  "vocab_size": 100280
+}

configuration_transnormer.py ADDED Viewed

	@@ -0,0 +1,73 @@

+#    Copyright 2024 OpenNLPLab
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+# coding=utf-8
+""" Transnormer configuration"""
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class TransnormerConfig(PretrainedConfig):
+    model_type = "transnormer"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        vocab_size=64000,
+        use_cache=True,
+        init_std=0.02,
+        # model config
+        decoder_embed_dim=1024,
+        decoder_layers=24,
+        decoder_attention_heads=8,
+        no_scale_embedding=False,
+        add_bos_token=False,
+        norm_type="simplermsnorm",
+        linear_use_lrpe_list=[],
+        hidden_dim=1024,
+        linear_act_fun="silu",
+        glu_dim=2816,
+        bias=False,
+        gate_dim=16,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+        # hf origin
+        self.vocab_size = vocab_size
+        self.use_cache = use_cache
+        self.init_std = init_std
+        # add
+        self.decoder_embed_dim = decoder_embed_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.no_scale_embedding = no_scale_embedding
+        self.add_bos_token = add_bos_token
+        self.norm_type = norm_type
+        self.linear_use_lrpe_list = linear_use_lrpe_list
+        self.hidden_dim = hidden_dim
+        self.linear_act_fun = linear_act_fun
+        self.glu_dim = glu_dim
+        self.bias = bias
+        self.gate_dim = gate_dim

generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 100261,
+  "do_sample": true,
+  "eos_token_id": 100257,
+  "max_new_tokens": 8192,
+  "pad_token_id": 100262,
+  "repetition_penalty": 1.03,
+  "transformers_version": "4.33.1"
+}

lightning_attention.py ADDED Viewed

	@@ -0,0 +1,540 @@

+#    Copyright 2024 OpenNLPLab
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+# coding=utf-8
+import torch
+import triton
+import triton.language as tl
+@triton.jit
+def _fwd_kernel(
+    Q,
+    K,
+    V,
+    Out,
+    S,
+    stride_qz,
+    stride_qh,
+    stride_qm,
+    stride_qk,
+    stride_kz,
+    stride_kh,
+    stride_kn,
+    stride_kk,
+    stride_vz,
+    stride_vh,
+    stride_vn,
+    stride_ve,
+    stride_oz,
+    stride_oh,
+    stride_om,
+    stride_oe,
+    stride_sh,
+    Z,
+    H,
+    N_CTX,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL_QK: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_DMODEL_V: tl.constexpr,
+    IS_CAUSAL: tl.constexpr,
+    USE_DECAY: tl.constexpr,
+):
+    # Forward kernel. Each program produces one BLOCK_M-row tile of Out for
+    # one flattened (batch, head) index:
+    #     acc = sum over key blocks of (w * (q @ k^T)) @ v
+    # where w is the causal mask or the decay weight built in the loop below.
+    # No softmax or other normalisation is applied.
+    # Z and the batch strides (stride_qz, stride_kz, stride_vz, stride_oz) are
+    # accepted but never read in this kernel.
+    # grid axis 0: query-row tile, grid axis 1: flattened batch * head index
+    start_m = tl.program_id(0)
+    off_hz = tl.program_id(1)
+    # head index; only used to select this head's entry of S
+    off_h = off_hz % H
+    # initialize offsets
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_n = tl.arange(0, BLOCK_N)
+    offs_k = tl.arange(0, BLOCK_DMODEL_QK)
+    offs_e = tl.arange(0, BLOCK_DMODEL_V)
+    # get current offset of q k v
+    # NOTE(review): the (batch, head) slice is addressed as off_hz * stride_*h,
+    # which is only correct when stride_*z == H * stride_*h (e.g. contiguous
+    # tensors) -- confirm callers guarantee this.
+    off_q = (off_hz * stride_qh + offs_m[:, None] * stride_qm +
+             offs_k[None, :] * stride_qk)
+    off_k = (off_hz * stride_kh + offs_n[:, None] * stride_kn +
+             offs_k[None, :] * stride_kk)
+    off_v = (off_hz * stride_vh + offs_n[:, None] * stride_vn +
+             offs_e[None, :] * stride_ve)
+    off_o = (off_hz * stride_oh + offs_m[:, None] * stride_om +
+             offs_e[None, :] * stride_oe)
+    # Initialize pointers to Q, K, V (K/V pointers refer to the first key
+    # block; the loop adds start_n * stride to reach later blocks)
+    q_ptrs = Q + off_q
+    k_ptrs = K + off_k
+    v_ptrs = V + off_v
+    # float32 accumulator for the output tile
+    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_V], dtype=tl.float32)
+    # load q once; it is reused for every key block. Rows >= N_CTX read as 0.
+    q = tl.load(q_ptrs, mask=offs_m[:, None] < N_CTX, other=0.0)
+    # loop over k, v and update accumulator
+    lo = 0
+    # causal: stop after the key rows covered by this query tile;
+    # otherwise visit every key row
+    hi = (start_m + 1) * BLOCK_M if IS_CAUSAL else N_CTX
+    for start_n in range(lo, hi, BLOCK_N):
+        # -- load k, v -- (rows >= N_CTX read as 0)
+        k = tl.load(
+            k_ptrs + start_n * stride_kn,
+            mask=(start_n + offs_n)[:, None] < N_CTX,
+            other=0.0,
+        )
+        v = tl.load(
+            v_ptrs + start_n * stride_vn,
+            mask=(start_n + offs_n)[:, None] < N_CTX,
+            other=0.0,
+        )
+        # -- compute qk --- (float32 scores for this BLOCK_M x BLOCK_N tile)
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        # equivalent to the older tl.dot(q, k, trans_b=True) spelling
+        qk += tl.dot(q, tl.trans(k))
+        if IS_CAUSAL:
+            # index[i, j] = query position - key position
+            index = offs_m[:, None] - (start_n + offs_n[None, :])
+            if USE_DECAY:
+                # one decay rate per head, read from S
+                S_block_ptr = S + off_h * stride_sh
+                s = tl.load(S_block_ptr)
+                s_index = s * index
+                # weight = exp(-s * index) where s * index >= 0, else
+                # exp(-inf) = 0.
+                # NOTE(review): the mask tests the sign of s * index, not of
+                # index, so it equals the causal mask only when s > 0.
+                s_index = tl.where(s_index >= 0, -s_index, float("-inf"))
+                qk = tl.exp(s_index) * qk
+            else:
+                # plain causal mask: drop keys that come after the query
+                qk = tl.where(index >= 0, qk, 0)
+        # USE_DECAY has no effect when IS_CAUSAL is false
+        acc += tl.dot(qk, v.to(qk.dtype))
+    out_ptrs = Out + off_o
+    # cast back to the input dtype; rows >= N_CTX are not written
+    tl.store(out_ptrs, acc.to(q.dtype), mask=offs_m[:, None] < N_CTX)
+@triton.jit
+def _bwd_kernel_kv(
+    Q,
+    K,
+    V,
+    S,
+    DO,
+    DQ,
+    DK,
+    DV,
+    stride_qz,
+    stride_qh,
+    stride_qm,
+    stride_qk,
+    stride_kz,
+    stride_kh,
+    stride_kn,
+    stride_kk,
+    stride_vz,
+    stride_vh,
+    stride_vn,
+    stride_ve,
+    stride_oz,
+    stride_oh,
+    stride_om,
+    stride_oe,
+    stride_sh,
+    Z,
+    H,
+    N_CTX,
+    num_block,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL_QK: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_DMODEL_V: tl.constexpr,
+    CAUSAL: tl.constexpr,
+    USE_DECAY: tl.constexpr,
+):
+    # Backward kernel for dK and dV. Each program owns one BLOCK_N-row block
+    # of keys/values for one (batch, head) slice and loops over query-row
+    # blocks, accumulating
+    #     dv += p^T @ do        with p  = w * (q @ k^T)
+    #     dk += dp^T @ q        with dp = w * (do @ v^T)
+    # where w is the same mask / decay weight used in the forward pass.
+    # DQ is offset and dq_ptrs is built, but nothing is written to DQ here.
+    # grid axis 0: key/value block, grid axis 1: flattened batch * head index
+    start_n = tl.program_id(0)
+    off_hz = tl.program_id(1)
+    off_z = off_hz // H
+    off_h = off_hz % H
+    # offset pointers for batch/head
+    Q += off_z * stride_qz + off_h * stride_qh
+    K += off_z * stride_kz + off_h * stride_kh
+    V += off_z * stride_vz + off_h * stride_vh
+    DO += off_z * stride_oz + off_h * stride_oh
+    DQ += off_z * stride_qz + off_h * stride_qh
+    DK += off_z * stride_kz + off_h * stride_kh
+    DV += off_z * stride_vz + off_h * stride_vh
+    # start of q
+    # NOTE(review): for the causal case the first query row is taken as
+    # start_n * BLOCK_M, which coincides with this program's first key row
+    # (start_n * BLOCK_N) only when BLOCK_M == BLOCK_N.
+    if CAUSAL:
+        lo = start_n * BLOCK_M
+    else:
+        lo = 0
+    # initialize row/col offsets
+    # sequence offset
+    offs_qm = lo + tl.arange(0, BLOCK_M)
+    offs_kvn = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    # feature offset
+    offs_qkk = tl.arange(0, BLOCK_DMODEL_QK)
+    offs_ve = tl.arange(0, BLOCK_DMODEL_V)
+    # row block index
+    offs_m = tl.arange(0, BLOCK_M)
+    # initialize pointers to value-like data
+    q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_qkk[None, :] * stride_qk)
+    k_ptrs = K + (offs_kvn[:, None] * stride_kn +
+                  offs_qkk[None, :] * stride_kk)
+    v_ptrs = V + (offs_kvn[:, None] * stride_vn + offs_ve[None, :] * stride_ve)
+    do_ptrs = DO + (offs_qm[:, None] * stride_om +
+                    offs_ve[None, :] * stride_oe)
+    # dq_ptrs is never used in this kernel
+    dq_ptrs = DQ + (offs_qm[:, None] * stride_qm +
+                    offs_qkk[None, :] * stride_qk)
+    # initialize dv and dk (float32 accumulators)
+    dv = tl.zeros([BLOCK_N, BLOCK_DMODEL_V], dtype=tl.float32)
+    dk = tl.zeros([BLOCK_N, BLOCK_DMODEL_QK], dtype=tl.float32)
+    # k and v are loaded once and reused for every query block;
+    # rows >= N_CTX read as 0
+    k = tl.load(k_ptrs, mask=offs_kvn[:, None] < N_CTX, other=0.0)
+    v = tl.load(v_ptrs, mask=offs_kvn[:, None] < N_CTX, other=0.0)
+    # loop over query-row blocks, from lo up to num_block * BLOCK_M
+    for start_m in range(lo, num_block * BLOCK_M, BLOCK_M):
+        # absolute query positions of the current block (matches q_ptrs /
+        # do_ptrs, which advance by BLOCK_M rows per iteration)
+        offs_m_curr = start_m + offs_m
+        # load q for this block and recompute the forward scores
+        q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < N_CTX, other=0.0)
+        qk = tl.dot(q, tl.trans(k))
+        # equivalent to the older tl.dot(q, k, trans_b=True) spelling
+        if CAUSAL:
+            # index[i, j] = query position - key position
+            index = offs_m_curr[:, None] - offs_kvn[None, :]
+            if USE_DECAY:
+                S_block_ptr = S + off_h * stride_sh
+                s = tl.load(S_block_ptr)
+                s_index = s * index
+                # exp(-s * index) where s * index >= 0, else exp(-inf) = 0
+                s_index = tl.where(s_index >= 0, -s_index, float("-inf"))
+                # s is rebound from the scalar decay rate to the weight
+                # matrix; it is reused for dp below
+                s = tl.exp(s_index)
+                qk = qk * s
+            else:
+                qk = tl.where(index >= 0, qk, 0)
+        p = qk
+        # compute dv
+        do = tl.load(do_ptrs, mask=offs_m_curr[:, None] < N_CTX, other=0.0)
+        dv += tl.dot(tl.trans(p.to(do.dtype)), do)
+        # dp = do @ v^T, then apply the same weight / mask as the forward pass
+        dp = tl.dot(do, tl.trans(v).to(do.dtype))
+        if CAUSAL:
+            if USE_DECAY:
+                dp = dp * s
+            else:
+                dp = tl.where(index >= 0, dp, 0)
+        dk += tl.dot(tl.trans(dp.to(q.dtype)), q).to(tl.float32)
+        # increment pointers
+        q_ptrs += BLOCK_M * stride_qm
+        do_ptrs += BLOCK_M * stride_om
+    # write-back (rows >= N_CTX are not written)
+    dv_ptrs = DV + (offs_kvn[:, None] * stride_vn +
+                    offs_ve[None, :] * stride_ve)
+    dk_ptrs = DK + (offs_kvn[:, None] * stride_kn +
+                    offs_qkk[None, :] * stride_kk)
+    tl.store(dv_ptrs, dv, mask=offs_kvn[:, None] < N_CTX)
+    tl.store(dk_ptrs, dk, mask=offs_kvn[:, None] < N_CTX)
+@triton.jit
+def _bwd_kernel_q(
+    Q,
+    K,
+    V,
+    S,
+    DO,
+    DQ,
+    DK,
+    DV,
+    stride_qz,
+    stride_qh,
+    stride_qm,
+    stride_qk,
+    stride_kz,
+    stride_kh,
+    stride_kn,
+    stride_kk,
+    stride_vz,
+    stride_vh,
+    stride_vn,
+    stride_ve,
+    stride_oz,
+    stride_oh,
+    stride_om,
+    stride_oe,
+    stride_sh,
+    Z,
+    H,
+    N_CTX,
+    num_block,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL_QK: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_DMODEL_V: tl.constexpr,
+    CAUSAL: tl.constexpr,
+    USE_DECAY: tl.constexpr,
+):
+    # Backward kernel for dQ. Each program owns one BLOCK_M-row block of
+    # queries for one (batch, head) slice and accumulates
+    #     dq += (w * (do @ v^T)) @ k
+    # over key/value blocks, where w is the forward mask / decay weight.
+    # Q, DK and DV are accepted but never read or written in this kernel.
+    # grid axis 0: query-row block, grid axis 1: flattened batch * head index
+    start_m = tl.program_id(0)
+    off_hz = tl.program_id(1)
+    off_z = off_hz // H
+    off_h = off_hz % H
+    # offset pointers for batch/head
+    K += off_z * stride_kz + off_h * stride_kh
+    V += off_z * stride_vz + off_h * stride_vh
+    DO += off_z * stride_oz + off_h * stride_oh
+    DQ += off_z * stride_qz + off_h * stride_qh
+    # feature offset
+    offs_qkk = tl.arange(0, BLOCK_DMODEL_QK)
+    offs_ve = tl.arange(0, BLOCK_DMODEL_V)
+    # row index within a block
+    offs_m = tl.arange(0, BLOCK_M)
+    # absolute query rows handled by this program
+    offs_qm = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    # do
+    do_ptrs = DO + (offs_qm[:, None] * stride_om +
+                    offs_ve[None, :] * stride_oe)
+    dq_ptrs = DQ + (offs_qm[:, None] * stride_qm +
+                    offs_qkk[None, :] * stride_qk)
+    # do is loaded once and reused for every key block; rows >= N_CTX read as 0
+    do = tl.load(do_ptrs, mask=offs_qm[:, None] < N_CTX, other=0.0)
+    dq = tl.zeros([BLOCK_M, BLOCK_DMODEL_QK], dtype=tl.float32)
+    # lo and hi are computed but not used: the loop below visits all
+    # num_block key blocks and relies on the mask / decay weight to zero the
+    # contribution of keys after the query in the causal case
+    lo = 0
+    hi = (start_m + 1) * BLOCK_M if CAUSAL else N_CTX
+    # same values as offs_qm
+    offs_m_curr = start_m * BLOCK_M + offs_m
+    for start_n in range(0, num_block):
+        offs_kvn = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
+        k_ptrs = K + (offs_kvn[:, None] * stride_kn +
+                      offs_qkk[None, :] * stride_kk)
+        v_ptrs = V + (offs_kvn[:, None] * stride_vn +
+                      offs_ve[None, :] * stride_ve)
+        # load the current key/value block; rows >= N_CTX read as 0
+        k = tl.load(k_ptrs, mask=offs_kvn[:, None] < N_CTX, other=0.0)
+        v = tl.load(v_ptrs, mask=offs_kvn[:, None] < N_CTX, other=0.0)
+        # dp = do vT
+        dp = tl.dot(do, tl.trans(v).to(do.dtype))
+        if CAUSAL:
+            # index[i, j] = query position - key position
+            index = offs_m_curr[:, None] - offs_kvn[None, :]
+            if USE_DECAY:
+                S_block_ptr = S + off_h * stride_sh
+                s = tl.load(S_block_ptr)
+                s_index = s * index
+                # exp(-s * index) where s * index >= 0, else exp(-inf) = 0
+                s_index = tl.where(s_index >= 0, -s_index, float("-inf"))
+                s = tl.exp(s_index)
+                dp = dp * s
+            else:
+                dp = tl.where(index >= 0, dp, 0)
+        # dq = dq + dp k
+        dq += tl.dot(dp.to(k.dtype), k)
+    # rows >= N_CTX are not written
+    tl.store(dq_ptrs, dq, mask=offs_qm[:, None] < N_CTX)
+class _attention(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, q, k, v, causal, s):
+        q = q.contiguous()
+        k = k.contiguous()
+        v = v.contiguous()
+        s = s.contiguous()
+        # only support for Ampere now
+        capability = torch.cuda.get_device_capability()
+        if capability[0] < 8:
+            raise RuntimeError(
+                "Lightning attention currently only supported for compute capability >= 80"
+            )
+        # shape constraints
+        Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
+        # right
+        o = torch.empty(
+            (q.shape[0], q.shape[1], q.shape[2], v.shape[-1]),
+            dtype=q.dtype,
+            device=q.device,
+        )
+        BLOCK_M = 128
+        BLOCK_N = 64
+        num_warps = 4 if Lk <= 64 else 8
+        num_stages = 1
+        grid = (triton.cdiv(q.shape[2], BLOCK_M), q.shape[0] * q.shape[1], 1)
+        use_decay = s.shape[0] > 0
+        _fwd_kernel[grid](
+            q,
+            k,
+            v,
+            o,
+            s,
+            q.stride(0),
+            q.stride(1),
+            q.stride(2),
+            q.stride(3),
+            k.stride(0),
+            k.stride(1),
+            k.stride(2),
+            k.stride(3),
+            v.stride(0),
+            v.stride(1),
+            v.stride(2),
+            v.stride(3),
+            o.stride(0),
+            o.stride(1),
+            o.stride(2),
+            o.stride(3),
+            s.stride(0),
+            q.shape[0],
+            q.shape[1],
+            q.shape[2],
+            BLOCK_M=BLOCK_M,
+            BLOCK_DMODEL_QK=Lk,
+            BLOCK_N=BLOCK_N,
+            BLOCK_DMODEL_V=Lv,
+            IS_CAUSAL=causal,
+            USE_DECAY=use_decay,
+            num_warps=num_warps,
+            num_stages=num_stages,
+        )
+        ctx.save_for_backward(q, k, v, s)
+        ctx.grid = grid
+        ctx.BLOCK_M = BLOCK_M
+        ctx.BLOCK_DMODEL_QK = Lk
+        ctx.BLOCK_N = BLOCK_N
+        ctx.BLOCK_DMODEL_V = Lv
+        ctx.causal = causal
+        ctx.use_decay = use_decay
+        return o
+    @staticmethod
+    def backward(ctx, do):
+        q, k, v, s = ctx.saved_tensors
+        BLOCK_M = 32
+        BLOCK_N = 32
+        num_warps = 4
+        num_stages = 1
+        do = do.contiguous()
+        dq = torch.zeros_like(q, dtype=torch.float32)
+        dk = torch.empty_like(k)
+        dv = torch.empty_like(v)
+        grid_kv = (triton.cdiv(k.shape[2],
+                               BLOCK_N), k.shape[0] * k.shape[1], 1)
+        _bwd_kernel_kv[grid_kv](
+            q,
+            k,
+            v,
+            s,
+            do,
+            dq,
+            dk,
+            dv,
+            q.stride(0),
+            q.stride(1),
+            q.stride(2),
+            q.stride(3),
+            k.stride(0),
+            k.stride(1),
+            k.stride(2),
+            k.stride(3),
+            v.stride(0),
+            v.stride(1),
+            v.stride(2),
+            v.stride(3),
+            do.stride(0),
+            do.stride(1),
+            do.stride(2),
+            do.stride(3),
+            s.stride(0),
+            q.shape[0],
+            q.shape[1],
+            q.shape[2],
+            grid_kv[0],
+            BLOCK_M=BLOCK_M,
+            BLOCK_DMODEL_QK=ctx.BLOCK_DMODEL_QK,
+            BLOCK_N=BLOCK_N,
+            BLOCK_DMODEL_V=ctx.BLOCK_DMODEL_V,
+            CAUSAL=ctx.causal,
+            USE_DECAY=ctx.use_decay,
+            num_warps=num_warps,
+            num_stages=num_stages,
+        )
+        grid_q = (triton.cdiv(q.shape[2], BLOCK_M), q.shape[0] * q.shape[1], 1)
+        _bwd_kernel_q[grid_q](
+            q,
+            k,
+            v,
+            s,
+            do,
+            dq,
+            dk,
+            dv,
+            q.stride(0),
+            q.stride(1),
+            q.stride(2),
+            q.stride(3),
+            k.stride(0),
+            k.stride(1),
+            k.stride(2),
+            k.stride(3),
+            v.stride(0),
+            v.stride(1),
+            v.stride(2),
+            v.stride(3),
+            do.stride(0),
+            do.stride(1),
+            do.stride(2),
+            do.stride(3),
+            s.stride(0),
+            q.shape[0],
+            q.shape[1],
+            q.shape[2],
+            grid_q[0],
+            BLOCK_M=BLOCK_M,
+            BLOCK_DMODEL_QK=ctx.BLOCK_DMODEL_QK,
+            BLOCK_N=BLOCK_N,
+            BLOCK_DMODEL_V=ctx.BLOCK_DMODEL_V,
+            CAUSAL=ctx.causal,
+            USE_DECAY=ctx.use_decay,
+            num_warps=num_warps,
+            num_stages=num_stages,
+        )
+        return dq.to(q.dtype), dk, dv, None, None
+attention = _attention.apply
+def lightning_attention(q, k, v, causal, ed):
+    d = q.shape[-1]
+    e = v.shape[-1]
+    # arr = f(d)
+    if d >= 128:
+        m = 128
+    else:
+        m = 64
+    arr = [m * i for i in range(d // m + 1)]
+    if arr[-1] != d:
+        arr.append(d)
+    n = len(arr)
+    output = 0
+    for i in range(n - 1):
+        s = arr[i]
+        e = arr[i + 1]
+        q1 = q[..., s:e]
+        k1 = k[..., s:e]
+        o = attention(q1, k1, v, causal, ed)
+        output = output + o
+    return output

lightning_attention2.py ADDED Viewed

	@@ -0,0 +1,540 @@

+#    Copyright 2024 OpenNLPLab
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+# coding=utf-8
+import torch
+import triton
+import triton.language as tl
+@triton.jit
+def _fwd_kernel(
+    Q,
+    K,
+    V,
+    Out,
+    S,
+    stride_qz,
+    stride_qh,
+    stride_qm,
+    stride_qk,
+    stride_kz,
+    stride_kh,
+    stride_kn,
+    stride_kk,
+    stride_vz,
+    stride_vh,
+    stride_vn,
+    stride_ve,
+    stride_oz,
+    stride_oh,
+    stride_om,
+    stride_oe,
+    stride_sh,
+    Z,
+    H,
+    N_CTX,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL_QK: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_DMODEL_V: tl.constexpr,
+    IS_CAUSAL: tl.constexpr,
+    USE_DECAY: tl.constexpr,
+):
+    # Forward kernel. Each program produces one BLOCK_M-row tile of Out for
+    # one flattened (batch, head) index:
+    #     acc = sum over key blocks of (w * (q @ k^T)) @ v
+    # where w is the causal mask or the decay weight built in the loop below.
+    # No softmax or other normalisation is applied.
+    # Z and the batch strides (stride_qz, stride_kz, stride_vz, stride_oz) are
+    # accepted but never read in this kernel.
+    # grid axis 0: query-row tile, grid axis 1: flattened batch * head index
+    start_m = tl.program_id(0)
+    off_hz = tl.program_id(1)
+    # head index; only used to select this head's entry of S
+    off_h = off_hz % H
+    # initialize offsets
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_n = tl.arange(0, BLOCK_N)
+    offs_k = tl.arange(0, BLOCK_DMODEL_QK)
+    offs_e = tl.arange(0, BLOCK_DMODEL_V)
+    # get current offset of q k v
+    # NOTE(review): the (batch, head) slice is addressed as off_hz * stride_*h,
+    # which is only correct when stride_*z == H * stride_*h (e.g. contiguous
+    # tensors) -- confirm callers guarantee this.
+    off_q = (off_hz * stride_qh + offs_m[:, None] * stride_qm +
+             offs_k[None, :] * stride_qk)
+    off_k = (off_hz * stride_kh + offs_n[:, None] * stride_kn +
+             offs_k[None, :] * stride_kk)
+    off_v = (off_hz * stride_vh + offs_n[:, None] * stride_vn +
+             offs_e[None, :] * stride_ve)
+    off_o = (off_hz * stride_oh + offs_m[:, None] * stride_om +
+             offs_e[None, :] * stride_oe)
+    # Initialize pointers to Q, K, V (K/V pointers refer to the first key
+    # block; the loop adds start_n * stride to reach later blocks)
+    q_ptrs = Q + off_q
+    k_ptrs = K + off_k
+    v_ptrs = V + off_v
+    # float32 accumulator for the output tile
+    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_V], dtype=tl.float32)
+    # load q once; it is reused for every key block. Rows >= N_CTX read as 0.
+    q = tl.load(q_ptrs, mask=offs_m[:, None] < N_CTX, other=0.0)
+    # loop over k, v and update accumulator
+    lo = 0
+    # causal: stop after the key rows covered by this query tile;
+    # otherwise visit every key row
+    hi = (start_m + 1) * BLOCK_M if IS_CAUSAL else N_CTX
+    for start_n in range(lo, hi, BLOCK_N):
+        # -- load k, v -- (rows >= N_CTX read as 0)
+        k = tl.load(
+            k_ptrs + start_n * stride_kn,
+            mask=(start_n + offs_n)[:, None] < N_CTX,
+            other=0.0,
+        )
+        v = tl.load(
+            v_ptrs + start_n * stride_vn,
+            mask=(start_n + offs_n)[:, None] < N_CTX,
+            other=0.0,
+        )
+        # -- compute qk --- (float32 scores for this BLOCK_M x BLOCK_N tile)
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        # equivalent to the older tl.dot(q, k, trans_b=True) spelling
+        qk += tl.dot(q, tl.trans(k))
+        if IS_CAUSAL:
+            # index[i, j] = query position - key position
+            index = offs_m[:, None] - (start_n + offs_n[None, :])
+            if USE_DECAY:
+                # one decay rate per head, read from S
+                S_block_ptr = S + off_h * stride_sh
+                s = tl.load(S_block_ptr)
+                s_index = s * index
+                # weight = exp(-s * index) where s * index >= 0, else
+                # exp(-inf) = 0.
+                # NOTE(review): the mask tests the sign of s * index, not of
+                # index, so it equals the causal mask only when s > 0.
+                s_index = tl.where(s_index >= 0, -s_index, float("-inf"))
+                qk = tl.exp(s_index) * qk
+            else:
+                # plain causal mask: drop keys that come after the query
+                qk = tl.where(index >= 0, qk, 0)
+        # USE_DECAY has no effect when IS_CAUSAL is false
+        acc += tl.dot(qk, v.to(qk.dtype))
+    out_ptrs = Out + off_o
+    # cast back to the input dtype; rows >= N_CTX are not written
+    tl.store(out_ptrs, acc.to(q.dtype), mask=offs_m[:, None] < N_CTX)
+@triton.jit
+def _bwd_kernel_kv(
+    Q,
+    K,
+    V,
+    S,
+    DO,
+    DQ,
+    DK,
+    DV,
+    stride_qz,
+    stride_qh,
+    stride_qm,
+    stride_qk,
+    stride_kz,
+    stride_kh,
+    stride_kn,
+    stride_kk,
+    stride_vz,
+    stride_vh,
+    stride_vn,
+    stride_ve,
+    stride_oz,
+    stride_oh,
+    stride_om,
+    stride_oe,
+    stride_sh,
+    Z,
+    H,
+    N_CTX,
+    num_block,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL_QK: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_DMODEL_V: tl.constexpr,
+    CAUSAL: tl.constexpr,
+    USE_DECAY: tl.constexpr,
+):
+    # Backward kernel for dK and dV. Each program owns one BLOCK_N-row block
+    # of keys/values for one (batch, head) slice and loops over query-row
+    # blocks, accumulating
+    #     dv += p^T @ do        with p  = w * (q @ k^T)
+    #     dk += dp^T @ q        with dp = w * (do @ v^T)
+    # where w is the same mask / decay weight used in the forward pass.
+    # DQ is offset and dq_ptrs is built, but nothing is written to DQ here.
+    # grid axis 0: key/value block, grid axis 1: flattened batch * head index
+    start_n = tl.program_id(0)
+    off_hz = tl.program_id(1)
+    off_z = off_hz // H
+    off_h = off_hz % H
+    # offset pointers for batch/head
+    Q += off_z * stride_qz + off_h * stride_qh
+    K += off_z * stride_kz + off_h * stride_kh
+    V += off_z * stride_vz + off_h * stride_vh
+    DO += off_z * stride_oz + off_h * stride_oh
+    DQ += off_z * stride_qz + off_h * stride_qh
+    DK += off_z * stride_kz + off_h * stride_kh
+    DV += off_z * stride_vz + off_h * stride_vh
+    # start of q
+    # NOTE(review): for the causal case the first query row is taken as
+    # start_n * BLOCK_M, which coincides with this program's first key row
+    # (start_n * BLOCK_N) only when BLOCK_M == BLOCK_N.
+    if CAUSAL:
+        lo = start_n * BLOCK_M
+    else:
+        lo = 0
+    # initialize row/col offsets
+    # sequence offset
+    offs_qm = lo + tl.arange(0, BLOCK_M)
+    offs_kvn = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    # feature offset
+    offs_qkk = tl.arange(0, BLOCK_DMODEL_QK)
+    offs_ve = tl.arange(0, BLOCK_DMODEL_V)
+    # row block index
+    offs_m = tl.arange(0, BLOCK_M)
+    # initialize pointers to value-like data
+    q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_qkk[None, :] * stride_qk)
+    k_ptrs = K + (offs_kvn[:, None] * stride_kn +
+                  offs_qkk[None, :] * stride_kk)
+    v_ptrs = V + (offs_kvn[:, None] * stride_vn + offs_ve[None, :] * stride_ve)
+    do_ptrs = DO + (offs_qm[:, None] * stride_om +
+                    offs_ve[None, :] * stride_oe)
+    # dq_ptrs is never used in this kernel
+    dq_ptrs = DQ + (offs_qm[:, None] * stride_qm +
+                    offs_qkk[None, :] * stride_qk)
+    # initialize dv and dk (float32 accumulators)
+    dv = tl.zeros([BLOCK_N, BLOCK_DMODEL_V], dtype=tl.float32)
+    dk = tl.zeros([BLOCK_N, BLOCK_DMODEL_QK], dtype=tl.float32)
+    # k and v are loaded once and reused for every query block;
+    # rows >= N_CTX read as 0
+    k = tl.load(k_ptrs, mask=offs_kvn[:, None] < N_CTX, other=0.0)
+    v = tl.load(v_ptrs, mask=offs_kvn[:, None] < N_CTX, other=0.0)
+    # loop over query-row blocks, from lo up to num_block * BLOCK_M
+    for start_m in range(lo, num_block * BLOCK_M, BLOCK_M):
+        # absolute query positions of the current block (matches q_ptrs /
+        # do_ptrs, which advance by BLOCK_M rows per iteration)
+        offs_m_curr = start_m + offs_m
+        # load q for this block and recompute the forward scores
+        q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < N_CTX, other=0.0)
+        qk = tl.dot(q, tl.trans(k))
+        # equivalent to the older tl.dot(q, k, trans_b=True) spelling
+        if CAUSAL:
+            # index[i, j] = query position - key position
+            index = offs_m_curr[:, None] - offs_kvn[None, :]
+            if USE_DECAY:
+                S_block_ptr = S + off_h * stride_sh
+                s = tl.load(S_block_ptr)
+                s_index = s * index
+                # exp(-s * index) where s * index >= 0, else exp(-inf) = 0
+                s_index = tl.where(s_index >= 0, -s_index, float("-inf"))
+                # s is rebound from the scalar decay rate to the weight
+                # matrix; it is reused for dp below
+                s = tl.exp(s_index)
+                qk = qk * s
+            else:
+                qk = tl.where(index >= 0, qk, 0)
+        p = qk
+        # compute dv
+        do = tl.load(do_ptrs, mask=offs_m_curr[:, None] < N_CTX, other=0.0)
+        dv += tl.dot(tl.trans(p.to(do.dtype)), do)
+        # dp = do @ v^T, then apply the same weight / mask as the forward pass
+        dp = tl.dot(do, tl.trans(v).to(do.dtype))
+        if CAUSAL:
+            if USE_DECAY:
+                dp = dp * s
+            else:
+                dp = tl.where(index >= 0, dp, 0)
+        dk += tl.dot(tl.trans(dp.to(q.dtype)), q).to(tl.float32)
+        # increment pointers
+        q_ptrs += BLOCK_M * stride_qm
+        do_ptrs += BLOCK_M * stride_om
+    # write-back (rows >= N_CTX are not written)
+    dv_ptrs = DV + (offs_kvn[:, None] * stride_vn +
+                    offs_ve[None, :] * stride_ve)
+    dk_ptrs = DK + (offs_kvn[:, None] * stride_kn +
+                    offs_qkk[None, :] * stride_kk)
+    tl.store(dv_ptrs, dv, mask=offs_kvn[:, None] < N_CTX)
+    tl.store(dk_ptrs, dk, mask=offs_kvn[:, None] < N_CTX)
+@triton.jit
+def _bwd_kernel_q(
+    Q,
+    K,
+    V,
+    S,
+    DO,
+    DQ,
+    DK,
+    DV,
+    stride_qz,
+    stride_qh,
+    stride_qm,
+    stride_qk,
+    stride_kz,
+    stride_kh,
+    stride_kn,
+    stride_kk,
+    stride_vz,
+    stride_vh,
+    stride_vn,
+    stride_ve,
+    stride_oz,
+    stride_oh,
+    stride_om,
+    stride_oe,
+    stride_sh,
+    Z,
+    H,
+    N_CTX,
+    num_block,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL_QK: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_DMODEL_V: tl.constexpr,
+    CAUSAL: tl.constexpr,
+    USE_DECAY: tl.constexpr,
+):
+    # Backward kernel for dQ. Each program owns one BLOCK_M-row block of
+    # queries for one (batch, head) slice and accumulates
+    #     dq += (w * (do @ v^T)) @ k
+    # over key/value blocks, where w is the forward mask / decay weight.
+    # Q, DK and DV are accepted but never read or written in this kernel.
+    # grid axis 0: query-row block, grid axis 1: flattened batch * head index
+    start_m = tl.program_id(0)
+    off_hz = tl.program_id(1)
+    off_z = off_hz // H
+    off_h = off_hz % H
+    # offset pointers for batch/head
+    K += off_z * stride_kz + off_h * stride_kh
+    V += off_z * stride_vz + off_h * stride_vh
+    DO += off_z * stride_oz + off_h * stride_oh
+    DQ += off_z * stride_qz + off_h * stride_qh
+    # feature offset
+    offs_qkk = tl.arange(0, BLOCK_DMODEL_QK)
+    offs_ve = tl.arange(0, BLOCK_DMODEL_V)
+    # row index within a block
+    offs_m = tl.arange(0, BLOCK_M)
+    # absolute query rows handled by this program
+    offs_qm = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    # do
+    do_ptrs = DO + (offs_qm[:, None] * stride_om +
+                    offs_ve[None, :] * stride_oe)
+    dq_ptrs = DQ + (offs_qm[:, None] * stride_qm +
+                    offs_qkk[None, :] * stride_qk)
+    # do is loaded once and reused for every key block; rows >= N_CTX read as 0
+    do = tl.load(do_ptrs, mask=offs_qm[:, None] < N_CTX, other=0.0)
+    dq = tl.zeros([BLOCK_M, BLOCK_DMODEL_QK], dtype=tl.float32)
+    # lo and hi are computed but not used: the loop below visits all
+    # num_block key blocks and relies on the mask / decay weight to zero the
+    # contribution of keys after the query in the causal case
+    lo = 0
+    hi = (start_m + 1) * BLOCK_M if CAUSAL else N_CTX
+    # same values as offs_qm
+    offs_m_curr = start_m * BLOCK_M + offs_m
+    for start_n in range(0, num_block):
+        offs_kvn = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
+        k_ptrs = K + (offs_kvn[:, None] * stride_kn +
+                      offs_qkk[None, :] * stride_kk)
+        v_ptrs = V + (offs_kvn[:, None] * stride_vn +
+                      offs_ve[None, :] * stride_ve)
+        # load the current key/value block; rows >= N_CTX read as 0
+        k = tl.load(k_ptrs, mask=offs_kvn[:, None] < N_CTX, other=0.0)
+        v = tl.load(v_ptrs, mask=offs_kvn[:, None] < N_CTX, other=0.0)
+        # dp = do vT
+        dp = tl.dot(do, tl.trans(v).to(do.dtype))
+        if CAUSAL:
+            # index[i, j] = query position - key position
+            index = offs_m_curr[:, None] - offs_kvn[None, :]
+            if USE_DECAY:
+                S_block_ptr = S + off_h * stride_sh
+                s = tl.load(S_block_ptr)
+                s_index = s * index
+                # exp(-s * index) where s * index >= 0, else exp(-inf) = 0
+                s_index = tl.where(s_index >= 0, -s_index, float("-inf"))
+                s = tl.exp(s_index)
+                dp = dp * s
+            else:
+                dp = tl.where(index >= 0, dp, 0)
+        # dq = dq + dp k
+        dq += tl.dot(dp.to(k.dtype), k)
+    # rows >= N_CTX are not written
+    tl.store(dq_ptrs, dq, mask=offs_qm[:, None] < N_CTX)
+class _attention(torch.autograd.Function):
+    """Autograd function that dispatches to the Triton attention kernels.
+
+    ``forward`` launches ``_fwd_kernel``; ``backward`` launches
+    ``_bwd_kernel_kv`` and then ``_bwd_kernel_q``.
+    """
+    @staticmethod
+    def forward(ctx, q, k, v, causal, s):
+        """Compute the attention output with ``_fwd_kernel``.
+
+        Args:
+            q, k, v: input tensors; four dimensions are indexed below.
+                NOTE(review): presumably (batch, heads, seq_len, dim) --
+                confirm against the callers.
+            causal: forwarded to the kernel as ``IS_CAUSAL``.
+            s: decay tensor; decay is enabled only when ``s.shape[0] > 0``.
+
+        Returns:
+            Tensor of shape ``(q.shape[0], q.shape[1], q.shape[2], v.shape[-1])``
+            with the dtype and device of ``q``.
+
+        Raises:
+            RuntimeError: if the current CUDA device reports a compute
+                capability major version below 8.
+        """
+        # strides are passed to the kernels explicitly, so make the layout dense first
+        q = q.contiguous()
+        k = k.contiguous()
+        v = v.contiguous()
+        s = s.contiguous()
+        # only support for Ampere now
+        # NOTE(review): this queries the *current* CUDA device, not q.device.
+        capability = torch.cuda.get_device_capability()
+        if capability[0] < 8:
+            raise RuntimeError(
+                "Lightning attention currently only supported for compute capability >= 80"
+            )
+        # shape constraints
+        # NOTE(review): Lq is never read afterwards.
+        Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
+        # right
+        o = torch.empty(
+            (q.shape[0], q.shape[1], q.shape[2], v.shape[-1]),
+            dtype=q.dtype,
+            device=q.device,
+        )
+        BLOCK_M = 128
+        BLOCK_N = 64
+        num_warps = 4 if Lk <= 64 else 8
+        num_stages = 1
+        # one program per (BLOCK_M rows of q.shape[2], q.shape[0] * q.shape[1])
+        grid = (triton.cdiv(q.shape[2], BLOCK_M), q.shape[0] * q.shape[1], 1)
+        # an empty ``s`` switches the decay term off
+        use_decay = s.shape[0] > 0
+        _fwd_kernel[grid](
+            q,
+            k,
+            v,
+            o,
+            s,
+            q.stride(0),
+            q.stride(1),
+            q.stride(2),
+            q.stride(3),
+            k.stride(0),
+            k.stride(1),
+            k.stride(2),
+            k.stride(3),
+            v.stride(0),
+            v.stride(1),
+            v.stride(2),
+            v.stride(3),
+            o.stride(0),
+            o.stride(1),
+            o.stride(2),
+            o.stride(3),
+            s.stride(0),
+            q.shape[0],
+            q.shape[1],
+            q.shape[2],
+            BLOCK_M=BLOCK_M,
+            BLOCK_DMODEL_QK=Lk,
+            BLOCK_N=BLOCK_N,
+            BLOCK_DMODEL_V=Lv,
+            IS_CAUSAL=causal,
+            USE_DECAY=use_decay,
+            num_warps=num_warps,
+            num_stages=num_stages,
+        )
+        # keep the inputs and launch parameters for backward
+        ctx.save_for_backward(q, k, v, s)
+        ctx.grid = grid
+        ctx.BLOCK_M = BLOCK_M
+        ctx.BLOCK_DMODEL_QK = Lk
+        ctx.BLOCK_N = BLOCK_N
+        ctx.BLOCK_DMODEL_V = Lv
+        ctx.causal = causal
+        ctx.use_decay = use_decay
+        return o
+    @staticmethod
+    def backward(ctx, do):
+        """Compute gradients for ``q``, ``k`` and ``v`` from ``do``.
+
+        Returns:
+            ``(dq, dk, dv, None, None)``: ``causal`` and ``s`` receive no
+            gradient. ``dq`` is accumulated in float32 and cast back to the
+            dtype of ``q`` on return.
+        """
+        q, k, v, s = ctx.saved_tensors
+        # backward uses its own, smaller tile sizes than forward
+        BLOCK_M = 32
+        BLOCK_N = 32
+        num_warps = 4
+        num_stages = 1
+        do = do.contiguous()
+        # dq starts zeroed in float32; dk and dv are left uninitialised here
+        # NOTE(review): presumably _bwd_kernel_kv writes every element of dk/dv -- confirm.
+        dq = torch.zeros_like(q, dtype=torch.float32)
+        dk = torch.empty_like(k)
+        dv = torch.empty_like(v)
+        grid_kv = (triton.cdiv(k.shape[2],
+                               BLOCK_N), k.shape[0] * k.shape[1], 1)
+        _bwd_kernel_kv[grid_kv](
+            q,
+            k,
+            v,
+            s,
+            do,
+            dq,
+            dk,
+            dv,
+            q.stride(0),
+            q.stride(1),
+            q.stride(2),
+            q.stride(3),
+            k.stride(0),
+            k.stride(1),
+            k.stride(2),
+            k.stride(3),
+            v.stride(0),
+            v.stride(1),
+            v.stride(2),
+            v.stride(3),
+            do.stride(0),
+            do.stride(1),
+            do.stride(2),
+            do.stride(3),
+            s.stride(0),
+            q.shape[0],
+            q.shape[1],
+            q.shape[2],
+            grid_kv[0],
+            BLOCK_M=BLOCK_M,
+            BLOCK_DMODEL_QK=ctx.BLOCK_DMODEL_QK,
+            BLOCK_N=BLOCK_N,
+            BLOCK_DMODEL_V=ctx.BLOCK_DMODEL_V,
+            CAUSAL=ctx.causal,
+            USE_DECAY=ctx.use_decay,
+            num_warps=num_warps,
+            num_stages=num_stages,
+        )
+        # second launch, tiled over the rows of q
+        grid_q = (triton.cdiv(q.shape[2], BLOCK_M), q.shape[0] * q.shape[1], 1)
+        _bwd_kernel_q[grid_q](
+            q,
+            k,
+            v,
+            s,
+            do,
+            dq,
+            dk,
+            dv,
+            q.stride(0),
+            q.stride(1),
+            q.stride(2),
+            q.stride(3),
+            k.stride(0),
+            k.stride(1),
+            k.stride(2),
+            k.stride(3),
+            v.stride(0),
+            v.stride(1),
+            v.stride(2),
+            v.stride(3),
+            do.stride(0),
+            do.stride(1),
+            do.stride(2),
+            do.stride(3),
+            s.stride(0),
+            q.shape[0],
+            q.shape[1],
+            q.shape[2],
+            grid_q[0],
+            BLOCK_M=BLOCK_M,
+            BLOCK_DMODEL_QK=ctx.BLOCK_DMODEL_QK,
+            BLOCK_N=BLOCK_N,
+            BLOCK_DMODEL_V=ctx.BLOCK_DMODEL_V,
+            CAUSAL=ctx.causal,
+            USE_DECAY=ctx.use_decay,
+            num_warps=num_warps,
+            num_stages=num_stages,
+        )
+        # one entry per forward argument: q, k, v, causal, s
+        return dq.to(q.dtype), dk, dv, None, None
+# functional entry point for the autograd function above
+attention = _attention.apply
+def lightning_attention(q, k, v, causal, ed):
+    d = q.shape[-1]
+    e = v.shape[-1]
+    # arr = f(d)
+    if d >= 128:
+        m = 128
+    else:
+        m = 64
+    arr = [m * i for i in range(d // m + 1)]
+    if arr[-1] != d:
+        arr.append(d)
+    n = len(arr)
+    output = 0
+    for i in range(n - 1):
+        s = arr[i]
+        e = arr[i + 1]
+        q1 = q[..., s:e]
+        k1 = k[..., s:e]
+        o = attention(q1, k1, v, causal, ed)
+        output = output + o
+    return output

modeling_transnormer.py ADDED Viewed

	@@ -0,0 +1,943 @@

+#    Copyright 2024 OpenNLPLab
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+# coding=utf-8
+""" PyTorch Transnormer model."""
+import math
+import os
+from typing import List, Optional, Tuple, Union
+from einops import rearrange
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_transnormer import TransnormerConfig
+from .norm import SimpleRMSNorm as SimpleRMSNorm_torch
+from .srmsnorm_triton import SimpleRMSNorm as SimpleRMSNorm_triton
+from .utils import (
+    get_activation_fn,
+    get_norm_fn,
+    logging_info,
+    print_module,
+    print_params,
+)
+# module-level logger from transformers' logging utilities
+logger = logging.get_logger(__name__)
+_CONFIG_FOR_DOC = "TransnormerConfig"
+# Feature flags read from the environment at import time.
+# NOTE(review): eval() executes whatever expression the variable contains; it is
+# only safe when the environment is trusted. Parsing an explicit set of
+# true/false strings would avoid running arbitrary code.
+use_triton = eval(os.environ.get("use_triton", default="True"))
+debug = eval(os.environ.get("debug", default="False"))
+# The Triton kernel is optional: a failed import silently selects the
+# non-Triton code paths instead of raising.
+if use_triton:
+    try:
+        from .lightning_attention2 import lightning_attention
+        has_lightning_attention = True
+    except (ImportError, ModuleNotFoundError):
+        has_lightning_attention = False
+else:
+    has_lightning_attention = False
+if debug:
+    logger.info(f"Use triton: {use_triton}")
+    logger.info(f"Use lightning attention: {has_lightning_attention}")
+    logger.info(f"Debug mode: {debug}, {type(debug)}")
+if not has_lightning_attention:
+    def linear_attention(q, k, v, attn_mask):
+        energy = torch.einsum("... n d, ... m d -> ... n m", q, k)
+        energy = energy * attn_mask
+        output = torch.einsum("... n m, ... m d -> ... n d", energy, v)
+        return output
+########## start Transnormer
+##### Linearized Relative Positional Encoding: https://openreview.net/forum?id=xoLyps2qWc&referrer=%5BAuthor%20Console%5D(%2Fgroup%3Fid%3DTMLR%2FAuthors%23your-submissions)
+class Lrpe(nn.Module):
+    def __init__(
+        self,
+        num_heads=8,
+        embed_dim=64,
+    ):
+        super().__init__()
+        d = num_heads * embed_dim
+        self.index = torch.empty(0)
+        self.theta = nn.Parameter(
+            10000 ** (-2 / d * torch.arange(d)).reshape(num_heads, 1, -1)
+        )
+    def extra_repr(self):
+        return print_module(self)
+    def forward(self, x, offset=0):
+        # x: b, h, n, d
+        # offset: for k, v cache
+        n = x.shape[-2]
+        if self.index.shape[0] < n:
+            self.index = torch.arange(n).reshape(1, -1, 1).to(x)
+        index = self.index[:, :n] + offset
+        theta = self.theta * index
+        x = torch.concat([x * torch.cos(theta), x * torch.sin(theta)], dim=-1)
+        return x
+class GLU(nn.Module):
+    def __init__(self, d1, d2, bias=False):
+        super().__init__()
+        if debug:
+            # get local varables
+            params = locals()
+            # print params
+            print_params(**params)
+        self.l1 = nn.Linear(d1, d2, bias=bias)
+        self.l2 = nn.Linear(d1, d2, bias=bias)
+        self.l3 = nn.Linear(d2, d1, bias=bias)
+    def forward(self, x):
+        o1 = self.l1(x)
+        o2 = self.l2(x)
+        output = o1 * o2
+        output = self.l3(output)
+        return output
+class NormLinearAttention(nn.Module):
+    def __init__(
+        self,
+        embed_dim,
+        hidden_dim,
+        num_heads,
+        gate_dim=16,
+        linear_act_fun="silu",
+        norm_type="simplermsnorm",
+        linear_use_lrpe=False,
+        bias=False,
+    ):
+        super().__init__()
+        if debug:
+            # get local varables
+            params = locals()
+            # print params
+            print_params(**params)
+        self.out_proj = nn.Linear(hidden_dim, embed_dim, bias=bias)
+        self.act = get_activation_fn(linear_act_fun)
+        self.num_heads = num_heads
+        self.embed_dim = embed_dim
+        self.head_dim = self.embed_dim // self.num_heads
+        self.norm = get_norm_fn(norm_type)(hidden_dim)
+        self.linear_use_lrpe = linear_use_lrpe
+        if self.linear_use_lrpe:
+            self.lrpe = Lrpe(
+                num_heads=self.num_heads,
+                embed_dim=self.head_dim,
+            )
+        self.qkv_proj = nn.Linear(embed_dim, 3 * hidden_dim, bias=bias)
+        self.output_gate =  nn.Sequential(
+            nn.Linear(embed_dim, gate_dim, bias=bias),
+            nn.Linear(gate_dim, hidden_dim, bias=bias),
+        )
+        # for inference only
+        self.offset = 0
+    def forward(
+        self,
+        x,
+        attn_mask: Optional[torch.Tensor] = None,  # (b, h, n, m)
+        attn_padding_mask: Optional[torch.Tensor] = None,  # (b, m)
+        output_attentions: bool = False,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        use_cache: bool = False,
+        slope_rate: Optional[torch.Tensor] = None,
+    ):
+        do_eval = eval(os.environ.get("do_eval", default="False"))
+        if (not self.training) and (not do_eval):
+            return self.inference(
+                x,
+                attn_mask,
+                attn_padding_mask,
+                output_attentions,
+                past_key_value,
+                use_cache,
+                slope_rate,
+            )
+        # x: b n d
+        b, n, d = x.shape
+        # linear map
+        qkv = self.act(self.qkv_proj(x))
+        q, k, v = qkv.split([d, d, d], dim=-1)
+        # reshape
+        q, k, v = map(
+            lambda x: rearrange(x, "b n (h d) -> b h n d", h=self.num_heads), [q, k, v]
+        )
+        q_offset = 0
+        # lrpe relys on position, get cache first
+        if past_key_value is not None:
+            # reuse k, v, for evaluation only
+            k = torch.cat([past_key_value[0], k], dim=-2)
+            v = torch.cat([past_key_value[1], v], dim=-2)
+            q_offset = past_key_value[0].shape[-2]
+        past_key_value = (k, v) if use_cache else None
+        # lrpe
+        if self.linear_use_lrpe:
+            q = self.lrpe(q, offset=q_offset)
+            k = self.lrpe(k)
+        if attn_padding_mask is not None:
+            v = v.masked_fill(
+                (1 - attn_padding_mask).unsqueeze(1).unsqueeze(-1).to(torch.bool), 0
+            )
+        if not has_lightning_attention:
+            if attn_mask == None:
+                attn_mask = (torch.tril(torch.ones(n, n))).to(q)
+            if slope_rate != None:
+                attn_mask = torch.exp(slope_rate * attn_mask)
+            output = linear_attention(q, k, v, attn_mask)
+        else:
+            output = lightning_attention(
+                q, k, v, True, slope_rate.squeeze(-1).squeeze(-1)
+            )
+        # reshape
+        output = rearrange(output, "b h n d -> b n (h d)")
+        # normalize
+        output = self.norm(output)
+        # gate
+        output = F.sigmoid(self.output_gate(x)) * output
+        # outproj
+        output = self.out_proj(output)
+        if not output_attentions:
+            attn_weights = None
+        else:
+            attn_weights = torch.einsum("... n d, ... m d -> ... n m", q, k)
+        return output, attn_weights, past_key_value
+    def inference(
+        self,
+        x,
+        attn_mask: Optional[torch.Tensor] = None,  # (b, h, n, m)
+        attn_padding_mask: Optional[torch.Tensor] = None,  # (b, m)
+        output_attentions: bool = False,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        use_cache: bool = False,
+        slope_rate: Optional[torch.Tensor] = None,  # (h, 1, 1)
+    ):
+        # x: b n d
+        b, n, d = x.shape
+        # linear map
+        qkv = self.act(self.qkv_proj(x))
+        q, k, v = qkv.split([d, d, d], dim=-1)
+        # reshape
+        q, k, v = map(
+            lambda x: rearrange(x, "b n (h d) -> b h n d", h=self.num_heads), [q, k, v]
+        )
+        # rpe
+        if self.linear_use_lrpe:
+            q = self.lrpe(q, offset=self.offset)
+            k = self.lrpe(k)
+        if past_key_value == None:
+            self.offset = q.shape[-2]
+        else:
+            self.offset += 1
+        ratio = torch.exp(-slope_rate)
+        # only use for the first time
+        if past_key_value == None:
+            if attn_mask == None:
+                attn_mask = (torch.tril(torch.ones(n, n))).to(q)
+            if slope_rate != None:
+                attn_mask = torch.exp(slope_rate * attn_mask)
+            if attn_padding_mask is not None:
+                attn_mask = attn_mask.masked_fill(
+                    (1 - attn_padding_mask).unsqueeze(1).unsqueeze(2).to(torch.bool),
+                    0,
+                )
+            energy = torch.einsum("... n d, ... m d -> ... n m", q, k)
+            if attn_mask != None:
+                energy = energy * attn_mask
+            output = torch.einsum("... n m, ... m d -> ... n d", energy, v)
+            eval_and_not_generate = eval(
+                os.environ.get("eval_and_not_generate", default="False")
+            )
+            if eval_and_not_generate:
+                kv = None
+            else:
+                # b, h, n, e, d
+                kv_outproduct = torch.einsum("... n e, ... n d -> ... n e d", k, v)
+                # 1, 1, n, 1, 1
+                index = torch.arange(n - 1, -1, -1).reshape(1, 1, -1, 1, 1).to(x)
+                # (h, 1, 1) -> (1, h, 1, 1, 1); (1, h, 1, 1, 1), (1, 1, n, 1, 1) -> (1, h, n, 1, 1)
+                decay = ratio.unsqueeze(0).unsqueeze(-1) ** index
+                kv_outproduct_with_decay = kv_outproduct * decay
+                kv = torch.sum(kv_outproduct_with_decay, dim=-3)
+        else:
+            kv = past_key_value
+            output = []
+            for i in range(n):
+                kv = ratio * kv + torch.einsum(
+                    "... n d, ... n e -> ... d e",
+                    k[:, :, i : i + 1],
+                    v[:, :, i : i + 1],
+                )
+                qkv = torch.einsum(
+                    "... n e, ... e d -> ... n d", q[:, :, i : i + 1], kv
+                )
+                output.append(qkv)
+            output = torch.concat(output, dim=-2)
+        # reshape
+        output = rearrange(output, "b h n d -> b n (h d)")
+        # normalize
+        output = self.norm(output)
+        # gate
+        output = F.sigmoid(self.output_gate(x)) * output
+        # outproj
+        output = self.out_proj(output)
+        attn_weights = None
+        return output, attn_weights, kv
+class TransnormerDecoderLayer(nn.Module):
+    def __init__(self, config: TransnormerConfig):
+        super().__init__()
+        self.embed_dim = config.decoder_embed_dim
+        ##### normalize
+        norm_type = config.norm_type
+        if debug:
+            logging_info(f"Decoder Norm Type: {norm_type}")
+        self.token_norm = get_norm_fn(norm_type)(self.embed_dim)
+        self.channel_norm = get_norm_fn(norm_type)(self.embed_dim)
+        ##### token mixer
+        self.token_mixer = self.build_token_mixer(
+            self.embed_dim,
+            config,
+        )
+        ##### channel mixer
+        self.glu_dim = config.glu_dim
+        if self.glu_dim == -1:
+            self.glu_dim = self.embed_dim
+        bias = config.bias
+        self.channel_mixer = GLU(self.embed_dim, self.glu_dim, bias)
+    def build_token_mixer(self, embed_dim, config):
+        return NormLinearAttention(
+            embed_dim=embed_dim,
+            hidden_dim=config.hidden_dim,
+            num_heads=config.decoder_attention_heads,
+            gate_dim=config.gate_dim,
+            linear_act_fun=config.linear_act_fun,
+            norm_type=config.norm_type,
+            linear_use_lrpe=config.linear_use_lrpe,
+            bias=config.bias,
+        )
+    def residual_connection(self, x, residual):
+        return residual + x
+    def forward(
+        self,
+        x,
+        attn_mask: Optional[torch.Tensor] = None,
+        attn_padding_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        slope_rate: Optional[torch.Tensor] = None,  # (h, 1, 1)
+    ):
+        residual = x
+        x = self.token_norm(x)
+        x, self_attn_weights, present_key_value = self.token_mixer(
+            x=x,
+            attn_mask=attn_mask,
+            attn_padding_mask=attn_padding_mask,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            slope_rate=slope_rate,
+        )
+        x = self.residual_connection(x, residual)
+        residual = x
+        x = self.channel_norm(x)
+        x = self.channel_mixer(x)
+        x = self.residual_connection(x, residual)
+        outputs = (x,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        if use_cache:
+            outputs += (present_key_value,)
+        return outputs
+# Docstring text handed to the add_start_docstrings decorators on the model classes below.
+TRANSNORMER_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+    Parameters:
+        config ([`TransnormerConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+@add_start_docstrings(
+    TRANSNORMER_START_DOCSTRING,
+)
+class TransnormerPreTrainedModel(PreTrainedModel):
+    config_class = TransnormerConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["TransnormerDecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
+    def _init_weights(self, module):
+        std = self.config.init_std
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, TransnormerModel):
+            module.gradient_checkpointing = value
+# Argument documentation attached to the forward() methods via
+# add_start_docstrings_to_model_forward.
+# NOTE(review): the text describes `attn_mask` and `position_ids`, while
+# TransnormerModel.forward takes `attn_padding_mask` and has no `position_ids`
+# parameter -- the template looks inherited from another model and should be
+# reconciled with the actual signatures.
+TRANSNORMER_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            [What are input IDs?](../glossary#input-ids)
+        attn_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attn_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+@add_start_docstrings(
+    TRANSNORMER_START_DOCSTRING,
+)
+class TransnormerModel(TransnormerPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`TransnormerDecoderLayer`]
+    Args:
+        config: TransnormerConfig
+    """
+    def __init__(self, config: TransnormerConfig):
+        super().__init__(config)
+        # hf origin
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.gradient_checkpointing = False
+        # mask
+        self._linear_attn_mask = torch.empty(0)
+        # config
+        self.linear_use_lrpe_list = config.linear_use_lrpe_list
+        self.num_layers = config.decoder_layers
+        # h, 1, 1
+        self.slopes = self._build_slope_tensor(config.decoder_attention_heads)
+        # params
+        self.embed_tokens = nn.Embedding(
+            config.vocab_size, config.decoder_embed_dim, self.padding_idx
+        )
+        self.layers = nn.ModuleList([])
+        for i in range(config.decoder_layers):
+            if len(self.linear_use_lrpe_list) > 0:
+                config.linear_use_lrpe = self.linear_use_lrpe_list[i]
+            self.layers.append(TransnormerDecoderLayer(config))
+        self.final_norm = get_norm_fn(config.norm_type)(config.decoder_embed_dim)
+        self.embed_dim = config.decoder_embed_dim
+        self.embed_scale = (
+            1.0 if config.no_scale_embedding else math.sqrt(self.embed_dim)
+        )
+        # Initialize weights and apply final processing
+        self.post_init()
+    @staticmethod
+    def _build_slope_tensor(n_attention_heads: int):
+        def get_slopes(n):
+            def get_slopes_power_of_2(n):
+                start = 2 ** (-(2 ** -(math.log2(n) - 3)))
+                ratio = start
+                return [start * ratio**i for i in range(n)]
+            if math.log2(n).is_integer():
+                return get_slopes_power_of_2(
+                    n
+                )  # In the paper, we only train models that have 2^a heads for some a. This function has
+            else:  # some good properties that only occur when the input is a power of 2. To maintain that even
+                closest_power_of_2 = 2 ** math.floor(
+                    math.log2(n)
+                )  # when the number of heads is not a power of 2, we use this workaround.
+                return (
+                    get_slopes_power_of_2(closest_power_of_2)
+                    + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
+                )
+        # h, 1, 1
+        slopes = torch.tensor(get_slopes(n_attention_heads)).reshape(
+            n_attention_heads, 1, 1
+        )
+        return slopes
+    def extra_repr(self):
+        return print_module(self)
+    def get_input_embeddings(self):
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+    def _prepare_decoder_linear_attn_mask(
+        self, input_shape, inputs_embeds, past_key_values_length
+    ):
+        bsz, tgt_len = input_shape
+        src_len = tgt_len + past_key_values_length
+        def power_log(x):
+            return 2 ** (math.ceil(math.log(x, 2)))
+        n = power_log(max(tgt_len, src_len))
+        if self._linear_attn_mask.shape[-1] < n:
+            def get_mask(n):
+                mask = torch.triu(torch.zeros(n, n).float().fill_(float("-inf")), 1)
+                # no slope version
+                # -n, ..., -2, -1, 0
+                for i in range(n):
+                    x = torch.arange(i + 1)
+                    y = x
+                    mask[i, : i + 1] = -torch.flip(y, [0])
+                return mask
+            arr = []
+            for slope in self.slopes:
+                arr.append(get_mask(n))
+            self._linear_attn_mask = torch.stack(arr, dim=0).to(inputs_embeds)
+        linear_attn_mask = self._linear_attn_mask[:, -tgt_len:, -src_len:]
+        num_heads = linear_attn_mask.shape[0]
+        return linear_attn_mask[None, :, :, :].expand(bsz, num_heads, tgt_len, src_len)
+    @add_start_docstrings_to_model_forward(TRANSNORMER_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attn_padding_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
+            )
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError(
+                "You have to specify either decoder_input_ids or decoder_inputs_embeds"
+            )
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+        if past_key_values is not None:
+            past_key_values_length = past_key_values[0][0].shape[-2]
+            seq_length_with_past = seq_length_with_past + past_key_values_length
+        if inputs_embeds is None:
+            # !!! use embed_scale
+            inputs_embeds = self.embed_scale * self.embed_tokens(input_ids)
+        hidden_states = inputs_embeds
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+        ##### norm linear layers
+        linear_attn_padding_mask = attn_padding_mask
+        linear_attn_mask = self._prepare_decoder_linear_attn_mask(
+            (batch_size, seq_length), inputs_embeds, past_key_values_length
+        )
+        slope_rates = [self.slopes.to(input_ids.device) for _ in range(self.num_layers)]
+        for idx, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            past_key_value = (
+                past_key_values[idx] if past_key_values is not None else None
+            )
+            slope_rate = slope_rates[idx]
+            slope_rate = slope_rate * (1 - idx / (self.num_layers - 1) + 1e-5)
+            mask = linear_attn_mask
+            layer_outputs = layer(
+                hidden_states,
+                attn_mask=mask,
+                attn_padding_mask=linear_attn_padding_mask,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                slope_rate=slope_rate,
+            )
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+            # if idx == 0:
+            #     break
+        hidden_states = self.final_norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
+                if v is not None
+            )
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+class TransnormerForCausalLM(TransnormerPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = TransnormerModel(config)
+        if debug:
+            logging_info(self.model)
+        # the lm_head weight is automatically tied to the embed tokens weight
+        self.lm_head = nn.Linear(
+            config.decoder_embed_dim, config.vocab_size, bias=False
+        )
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    def set_decoder(self, decoder):
+        self.model = decoder
+    def get_decoder(self):
+        return self.model
+    @add_start_docstrings_to_model_forward(TRANSNORMER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(
+        output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
+    )
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        Returns:
+        Example:
+        ```python
+        >>> from transformers import AutoTokenizer, TransnormerForCausalLM
+        >>> model = TransnormerForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+        >>> prompt = "Hey, are you consciours? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
+        ```"""
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attn_padding_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        **kwargs,
+    ):
+        if past_key_values:
+            input_ids = input_ids[:, -1:]
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(
+                    past_state.index_select(0, beam_idx) for past_state in layer_past
+                ),
+            )
+        return reordered_past

norm.py ADDED Viewed

	@@ -0,0 +1,44 @@

+#    Copyright 2024 OpenNLPLab
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+# coding=utf-8
+import logging
+import os
+import sys
+import torch
+from torch import nn
+# Configure the root logger when this module is imported: timestamped records go to stdout
+# and the level is read from the LOGLEVEL environment variable (default "INFO").
+logging.basicConfig(
+    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=os.environ.get("LOGLEVEL", "INFO").upper(),
+    stream=sys.stdout,
+)
+# Module logger named "srmsnorm"; the class below does not log through it.
+logger = logging.getLogger("srmsnorm")
+class SimpleRMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+    def _norm(self, x):
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+    def forward(self, x):
+        output = self._norm(x.float()).type_as(x)
+        return output

srmsnorm_triton.py ADDED Viewed

	@@ -0,0 +1,202 @@

+# CREDITS: This comes almost as-is from the Triton layer norm tutorial
+# https://github.com/openai/triton/blob/master/python/tutorials/05-layer-norm.py
+#    Copyright 2024 OpenNLPLab
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+# coding=utf-8
+import torch
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+# Forward kernel: one program per row (program id 0 selects the row).
+#   X, Y   -- input / output pointers; both are addressed as `row * stride + col`
+#   V      -- pointer receiving one reciprocal RMS value per row
+#   N      -- number of valid columns; columns >= N are masked out
+#   eps    -- added to the mean of squares before the square root
+#   BLOCK_SIZE_N -- compile-time number of columns handled by a program
+# fmt: off
+@triton.jit
+def srms_norm_fw(X, Y, V, stride, N, eps, BLOCK_SIZE_N: tl.constexpr):
+    # fmt: on
+    row = tl.program_id(0)
+    cols = tl.arange(0, BLOCK_SIZE_N)
+    mask = cols < N
+    # Move to this row
+    x_ptrs = X + row * stride + cols
+    # Load the row as float32; masked-out columns read as 0
+    x = tl.load(x_ptrs, mask=mask, other=0.0).to(tl.float32)
+    x_zm = tl.where(mask, x, 0.0)
+    # Mean of squares over the N valid columns (no mean is subtracted)
+    x_var = tl.sum(x_zm * x_zm, axis=0) / N
+    rstd = 1.0 / tl.sqrt(x_var + eps)
+    # Normalize: scale by the reciprocal RMS (no affine weight or bias is applied)
+    y = x_zm * rstd
+    # Keep rstd for the backward kernel
+    tl.store(V + row, rstd)
+    y_ptrs = Y + row * stride + cols
+    tl.store(y_ptrs, y, mask=mask)
+# Backward pass: computes DX only (this kernel has no weight or bias gradient).
+# One program per row. DX, DY and X are all addressed as `row * stride + col`,
+# so the three buffers are expected to share the same row stride.
+#   V -- per-row reciprocal RMS written by the forward kernel
+#   N -- number of valid columns; columns >= N are masked out
+# fmt: off
+@triton.jit
+def srms_norm_bwd_dx_fused(
+    DX, DY,
+    X, V,
+    stride, N,
+    # META-parameters
+    BLOCK_SIZE_N: tl.constexpr,
+):
+    # fmt: on
+    # position of elements processed by this program
+    row = tl.program_id(0)
+    cols = tl.arange(0, BLOCK_SIZE_N)
+    mask = cols < N
+    # offset data pointers to start at the row of interest
+    x_ptrs = X + row * stride + cols
+    dy_ptrs = DY + row * stride + cols
+    # load the row of x and dy, plus the saved rstd (masked-out columns read as 0)
+    x = tl.load(x_ptrs, mask=mask, other=0)
+    dy = tl.load(dy_ptrs, mask=mask, other=0)
+    rstd = tl.load(V + row)
+    # compute dx = (dy - xhat * mean(xhat * dy)) * rstd, with xhat = x * rstd
+    xhat = x * rstd
+    wdy = dy
+    xhat = tl.where(mask, xhat, 0.)
+    wdy = tl.where(mask, wdy, 0.)
+    mean1 = tl.sum(xhat * wdy, axis=0) / N
+    dx = (wdy - (xhat * mean1)) * rstd
+    # write-back dx
+    mask = cols < N  # re-materialize the mask to save registers
+    dx_ptrs = DX + row * stride + cols
+    tl.store(dx_ptrs, dx, mask=mask)
+class _SrmsNorm(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x, eps):
+        # catch eps being too small if the tensors are fp16
+        if x.dtype == torch.float16:
+            eps = max(eps, 1.6e-5)
+        # allocate output
+        y = torch.empty_like(x)
+        # reshape input data into 2D tensor
+        x_arg = x.reshape(-1, x.shape[-1])
+        M, N = x_arg.shape
+        # allocate mean and std, they'll be used in the backward pass
+        rstd = torch.empty((M, ), dtype=torch.float32, device=x.device)
+        # Less than 64KB per feature: enqueue fused kernel
+        MAX_FUSED_SIZE = 65536 // x.element_size()
+        BLOCK_SIZE_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
+        if N > BLOCK_SIZE_N:
+            raise RuntimeError(
+                "This layer norm doesn't support feature dim >= 64KB.")
+        if not x_arg.is_contiguous() or not y.is_contiguous():
+            x_arg = x_arg.contiguous()
+            y = y.contiguous()
+        # heuristics for number of warps.
+        num_warps = min(max(BLOCK_SIZE_N // 256, 1), 16)
+        # enqueue kernel
+        # fmt: off
+        srms_norm_fw[(M,)](
+            x_arg, y, rstd,
+            x_arg.stride(0),
+            N,
+            eps,
+            num_warps=num_warps,
+            BLOCK_SIZE_N=BLOCK_SIZE_N,
+        )
+        # fmt: on
+        ctx.save_for_backward(x, rstd)
+        ctx.BLOCK_SIZE_N = BLOCK_SIZE_N
+        ctx.num_warps = num_warps
+        return y.reshape_as(x)
+    @staticmethod
+    def backward(
+        ctx, dy
+    ):  # pragma: no cover  # this is covered, but called directly from C++
+        x, rstd = ctx.saved_tensors
+        # flatten the batch dimension, if any.
+        # We're interested in 'samples' x norm_dimension
+        x = x.reshape(-1, x.size(-1))
+        M, N = x.size()
+        # heuristics for amount of parallel reduction stream for DG/DB
+        GROUP_SIZE_M = 32
+        if N <= 8192:
+            GROUP_SIZE_M = 64
+        if N <= 4096:
+            GROUP_SIZE_M = 96
+        if N <= 2048:
+            GROUP_SIZE_M = 128
+        if N <= 1024:
+            GROUP_SIZE_M = 256
+        if dy.dtype == torch.float32:
+            GROUP_SIZE_M = GROUP_SIZE_M // 2
+        # allocate output
+        dy = dy.contiguous()
+        dx = torch.empty_like(dy)
+        # Check the tensor shapes and layouts
+        # we suppose in the kernel that they have the same size and are contiguous
+        assert (
+            dy.numel() == x.numel()
+        ), "Something is wrong in the backward graph, possibly because of an inplace operation after the layernorm"
+        # enqueue kernel using forward pass heuristics
+        # also compute partial sums for DW and DB
+        num_warps = min(max(ctx.BLOCK_SIZE_N // 256, 1), 16)
+        # fmt: off
+        srms_norm_bwd_dx_fused[(M,)](
+            dx, dy, x,
+            rstd,
+            x.stride(0),
+            N,
+            BLOCK_SIZE_N=ctx.BLOCK_SIZE_N,
+            num_warps=num_warps
+        )
+        # fmt: on
+        dx = dx.reshape_as(dy)
+        return dx, None, None
+class SimpleRMSNorm(torch.nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.dim = dim
+    def forward(self, x):
+        return _SrmsNorm.apply(x, self.eps)

tokenization_transnormerllm.py ADDED Viewed

	@@ -0,0 +1,240 @@

+# CREDITS: tiktoken @openai
+# https://github.com/openai/tiktoken
+#
+#    Copyright 2024 OpenNLPLab
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+# coding=utf-8
+import base64
+import logging
+import os
+from typing import Collection, Dict, List, Set, Tuple, Union
+import unicodedata
+import tiktoken
+from transformers import AddedToken, AutoTokenizer, PreTrainedTokenizer
+logger = logging.getLogger(__name__)
+# Name of the vocabulary file, as read by the tokenizer class and written by `save_vocabulary`.
+VOCAB_FILES_NAMES = {"vocab_file": "transnormer_100k.tiktoken"}
+# Pre-tokenization split pattern handed to `tiktoken.Encoding` as `pat_str`.
+PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+# Special tokens: surface form -> token id. The ids are spelled out explicitly
+# (see the note below); '<|J2PM|>' (100256) is lower than '<|endoftext|>' (100257).
+SPECIAL_TOKENS_DICT = {'<|endoftext|>': 100257, '<|fim_prefix|>': 100258, '<|fim_middle|>': 100259, '<|fim_suffix|>': 100260, '<|endofprompt|>': 100276, '<|J2PM|>': 100256, '<s>': 100261, '<pad>': 100262, '<unk>': 100263, '<mask>': 100264}
+SPECIAL_TOKENS_SET = set(SPECIAL_TOKENS_DICT.keys())
+# Notes on the special tokens above:
+# as the default behavior is changed to allow special tokens in
+# regular texts, the surface forms of special tokens need to be
+# as different as possible to minimize the impact
+# changed to use actual index to avoid misconfiguration with vocabulary expansion
+def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
+    with open(tiktoken_bpe_file, "rb") as f:
+        contents = f.read()
+    return {
+        base64.b64decode(token): int(rank)
+        for token, rank in (line.split() for line in contents.splitlines() if line)
+    }
+class GPT4Tokenizer(PreTrainedTokenizer):
+    vocab_files_names = VOCAB_FILES_NAMES
+    def __init__(
+        self,
+        vocab_file,
+        errors="replace",
+        extra_vocab_file=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # how to handle errors in decoding UTF-8 byte sequences
+        # use ignore if you are in streaming inference
+        self.errors = errors
+        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: Dict[bytes, int]
+        self.special_tokens = SPECIAL_TOKENS_DICT
+        enc = tiktoken.Encoding(
+            "transnormer_100k",
+            pat_str=PAT_STR,
+            mergeable_ranks=self.mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        self.decoder = {
+            v: k for k, v in self.mergeable_ranks.items()
+        }  # type: dict[int, bytes|str]
+        self.decoder.update({v: k for k, v in self.special_tokens.items()})
+        self.tokenizer = enc  # type: tiktoken.Encoding
+        self.eod_id = self.tokenizer.eot_token
+        self.pad_token_id = 100262
+        self.bos_token_id = 100261
+        self.eos_token_id = self.eod_id
+    def __getstate__(self):
+        # for pickle lovers
+        state = self.__dict__.copy()
+        del state["tokenizer"]
+        return state
+    def __setstate__(self, state):
+        # tokenizer is not python native; don't pass it; rebuild it
+        self.__dict__.update(state)
+        enc = tiktoken.Encoding(
+            "transnormer_100k",
+            pat_str=PAT_STR,
+            mergeable_ranks=self.mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        self.tokenizer = enc
+    def __len__(self) -> int:
+        return self.tokenizer.n_vocab
+    def get_vocab(self) -> Dict[bytes, int]:
+        return self.mergeable_ranks
+    def convert_tokens_to_ids(
+        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
+    ) -> List[int]:
+        ids = []
+        if isinstance(tokens, (str, bytes)):
+            if tokens in self.special_tokens:
+                return self.special_tokens[tokens]
+            else:
+                return self.mergeable_ranks.get(tokens)
+        for token in tokens:
+            if token in self.special_tokens:
+                ids.append(self.special_tokens[token])
+            else:
+                ids.append(self.mergeable_ranks.get(token))
+        return ids
+    def _add_tokens(
+        self,
+        new_tokens: Union[List[str], List[AddedToken]],
+        special_tokens: bool = False,
+    ) -> int:
+        if not special_tokens and new_tokens:
+            raise ValueError("Adding regular tokens is not supported")
+        for token in new_tokens:
+            surface_form = token.content if isinstance(token, AddedToken) else token
+            if surface_form not in SPECIAL_TOKENS_SET:
+                raise ValueError("Adding unknown special tokens is not supported")
+        return 0
+    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
+        """
+        Save only the vocabulary of the tokenizer (vocabulary).
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        file_path = os.path.join(save_directory, "transnormer_100k.tiktoken")
+        with open(file_path, "w", encoding="utf8") as w:
+            for k, v in self.mergeable_ranks.items():
+                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
+                w.write(line)
+        return (file_path,)
+    def tokenize(
+        self,
+        text: str,
+        allowed_special: Union[Set, str] = "all",
+        disallowed_special: Union[Collection, str] = (),
+        **kwargs,
+    ) -> List[Union[bytes, str]]:
+        """
+        Converts a string in a sequence of tokens.
+        Args:
+            text (`str`):
+                The sequence to be encoded.
+            allowed_special (`Literal["all"]` or `set`):
+                The surface forms of the tokens to be encoded as special tokens in regular texts.
+                Default to "all".
+            disallowed_special (`Literal["all"]` or `Collection`):
+                The surface forms of the tokens that should not be in regular texts and trigger errors.
+                Default to an empty tuple.
+            kwargs (additional keyword arguments, *optional*):
+                Will be passed to the underlying model specific encode method.
+        Returns:
+            `List[bytes|str]`: The list of tokens.
+        """
+        tokens = []
+        # this implementation takes a detour: text -> token id -> token surface forms
+        for t in self.tokenizer.encode(
+            text, allowed_special=allowed_special, disallowed_special=disallowed_special
+        ):
+            tokens.append(self.decoder[t])
+        return tokens
+    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
+        """
+        Converts a sequence of tokens in a single string.
+        """
+        text = ""
+        temp = b""
+        for t in tokens:
+            if isinstance(t, str):
+                if temp:
+                    text += temp.decode("utf-8", errors=self.errors)
+                    temp = b""
+                text += t
+            elif isinstance(t, bytes):
+                temp += t
+            else:
+                raise TypeError("token should only be of type types or str")
+        if temp:
+            text += temp.decode("utf-8", errors=self.errors)
+        return text
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_vocab
+    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
+        """Converts an id to a token, special tokens included"""
+        if index in self.decoder:
+            return self.decoder[index]
+        raise ValueError("unknown ids")
+    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
+        """Converts a token to an id using the vocab, special tokens included"""
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        if token in self.mergeable_ranks:
+            return self.mergeable_ranks[token]
+        raise ValueError("unknown token")
+    def _tokenize(self, text: str, **kwargs):
+        """
+        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+        Do NOT take care of added tokens.
+        """
+        raise NotImplementedError
+    def _decode(
+        self,
+        token_ids: Union[int, List[int]],
+        skip_special_tokens: bool = False,
+        errors: str = None,
+        **kwargs,
+    ) -> str:
+        if isinstance(token_ids, int):
+            token_ids = [token_ids]
+        if skip_special_tokens:
+            token_ids = [i for i in token_ids if i < self.eod_id]
+        return self.tokenizer.decode(token_ids, errors=errors or self.errors)

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "model_max_length": 65536,
+  "tokenizer_class": "GPT4Tokenizer",
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_transnormerllm.GPT4Tokenizer",
+      null
+    ]
+  }
+}

transnormer_100k.tiktoken ADDED Viewed

The diff for this file is too large to render. See raw diff

utils.py ADDED Viewed

	@@ -0,0 +1,166 @@

+#    Copyright 2024 OpenNLPLab
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+# coding=utf-8
+import logging
+import os
+import sys
+import torch
+from torch import nn
+import torch.distributed as dist
+import torch.nn.functional as F
+from .norm import SimpleRMSNorm as SimpleRMSNormTorch
+from .srmsnorm_triton import SimpleRMSNorm as SimpleRMSNormTriton
+# Feature flags read from the environment variables "use_triton" (default "True") and
+# "debug" (default "False").
+# NOTE(review): the values go through eval(), which executes whatever Python expression the
+# environment contains; consider a strict boolean parser if the environment is not trusted.
+use_triton = eval(os.environ.get("use_triton", default="True"))
+debug = eval(os.environ.get("debug", default="False"))
+# Choose the SimpleRMSNorm implementation exported by this module.
+if use_triton:
+    SimpleRMSNorm = SimpleRMSNormTriton
+else:
+    SimpleRMSNorm = SimpleRMSNormTorch
+# Configure the root logger when this module is imported: timestamped records go to stdout
+# and the level is read from the LOGLEVEL environment variable (default "INFO").
+logging.basicConfig(
+    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=os.environ.get("LOGLEVEL", "INFO").upper(),
+    stream=sys.stdout,
+)
+logger = logging.getLogger("print_config")
+# Granularity used by `convert_to_multiple_of_base` when rounding sizes up.
+BASE_DIM = 256
+def is_dist_avail_and_initialized():
+    if not dist.is_available():
+        return False
+    if not dist.is_initialized():
+        return False
+    return True
+def get_world_size():
+    if not is_dist_avail_and_initialized():
+        return 1
+    return dist.get_world_size()
+def get_rank():
+    if not is_dist_avail_and_initialized():
+        return 0
+    return dist.get_rank()
+def is_main_process():
+    return get_rank() == 0
+def logging_info(string):
+    if is_main_process():
+        logger.info(string)
+def print_params(**kwargs):
+    if is_main_process():
+        logger.info(f"start print config of {kwargs['__class__']}")
+        for key in kwargs:
+            if key in ["__class__", "self"]:
+                continue
+            logger.info(f"{key}: {kwargs[key]}")
+        logger.info(f"end print config of {kwargs['__class__']}")
+def print_config(config):
+    if is_main_process():
+        logger.info(f"start print config of {config['__class__']}")
+        for key in config:
+            if key in ["__class__", "self"]:
+                continue
+            logger.info(f"{key}: {config[key]}")
+        logger.info(f"end print config of {config['__class__']}")
+def print_module(module):
+    named_modules = set()
+    for p in module.named_modules():
+        named_modules.update([p[0]])
+    named_modules = list(named_modules)
+    string_repr = ""
+    for p in module.named_parameters():
+        name = p[0].split(".")[0]
+        if name not in named_modules:
+            string_repr = (string_repr + "(" + name + "): " + "Tensor(" +
+                           str(tuple(p[1].shape)) + ", requires_grad=" +
+                           str(p[1].requires_grad) + ")\n")
+    return string_repr.rstrip("\n")
+def get_activation_fn(activation):
+    if debug:
+        logger.info(f"activation: {activation}")
+    if activation == "gelu":
+        return F.gelu
+    elif activation == "relu":
+        return F.relu
+    elif activation == "elu":
+        return F.elu
+    elif activation == "sigmoid":
+        return F.sigmoid
+    elif activation == "exp":
+        def f(x):
+            with torch.no_grad():
+                x_max = torch.max(x, dim=-1, keepdims=True).values
+            y = torch.exp(x - x_max)
+            return y
+        return f
+    elif activation == "leak":
+        return F.leaky_relu
+    elif activation == "1+elu":
+        def f(x):
+            return 1 + F.elu(x)
+        return f
+    elif activation == "2+elu":
+        def f(x):
+            return 2 + F.elu(x)
+        return f
+    elif activation == "silu" or activation == "swish":
+        return F.silu
+    elif activation == "sine":
+        return torch.sin
+    else:
+        logger.info(
+            f"activation: does not support {activation}, use Identity!!!")
+        return lambda x: x
+def get_norm_fn(norm_type):
+    if norm_type == "simplermsnorm":
+        return SimpleRMSNorm
+    else:
+        return nn.LayerNorm
+def convert_to_multiple_of_base(x):
+    return BASE_DIM * ((x + BASE_DIM - 1) // BASE_DIM)