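"""Simple RMSNorm in Triton.

Rows are scaled by 1 / sqrt(mean(x^2) + eps); there is no mean subtraction
and no learnable affine parameter. This module provides a fused forward
kernel, a fused backward (dx) kernel, and a torch.autograd.Function /
nn.Module wrapper around them.
"""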
import torch

import triton
import triton.language as tl


@triton.jit
def srms_norm_fw(X, Y, V, stride, N, eps, BLOCK_SIZE_N: tl.constexpr):
    # One program instance normalizes one row of the (M, N) input.
    row = tl.program_id(0)
    cols = tl.arange(0, BLOCK_SIZE_N)
    mask = cols < N

    # Load the row and upcast to fp32 for a numerically stable reduction.
    x_ptrs = X + row * stride + cols
    x = tl.load(x_ptrs, mask=mask, other=0.0).to(tl.float32)
    x_zm = tl.where(mask, x, 0.0)

    # Mean of squares over the feature dimension, then the reciprocal RMS.
    x_var = tl.sum(x_zm * x_zm, axis=0) / N
    rstd = 1.0 / tl.sqrt(x_var + eps)

    # Normalize and stash the per-row rstd for the backward pass.
    y = x_zm * rstd
    tl.store(V + row, rstd)

    y_ptrs = Y + row * stride + cols
    tl.store(y_ptrs, y, mask=mask)


@triton.jit
def srms_norm_bwd_dx_fused(
    DX, DY,
    X, V,
    stride, N,
    BLOCK_SIZE_N: tl.constexpr,
):
    # One program instance computes the input gradient for one row.
    row = tl.program_id(0)
    cols = tl.arange(0, BLOCK_SIZE_N)
    mask = cols < N

    # Pointers into the saved input and the incoming gradient.
    x_ptrs = X + row * stride + cols
    dy_ptrs = DY + row * stride + cols

    x = tl.load(x_ptrs, mask=mask, other=0)
    dy = tl.load(dy_ptrs, mask=mask, other=0)
    rstd = tl.load(V + row)

    # Reconstruct the normalized row from the saved reciprocal RMS.
    xhat = x * rstd
    wdy = dy

    xhat = tl.where(mask, xhat, 0.)
    wdy = tl.where(mask, wdy, 0.)

    # With y = x * rstd and rstd = (mean(x^2) + eps)^(-1/2):
    #   dx = (dy - xhat * mean(xhat * dy)) * rstd
    mean1 = tl.sum(xhat * wdy, axis=0) / N
    dx = (wdy - (xhat * mean1)) * rstd

    dx_ptrs = DX + row * stride + cols
    tl.store(dx_ptrs, dx, mask=mask)


class _SrmsNorm(torch.autograd.Function):

    @staticmethod
    def forward(ctx, x, eps):
        # fp16 needs a larger floor on eps to keep 1 / sqrt(var + eps) well behaved.
        if x.dtype == torch.float16:
            eps = max(eps, 1.6e-5)

        y = torch.empty_like(x)

        # Flatten leading dimensions: the kernels work on an (M, N) matrix.
        x_arg = x.reshape(-1, x.shape[-1])
        M, N = x_arg.shape

        # Per-row reciprocal RMS, kept in fp32 and reused by the backward pass.
        rstd = torch.empty((M,), dtype=torch.float32, device=x.device)

        # Each program handles a full row, so a row must fit in a single block
        # (at most 64 KB of data).
        MAX_FUSED_SIZE = 65536 // x.element_size()
        BLOCK_SIZE_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
        if N > BLOCK_SIZE_N:
            raise RuntimeError(
                "This RMS norm doesn't support feature dims larger than 64KB per row.")

        if not x_arg.is_contiguous() or not y.is_contiguous():
            x_arg = x_arg.contiguous()
            y = y.contiguous()

        # Heuristic: roughly one warp per 256 columns, clamped to [1, 16].
        num_warps = min(max(BLOCK_SIZE_N // 256, 1), 16)

        # Launch one program per row.
        srms_norm_fw[(M,)](
            x_arg, y, rstd,
            x_arg.stride(0),
            N,
            eps,
            num_warps=num_warps,
            BLOCK_SIZE_N=BLOCK_SIZE_N,
        )

        ctx.save_for_backward(x, rstd)
        ctx.BLOCK_SIZE_N = BLOCK_SIZE_N
        ctx.num_warps = num_warps

        return y.reshape_as(x)

    @staticmethod
    def backward(ctx, dy):
        x, rstd = ctx.saved_tensors

        # Flatten to the same (M, N) view used in the forward pass.
        x = x.reshape(-1, x.size(-1))
        M, N = x.size()

        dy = dy.contiguous()
        dx = torch.empty_like(dy)

        assert (
            dy.numel() == x.numel()
        ), "Something is wrong in the backward graph, possibly because of an inplace operation after the RMS norm"

        num_warps = min(max(ctx.BLOCK_SIZE_N // 256, 1), 16)

        # Fused kernel: one program per row computes dx directly.
        srms_norm_bwd_dx_fused[(M,)](
            dx, dy, x,
            rstd,
            x.stride(0),
            N,
            BLOCK_SIZE_N=ctx.BLOCK_SIZE_N,
            num_warps=num_warps,
        )

        dx = dx.reshape_as(dy)
        # Gradients with respect to (x, eps); eps is not a tensor, so it gets None.
        return dx, None


class SimpleRMSNorm(torch.nn.Module):
    """RMS normalization without learnable parameters:
    y = x / sqrt(mean(x ** 2, dim=-1) + eps).
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        # Kept for introspection only; there is no learnable weight of size `dim`.
        self.dim = dim

    def forward(self, x):
        return _SrmsNorm.apply(x, self.eps)
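

# Minimal sanity-check sketch (not part of the original module): compares the
# Triton path against an eager PyTorch reference of the same formula,
# x / sqrt(mean(x^2) + eps), and runs a backward pass. Assumes a CUDA device
# is available; float32 input avoids the fp16 eps clamp in the forward pass.
if __name__ == "__main__":
    if torch.cuda.is_available():
        torch.manual_seed(0)
        x = torch.randn(4, 1024, device="cuda", requires_grad=True)

        norm = SimpleRMSNorm(dim=1024)
        y = norm(x)

        # Eager reference using the same eps as the module.
        x_ref = x.detach()
        y_ref = x_ref / torch.sqrt(x_ref.pow(2).mean(dim=-1, keepdim=True) + norm.eps)
        print("forward max abs err:", (y - y_ref).abs().max().item())

        # Backward smoke test: gradients flow through the custom autograd.Function.
        y.sum().backward()
        print("grad shape:", tuple(x.grad.shape))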