Upload 8 files

773deed verified 6 months ago

30.3 kB

	# Copyright (c) Facebook, Inc. and its affiliates.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	import math
	import inspect
	from typing import Callable, Dict, List, Optional, Set, Tuple, Union

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torch.nn import TransformerEncoder, TransformerEncoderLayer
	from fairseq import utils
	from fairseq.models.transformer import *
	from fairseq.incremental_decoding_utils import with_incremental_state
	from fairseq.modules.quant_noise import quant_noise
	from transformers.models.roberta.modeling_roberta import (
	RobertaEncoder,
	RobertaConfig,
	RobertaModel,
	RobertaLMHead,
	RobertaForMaskedLM,
	RobertaLayer
	)

	# from .multihead_linear_attention import MultiheadLinearAttention


	class LinformerTransformerEncoderLayer(RobertaLayer):
	    """
	    Implements a Linformer Encoder Layer used in BERT/XLM style pre-trained
	    models.

	    The layer is built on top of ``RobertaLayer`` and then customised:

	    * ``self.attention`` is replaced by a ``MultiheadLinearAttention`` module,
	    * two extra ``nn.LayerNorm`` modules (``attn_layer_norm`` and
	      ``final_layer_norm``) are applied *before* the attention and the
	      feed-forward sub-blocks respectively,
	    * ``self.output`` is replaced by the ``RobertaOutput`` class defined in
	      this file, which only applies the dense projection; the residual
	      additions are done explicitly in :meth:`forward`.
	    """

	    def __init__(self, config, shared_compress_layer):
	        """Build the layer.

	        Args:
	            config: configuration object. This constructor reads
	                ``embed_dim`` and ``hidden_size`` directly; further
	                attributes are read by ``RobertaLayer.__init__``,
	                :meth:`build_self_attention` and ``RobertaOutput``.
	            shared_compress_layer: compression layer handed to the attention
	                module as ``shared_compress_layer``, or ``None``.
	        """
	        # wrap in a list so it's not automatically registered by PyTorch
	        self.shared_compress_layer = [shared_compress_layer]

	        # NOTE: the original constructor also copied config.num_heads,
	        # config.dim_feedforward, config.activation, ... into locals that
	        # were never used; they are dropped so that a config lacking those
	        # unused fields no longer fails with AttributeError.
	        super().__init__(config)
	        self.attention = self.build_self_attention(config.embed_dim, config)
	        self.attn_layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-5)
	        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-5)
	        self.output = RobertaOutput(config)

	    def build_self_attention(self, embed_dim, args):
	        """Create the ``MultiheadLinearAttention`` module for this layer.

	        Args:
	            embed_dim: model dimension passed to the attention module.
	            args: configuration object providing ``encoder_attention_heads``,
	                ``dropout``, ``quant_noise_pq``, ``quant_noise_pq_block_size``,
	                ``compressed``, ``max_positions``, ``shared_kv_compressed``
	                and ``freeze_compress``.

	        Returns:
	            The configured self-attention module.
	        """
	        attn = MultiheadLinearAttention(
	            embed_dim,
	            args.encoder_attention_heads,
	            dropout=args.dropout,
	            self_attention=True,
	            q_noise=args.quant_noise_pq,
	            qn_block_size=args.quant_noise_pq_block_size,
	            compressed=args.compressed,
	            max_seq_len=args.max_positions,
	            shared_kv_compressed=args.shared_kv_compressed,
	            shared_compress_layer=self.shared_compress_layer[0],
	            freeze_compress=args.freeze_compress,
	        )
	        return attn

	    def feed_forward_chunk(self, attention_output):
	        """Run the feed-forward sub-block on one chunk.

	        Applies ``self.intermediate`` followed by ``self.output``. The
	        residual is passed to ``self.output`` for signature compatibility;
	        the ``RobertaOutput`` defined in this file does not use it.
	        """
	        residual = attention_output
	        x = self.intermediate(attention_output)
	        layer_output = self.output(x, residual)
	        return layer_output

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.FloatTensor] = None,
	        head_mask: Optional[torch.FloatTensor] = None,
	        encoder_hidden_states: Optional[torch.FloatTensor] = None,
	        encoder_attention_mask: Optional[torch.FloatTensor] = None,
	        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
	        output_attentions: Optional[bool] = False,
	    ) -> Tuple[torch.Tensor]:
	        """Apply pre-norm self-attention and feed-forward with residuals.

	        Args:
	            hidden_states: layer input.
	                NOTE(review): ``MultiheadLinearAttention.forward`` unpacks
	                its query as ``(tgt_len, bsz, embed_dim)`` -- confirm the
	                caller supplies that layout.
	            attention_mask: forwarded positionally to the attention module
	                (see the note in the body).
	            head_mask: forwarded positionally to the attention module (see
	                the note in the body).
	            encoder_hidden_states: only used on the decoder path, for
	                cross-attention.
	            encoder_attention_mask: mask for the cross-attention call.
	            past_key_value: cached key/value tuple; the first two entries are
	                passed to self-attention, the last two to cross-attention.
	            output_attentions: forwarded to the attention modules.

	        Returns:
	            Tuple whose first element is the layer output, followed by the
	            remaining attention outputs and, when ``self.is_decoder`` is
	            true, the present key/value entry.

	        Raises:
	            ValueError: if ``encoder_hidden_states`` is given on a decoder
	                layer that has no ``crossattention`` attribute.
	        """
	        residual = hidden_states

	        if self.attn_layer_norm is not None:
	            hidden_states = self.attn_layer_norm(hidden_states)

	        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
	        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
	        # NOTE: attention_mask and head_mask are passed positionally and thus
	        # land in MultiheadLinearAttention.forward's `key` and `value`
	        # parameters. That module's self-attention branch derives keys and
	        # values from the query only, so neither mask influences the
	        # attention weights here.
	        self_attention_outputs = self.attention(
	            hidden_states,
	            attention_mask,
	            head_mask,
	            output_attentions=output_attentions,
	            past_key_value=self_attn_past_key_value,
	        )
	        attention_output = self_attention_outputs[0]

	        # if decoder, the last output is tuple of self-attn cache
	        if self.is_decoder:
	            outputs = self_attention_outputs[1:-1]
	            present_key_value = self_attention_outputs[-1]
	        else:
	            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

	        cross_attn_present_key_value = None
	        if self.is_decoder and encoder_hidden_states is not None:
	            if not hasattr(self, "crossattention"):
	                raise ValueError(
	                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
	                    " by setting `config.add_cross_attention=True`"
	                )

	            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
	            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
	            cross_attention_outputs = self.crossattention(
	                attention_output,
	                attention_mask,
	                head_mask,
	                encoder_hidden_states,
	                encoder_attention_mask,
	                cross_attn_past_key_value,
	                output_attentions,
	            )
	            attention_output = cross_attention_outputs[0]
	            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights

	            # add cross-attn cache to positions 3,4 of present_key_value tuple
	            cross_attn_present_key_value = cross_attention_outputs[-1]
	            present_key_value = present_key_value + cross_attn_present_key_value

	        # first residual connection (around the attention sub-block)
	        attention_output = attention_output + residual
	        residual = attention_output
	        attention_output = self.final_layer_norm(attention_output)
	        layer_output = apply_chunking_to_forward(
	            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
	        )
	        # second residual connection (around the feed-forward sub-block)
	        layer_output = layer_output + residual

	        outputs = (layer_output,) + outputs

	        # if decoder, return the attn key/values as the last output
	        if self.is_decoder:
	            outputs = outputs + (present_key_value,)

	        return outputs


	class RobertaOutput(nn.Module):
	    """Output projection of the feed-forward sub-block.

	    Owns a dense layer mapping ``config.intermediate_size`` to
	    ``config.hidden_size``, plus ``LayerNorm`` and ``dropout`` sub-modules.
	    Only the dense layer takes part in :meth:`forward`; the other two are
	    created (and therefore appear in the module's parameters) but are not
	    applied.
	    """

	    def __init__(self, config):
	        super().__init__()
	        in_features = config.intermediate_size
	        out_features = config.hidden_size
	        self.dense = nn.Linear(in_features, out_features)
	        self.LayerNorm = nn.LayerNorm(out_features, eps=config.layer_norm_eps)
	        self.dropout = nn.Dropout(config.hidden_dropout_prob)

	    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
	        """Project ``hidden_states`` with the dense layer.

	        ``input_tensor`` is accepted for signature compatibility and is not
	        used.
	        """
	        return self.dense(hidden_states)


	class LinformerTransformerEncoder(RobertaEncoder):
	    """
	    Linformer variant of ``RobertaEncoder``.

	    The constructor first lets ``RobertaEncoder.__init__`` build the
	    encoder, then replaces ``self.layer`` with ``config.num_layers``
	    instances of ``LinformerTransformerEncoderLayer``.

	    When ``config.shared_layer_kv_compressed == 1`` a single ``nn.Linear``
	    of shape ``max_positions -> max_positions // compressed`` is created
	    and handed to every layer; otherwise each layer receives ``None``. The
	    (possibly ``None``) layer is also stored as ``self.compress_layer``.

	    No ``forward`` is defined here, so the forward pass is the one
	    inherited from ``RobertaEncoder``.
	    """

	    def __init__(self, config, **kwargs):
	        shared_projection = None
	        if config.shared_layer_kv_compressed == 1:
	            # one projection shared by all layers
	            shared_projection = nn.Linear(
	                config.max_positions,
	                config.max_positions // config.compressed,
	            )
	            # initialize parameters for the compression layer
	            nn.init.xavier_uniform_(shared_projection.weight, gain=1 / math.sqrt(2))
	            if config.freeze_compress == 1:
	                shared_projection.weight.requires_grad = False

	        super().__init__(config)

	        linformer_layers = [
	            LinformerTransformerEncoderLayer(config, shared_projection)
	            for _ in range(config.num_layers)
	        ]
	        self.layer = nn.ModuleList(linformer_layers)
	        self.compress_layer = shared_projection
	        self.layer_norm = nn.LayerNorm(config.embed_dim)


	@with_incremental_state
	class MultiheadLinearAttention(nn.Module):
	    """Multi-headed attention with a learned linear compression of the
	    sequence axis.

	    In self-attention mode the key and value inputs are obtained by
	    projecting the query along its time axis with ``compress_k`` /
	    ``compress_v`` (sliced to the current sequence length) before the usual
	    ``k_proj`` / ``v_proj`` projections are applied.
	    """

	    def __init__(
	        self,
	        embed_dim,
	        num_heads,
	        kdim=None,
	        vdim=None,
	        dropout=0.0,
	        bias=True,
	        add_bias_kv=False,
	        add_zero_attn=False,
	        self_attention=False,
	        encoder_decoder_attention=False,
	        q_noise=0.0,
	        qn_block_size=8,
	        compressed=1,
	        max_seq_len=256,
	        shared_kv_compressed=0,
	        shared_compress_layer=None,
	        freeze_compress=0,
	    ):
	        """Build the projections and the compression layers.

	        Args:
	            embed_dim: model dimension; must be divisible by ``num_heads``.
	            num_heads: number of attention heads.
	            kdim, vdim: key / value input dimensions (default ``embed_dim``).
	            dropout: dropout probability applied to the attention weights.
	            bias: whether the q/k/v/out projections have a bias.
	            add_bias_kv: add learned bias vectors to key and value.
	            add_zero_attn: append an all-zero key/value position.
	            self_attention: keys and values are derived from the query.
	            encoder_decoder_attention: keys and values come from ``key``.
	            q_noise, qn_block_size: forwarded to ``quant_noise``.
	            compressed: compression ratio; the compressed length is
	                ``max_seq_len // compressed``.
	            max_seq_len: input width of the compression layers.
	            shared_kv_compressed: ``1`` to use ``compress_k`` for values as
	                well, ``0`` to keep a separate ``compress_v``.
	            shared_compress_layer: externally created compression layer to
	                reuse; when ``None`` the layers are created here.
	            freeze_compress: ``1`` to disable gradients on the compression
	                weights.
	        """
	        super().__init__()
	        self.embed_dim = embed_dim
	        self.kdim = kdim if kdim is not None else embed_dim
	        self.vdim = vdim if vdim is not None else embed_dim
	        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
	        self.num_heads = num_heads
	        self.dropout = dropout
	        self.head_dim = embed_dim // num_heads
	        assert (
	            self.head_dim * num_heads == self.embed_dim
	        ), "embed_dim must be divisible by num_heads"
	        self.scaling = self.head_dim ** -0.5

	        self.self_attention = self_attention
	        self.encoder_decoder_attention = encoder_decoder_attention
	        assert not self.self_attention or self.qkv_same_dim, (
	            "Self-attention requires query, key and " "value to be of the same size"
	        )

	        self.k_proj = quant_noise(
	            nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
	        )
	        self.v_proj = quant_noise(
	            nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
	        )
	        self.q_proj = quant_noise(
	            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
	        )

	        # used for compress sequence to subsequence
	        if shared_compress_layer is None:
	            self.compress_seq_len = max_seq_len // compressed
	            self.compress_k = nn.Linear(max_seq_len, self.compress_seq_len, bias=False)
	            if shared_kv_compressed == 0:
	                self.compress_v = nn.Linear(
	                    max_seq_len, self.compress_seq_len, bias=False
	                )
	            self.layerwise_sharing = False
	        else:
	            self.compress_k = shared_compress_layer
	            if shared_kv_compressed == 0:
	                self.compress_v = shared_compress_layer
	            self.layerwise_sharing = True
	        self.shared_kv_compressed = shared_kv_compressed

	        self.out_proj = quant_noise(
	            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size)

	        if add_bias_kv:
	            self.bias_k = nn.Parameter(torch.Tensor(1, 1, embed_dim))
	            self.bias_v = nn.Parameter(torch.Tensor(1, 1, embed_dim))
	        else:
	            self.bias_k = self.bias_v = None

	        self.add_zero_attn = add_zero_attn

	        self.reset_parameters()

	        if freeze_compress == 1:
	            self.compress_k.weight.requires_grad = False
	            if shared_kv_compressed == 0:
	                self.compress_v.weight.requires_grad = False

	        self.onnx_trace = False

	    def reset_parameters(self):
	        """(Re-)initialise the projection and compression weights."""
	        if self.qkv_same_dim:
	            # Empirically observed the convergence to be much better with
	            # the scaled initialization
	            nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
	            nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
	            nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
	            if (
	                not self.layerwise_sharing
	            ):  # otherwise, we already initialize the parameters
	                nn.init.xavier_uniform_(self.compress_k.weight, gain=1 / math.sqrt(2))
	                if self.shared_kv_compressed == 0:
	                    nn.init.xavier_uniform_(
	                        self.compress_v.weight, gain=1 / math.sqrt(2)
	                    )
	        else:
	            nn.init.xavier_uniform_(self.k_proj.weight)
	            nn.init.xavier_uniform_(self.v_proj.weight)
	            nn.init.xavier_uniform_(self.q_proj.weight)
	            if (
	                not self.layerwise_sharing
	            ):  # otherwise, we already initialize the parameters
	                nn.init.xavier_uniform_(self.compress_k.weight)
	                if self.shared_kv_compressed == 0:
	                    nn.init.xavier_uniform_(self.compress_v.weight)

	        nn.init.xavier_uniform_(self.out_proj.weight)
	        if self.out_proj.bias is not None:
	            nn.init.constant_(self.out_proj.bias, 0.0)
	        if self.bias_k is not None:
	            nn.init.xavier_normal_(self.bias_k)
	        if self.bias_v is not None:
	            nn.init.xavier_normal_(self.bias_v)

	    def prepare_for_onnx_export_(self):
	        """Switch the module into ONNX-tracing mode."""
	        self.onnx_trace = True

	    def forward(
	        self,
	        query,
	        key: Optional[torch.Tensor],
	        value: Optional[torch.Tensor],
	        key_padding_mask: Optional[torch.Tensor] = None,
	        incremental_state: Optional[Dict[str, Dict[str, Optional[torch.Tensor]]]] = None,
	        output_attentions: bool = True,
	        need_weights: bool = True,
	        static_kv: bool = False,
	        attn_mask: Optional[torch.Tensor] = None,
	        before_softmax: bool = False,
	        need_head_weights: bool = False,
	        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
	    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
	        """Input shape: Time x Batch x Channel

	        Args:
	            query: input of shape ``(tgt_len, bsz, embed_dim)``.
	            key, value: only read outside self-attention mode; in
	                self-attention mode keys and values are derived from
	                ``query``.
	            key_padding_mask (ByteTensor, optional): mask of shape
	                `(batch, src_len)`, where padding elements are indicated
	                by 1s. In this implementation it is only extended and
	                stored in the incremental state; it is not added to the
	                attention scores.
	            incremental_state: fairseq incremental decoding state.
	            output_attentions (bool, optional): return the attention
	                weights as the second element of the result
	                (default: True). When False that element is ``None``.
	            need_weights (bool, optional): accepted for API compatibility;
	                whether weights are returned is controlled by
	                `output_attentions`.
	            static_kv (bool, optional): reuse cached key/value as-is
	                (encoder-decoder attention only).
	            attn_mask (ByteTensor, optional): typically used to
	                implement causal attention, where the mask prevents the
	                attention from looking forward in time (default: None).
	            before_softmax (bool, optional): return the raw attention
	                weights and values before the attention softmax.
	            need_head_weights (bool, optional): when weights are returned,
	                keep one set per head instead of averaging over heads.
	            past_key_value: accepted for API compatibility; not used.

	        Returns:
	            ``(attn, attn_weights)``, or ``(attn_weights, v)`` when
	            ``before_softmax`` is set.
	        """

	        if need_head_weights:
	            need_weights = True

	        tgt_len, bsz, embed_dim = query.size()
	        assert embed_dim == self.embed_dim
	        assert list(query.size()) == [tgt_len, bsz, embed_dim]

	        if incremental_state is not None:
	            saved_state = self._get_input_buffer(incremental_state)
	            if saved_state is not None and "prev_key" in saved_state:
	                # previous time steps are cached - no need to recompute
	                # key and value if they are static
	                if static_kv:
	                    assert self.encoder_decoder_attention and not self.self_attention
	                    key = value = None
	        else:
	            saved_state = None

	        if self.self_attention:
	            q = self.q_proj(query)

	            k_input = query.permute(1, 2, 0).contiguous()  # B * C * T
	            k_input = (
	                F.linear(k_input, self.compress_k.weight[:, 0:tgt_len])
	                .permute(2, 0, 1)
	                .contiguous()
	            )
	            k = self.k_proj(k_input)

	            v_input = query.permute(1, 2, 0).contiguous()  # B * C * T
	            if self.shared_kv_compressed == 0:
	                v_input = (
	                    F.linear(v_input, self.compress_v.weight[:, 0:tgt_len])
	                    .permute(2, 0, 1)
	                    .contiguous()
	                )
	            if self.shared_kv_compressed == 1:  # use shared kv compressed linear layer
	                v_input = (
	                    F.linear(v_input, self.compress_k.weight[:, 0:tgt_len])
	                    .permute(2, 0, 1)
	                    .contiguous()
	                )
	            v = self.v_proj(v_input)
	        elif self.encoder_decoder_attention:
	            # encoder-decoder attention
	            q = self.q_proj(query)
	            if key is None:
	                assert value is None
	                k = v = None
	            else:
	                k = self.k_proj(key)
	                v = self.v_proj(key)

	        else:
	            assert key is not None and value is not None
	            q = self.q_proj(query)
	            k = self.k_proj(key)
	            v = self.v_proj(value)
	        q *= self.scaling

	        if self.bias_k is not None:
	            assert self.bias_v is not None
	            k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
	            v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
	            if attn_mask is not None:
	                attn_mask = torch.cat(
	                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
	                )
	            if key_padding_mask is not None:
	                key_padding_mask = torch.cat(
	                    [
	                        key_padding_mask,
	                        key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
	                    ],
	                    dim=1,
	                )

	        # reshape to (bsz * num_heads, seq_len, head_dim)
	        q = (
	            q.contiguous()
	            .view(tgt_len, bsz * self.num_heads, self.head_dim)
	            .transpose(0, 1)
	        )
	        if k is not None:
	            k = (
	                k.contiguous()
	                .view(-1, bsz * self.num_heads, self.head_dim)
	                .transpose(0, 1)
	            )
	        if v is not None:
	            v = (
	                v.contiguous()
	                .view(-1, bsz * self.num_heads, self.head_dim)
	                .transpose(0, 1)
	            )

	        if saved_state is not None:
	            # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
	            if "prev_key" in saved_state:
	                _prev_key = saved_state["prev_key"]
	                assert _prev_key is not None
	                prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
	                if static_kv:
	                    k = prev_key
	                else:
	                    assert k is not None
	                    k = torch.cat([prev_key, k], dim=1)
	            if "prev_value" in saved_state:
	                _prev_value = saved_state["prev_value"]
	                assert _prev_value is not None
	                prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
	                if static_kv:
	                    v = prev_value
	                else:
	                    assert v is not None
	                    v = torch.cat([prev_value, v], dim=1)
	            prev_key_padding_mask: Optional[torch.Tensor] = None
	            if "prev_key_padding_mask" in saved_state:
	                prev_key_padding_mask = saved_state["prev_key_padding_mask"]
	            assert k is not None and v is not None
	            key_padding_mask = MultiheadLinearAttention._append_prev_key_padding_mask(
	                key_padding_mask=key_padding_mask,
	                prev_key_padding_mask=prev_key_padding_mask,
	                batch_size=bsz,
	                src_len=k.size(1),
	                static_kv=static_kv,
	            )

	            saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
	            saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
	            saved_state["prev_key_padding_mask"] = key_padding_mask
	            # In this branch incremental_state is never None
	            assert incremental_state is not None
	            incremental_state = self._set_input_buffer(incremental_state, saved_state)
	        assert k is not None
	        src_len = k.size(1)

	        if self.add_zero_attn:
	            assert v is not None
	            src_len += 1
	            k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
	            v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
	            if attn_mask is not None:
	                attn_mask = torch.cat(
	                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
	                )

	        attn_weights = torch.bmm(q, k.transpose(1, 2))
	        attn_weights = MultiheadLinearAttention.apply_sparse_mask(
	            attn_weights, tgt_len, src_len, bsz
	        )

	        assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

	        if attn_mask is not None:
	            attn_mask = attn_mask.unsqueeze(0)
	            if self.onnx_trace:
	                attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
	            attn_weights += attn_mask

	        if before_softmax:
	            return attn_weights, v

	        attn_weights_float = utils.softmax(
	            attn_weights, dim=-1, onnx_trace=self.onnx_trace
	        )
	        attn_weights = attn_weights_float.type_as(attn_weights)
	        attn_probs = F.dropout(
	            attn_weights,
	            p=self.dropout,
	            training=self.training,
	        )
	        assert v is not None
	        attn = torch.bmm(attn_probs, v)
	        assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
	        if self.onnx_trace and attn.size(1) == 1:
	            # when ONNX tracing a single decoder step (sequence length == 1)
	            # the transpose is a no-op copy before view, thus unnecessary
	            attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
	        else:
	            attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
	        attn = self.out_proj(attn)
	        attn_weights: Optional[torch.Tensor] = None
	        if output_attentions:
	            attn_weights = attn_weights_float.view(
	                bsz, self.num_heads, tgt_len, src_len
	            ).transpose(1, 0)
	            if not need_head_weights:
	                # average attention weights over heads
	                attn_weights = attn_weights.mean(dim=0)

	        return attn, attn_weights

	    @staticmethod
	    def _append_prev_key_padding_mask(
	        key_padding_mask: Optional[torch.Tensor],
	        prev_key_padding_mask: Optional[torch.Tensor],
	        batch_size: int,
	        src_len: int,
	        static_kv: bool,
	    ) -> Optional[torch.Tensor]:
	        """Combine the cached and the current key padding masks.

	        Missing pieces are filled with zeros so that the result has
	        ``src_len`` columns; returns ``None`` when both masks are ``None``.
	        """
	        # saved key padding masks have shape (bsz, seq_len)
	        if prev_key_padding_mask is not None and static_kv:
	            new_key_padding_mask = prev_key_padding_mask
	        elif prev_key_padding_mask is not None and key_padding_mask is not None:
	            new_key_padding_mask = torch.cat(
	                [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
	            )
	        # During incremental decoding, as the padding token enters and
	        # leaves the frame, there will be a time when prev or current
	        # is None
	        elif prev_key_padding_mask is not None:
	            filler = torch.zeros(
	                (batch_size, src_len - prev_key_padding_mask.size(1)),
	                device=prev_key_padding_mask.device,
	            )
	            new_key_padding_mask = torch.cat(
	                [prev_key_padding_mask.float(), filler.float()], dim=1
	            )
	        elif key_padding_mask is not None:
	            filler = torch.zeros(
	                (batch_size, src_len - key_padding_mask.size(1)),
	                device=key_padding_mask.device,
	            )
	            new_key_padding_mask = torch.cat(
	                [filler.float(), key_padding_mask.float()], dim=1
	            )
	        else:
	            new_key_padding_mask = prev_key_padding_mask
	        return new_key_padding_mask

	    @torch.jit.export
	    def reorder_incremental_state(
	        self,
	        incremental_state: Dict[str, Dict[str, Optional[torch.Tensor]]],
	        new_order: torch.Tensor,
	    ):
	        """Reorder buffered internal state (for incremental generation)."""
	        input_buffer = self._get_input_buffer(incremental_state)
	        if input_buffer is not None:
	            for k in input_buffer.keys():
	                input_buffer_k = input_buffer[k]
	                if input_buffer_k is not None:
	                    if self.encoder_decoder_attention and input_buffer_k.size(
	                        0
	                    ) == new_order.size(0):
	                        break
	                    input_buffer[k] = input_buffer_k.index_select(0, new_order)
	            incremental_state = self._set_input_buffer(incremental_state, input_buffer)
	        return incremental_state

	    def _get_input_buffer(
	        self, incremental_state: Optional[Dict[str, Dict[str, Optional[torch.Tensor]]]]
	    ) -> Dict[str, Optional[torch.Tensor]]:
	        """Return the cached ``"attn_state"`` dict, or an empty dict."""
	        result = self.get_incremental_state(incremental_state, "attn_state")
	        if result is not None:
	            return result
	        else:
	            empty_result: Dict[str, Optional[torch.Tensor]] = {}
	            return empty_result

	    def _set_input_buffer(
	        self,
	        incremental_state: Dict[str, Dict[str, Optional[torch.Tensor]]],
	        buffer: Dict[str, Optional[torch.Tensor]],
	    ):
	        """Store ``buffer`` under the ``"attn_state"`` key."""
	        return self.set_incremental_state(incremental_state, "attn_state", buffer)

	    # Declared as a static method: the function takes no ``self`` and is
	    # invoked through the class in ``forward``. Without the decorator an
	    # instance-level call would bind the instance as ``attn_weights``.
	    @staticmethod
	    def apply_sparse_mask(attn_weights, tgt_len: int, src_len: int, bsz: int):
	        """Hook for sparse masking; returns ``attn_weights`` unchanged."""
	        return attn_weights

	    def upgrade_state_dict_named(self, state_dict, name):
	        """Split legacy fused ``in_proj_*`` entries into q/k/v entries.

	        ``state_dict`` is modified in place: every ``in_proj_weight`` (and the
	        matching ``in_proj_bias``, if present) is cut into three equal parts
	        and stored as ``q_proj`` / ``k_proj`` / ``v_proj`` entries.
	        """
	        prefix = name + "." if name != "" else ""
	        items_to_add = {}
	        keys_to_remove = []
	        for k in state_dict.keys():
	            if k.endswith(prefix + "in_proj_weight"):
	                # in_proj_weight used to be q + k + v with same dimensions
	                dim = int(state_dict[k].shape[0] / 3)
	                items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim]
	                items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim]
	                items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim :]

	                keys_to_remove.append(k)

	                k_bias = prefix + "in_proj_bias"
	                if k_bias in state_dict.keys():
	                    dim = int(state_dict[k].shape[0] / 3)
	                    items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim]
	                    items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][
	                        dim : 2 * dim
	                    ]
	                    items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim :]

	                    keys_to_remove.append(prefix + "in_proj_bias")

	        # mutate only after the iteration over state_dict has finished
	        for k in keys_to_remove:
	            del state_dict[k]

	        for key, value in items_to_add.items():
	            state_dict[key] = value



	def apply_chunking_to_forward(
	    forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors
	) -> torch.Tensor:
	    """
	    This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the dimension
	    `chunk_dim`. It then applies a layer `forward_fn` to each chunk independently to save memory.

	    If the `forward_fn` is independent across the `chunk_dim` this function will yield the same result as directly
	    applying `forward_fn` to `input_tensors`.

	    Args:
	        forward_fn (`Callable[..., torch.Tensor]`):
	            The forward function of the model.
	        chunk_size (`int`):
	            The chunk size of a chunked tensor: `num_chunks = len(input_tensors[0]) / chunk_size`.
	            A value of `0` (or less) disables chunking.
	        chunk_dim (`int`):
	            The dimension over which the `input_tensors` should be chunked.
	        input_tensors (`Tuple[torch.Tensor]`):
	            The input tensors of `forward_fn` which will be chunked

	    Returns:
	        `torch.Tensor`: A tensor with the same shape as the `forward_fn` would have given if applied`.

	    Raises:
	        AssertionError: if no input tensor is given.
	        ValueError: if the number of input tensors does not match the number of parameters of `forward_fn`, if
	            the inputs differ in size along `chunk_dim`, or if that size is not a multiple of `chunk_size`.

	    Examples:

	    ```python
	    # rename the usual forward() fn to forward_chunk()
	    def forward_chunk(self, hidden_states):
	        hidden_states = self.decoder(hidden_states)
	        return hidden_states


	    # implement a chunked forward function
	    def forward(self, hidden_states):
	        return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states)
	    ```"""
	    assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors"

	    # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility
	    num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters)
	    if num_args_in_forward_chunk_fn != len(input_tensors):
	        raise ValueError(
	            f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input "
	            "tensors are given"
	        )

	    if chunk_size > 0:
	        tensor_shape = input_tensors[0].shape[chunk_dim]
	        for input_tensor in input_tensors:
	            if input_tensor.shape[chunk_dim] != tensor_shape:
	                raise ValueError(
	                    f"All input tenors have to be of the same shape: {tensor_shape}, "
	                    f"found shape {input_tensor.shape[chunk_dim]}"
	                )

	        if input_tensors[0].shape[chunk_dim] % chunk_size != 0:
	            raise ValueError(
	                f"The dimension to be chunked {input_tensors[0].shape[chunk_dim]} has to be a multiple of the chunk "
	                f"size {chunk_size}"
	            )

	        num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size

	        # chunk input tensor into tuples: one tuple of chunks per input tensor
	        input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors)
	        # apply forward fn to every tuple. zip(*...) regroups the chunks so
	        # that each group holds the i-th chunk of every input tensor, and the
	        # group is unpacked into positional arguments. (Without the two
	        # unpacking stars forward_fn received a tuple of tuples instead of
	        # tensors.)
	        output_chunks = tuple(forward_fn(*chunk_group) for chunk_group in zip(*input_tensors_chunks))
	        # concatenate output at same dimension
	        return torch.cat(output_chunks, dim=chunk_dim)

	    return forward_fn(*input_tensors)