import math
from typing import Optional, Tuple

import torch
from torch import nn

from wenet.transformer.attention import MultiHeadedAttention


class MultiHeadedAttentionSANM(MultiHeadedAttention):
"""Multi-Head Attention layer.
Args:
n_head (int): The number of heads.
n_feat (int): The number of features.
dropout_rate (float): Dropout rate.
"""
def __init__(self,
n_head,
in_feat,
n_feat,
dropout_rate,
kernel_size,
sanm_shfit=0):
"""Construct an MultiHeadedAttention object."""
super().__init__(n_head, n_feat, dropout_rate)
# We assume d_v always equals d_k
# self.linear_q = nn.Linear(n_feat, n_feat)
# self.linear_k = nn.Linear(n_feat, n_feat)
# self.linear_v = nn.Linear(n_feat, n_feat)
del self.linear_q, self.linear_k, self.linear_v
self.linear_q_k_v = nn.Linear(in_feat, n_feat * 3)
self.fsmn_block = nn.Conv1d(n_feat,
n_feat,
kernel_size,
stride=1,
padding=0,
groups=n_feat,
bias=False)
        # Asymmetric padding for the FSMN depthwise conv: sanm_shfit > 0 moves
        # extra context to the left (past) side of the kernel.
        self.left_padding = (kernel_size - 1) // 2
if sanm_shfit > 0:
self.left_padding = self.left_padding + sanm_shfit
self.right_padding = kernel_size - 1 - self.left_padding
self.pad_fn = nn.ConstantPad1d((self.left_padding, self.right_padding),
0.0)

    def forward_qkv(
self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
x = query
b, t, _ = x.size()
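        # A single fused projection computes q, k and v in one matmul; the
        # output is split into three n_feat-sized chunks right below.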
q_k_v = self.linear_q_k_v(x)
q, k, v = torch.split(q_k_v, int(self.h * self.d_k), dim=-1)
q = torch.reshape(q, (b, t, self.h, self.d_k)).transpose(
1, 2) # (batch, head, time1, d_k)
k = torch.reshape(k, (b, t, self.h, self.d_k)).transpose(
1, 2) # (batch, head, time2, d_k)
v = torch.reshape(v, (b, t, self.h, self.d_k)).transpose(
1, 2) # (batch, head, time2, d_k)
return q, k, v

    def forward_fsmn(self,
inputs: torch.Tensor,
mask: torch.Tensor,
mask_shfit_chunk: Optional[torch.Tensor] = None):
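        # inputs arrive as (batch, head, time, d_k); fold the heads back into
        # a single feature dimension, i.e. (batch, time, n_feat).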
b, _, t, _ = inputs.size()
inputs = inputs.transpose(1, 2).view(b, t, -1)
if mask.size(2) > 0: # time2 > 0
# TODO(Mddct): make sure mask is right
if mask_shfit_chunk is not None:
mask = mask * mask_shfit_chunk
mask = mask.transpose(1, 2) # [B,T,1]
inputs = inputs * mask
x = inputs.transpose(1, 2)
# x = torch.nn.functional.pad(x, (self.left_padding, self.right_padding),
# value=0.0,
# mode='constant')
x = self.pad_fn(x)
x = self.fsmn_block(x)
x = x.transpose(1, 2)
x += inputs
x = self.dropout(x)
return x * mask

    def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
pos_emb: torch.Tensor = torch.empty(0),
cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
mask_shfit_chunk: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
q, k, v = self.forward_qkv(query, key, value)
if cache.size(0) > 0:
key_cache, value_cache = torch.split(cache,
cache.size(-1) // 2,
dim=-1)
k = torch.cat([key_cache, k], dim=2)
v = torch.cat([value_cache, v], dim=2)
        # NOTE(Mddct): the fsmn memory would also need its own cache, but
        # Paraformer is non-streaming; refactor later if a streaming model
        # becomes available.
new_cache = torch.cat((k, v), dim=-1)
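        # FSMN memory: a depthwise convolution over the value projection,
        # added to the attention output below.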
fsmn_memory = self.forward_fsmn(v,
mask=mask_pad,
mask_shfit_chunk=mask_shfit_chunk)
scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
att = self.forward_attention(v, scores, mask)
return att + fsmn_memory, new_cache


class DummyMultiHeadSANM(MultiHeadedAttentionSANM):
"""A dummy multihead attention for Paraformer befroe cross attention
"""
def __init__(self,
n_head,
in_feat,
n_feat,
dropout_rate,
kernel_size,
sanm_shfit=0):
super().__init__(n_head, in_feat, n_feat, dropout_rate, kernel_size,
sanm_shfit)
del self.linear_q_k_v
del self.linear_out

    def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
pos_emb: torch.Tensor = torch.empty(0),
cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
mask_shfit_chunk: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
query = query * mask_pad.transpose(1, 2)
inputs = query
x = inputs.transpose(1, 2)
x = self.pad_fn(x)
# TODO(Mddct): cache here for future streaming
cache: Optional[torch.Tensor] = None
x = self.fsmn_block(x)
x = x.transpose(1, 2)
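        # If padding/convolution changed the time length, fall back to adding
        # only the last input frame as the residual.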
if x.size(1) != inputs.size(1):
inputs = inputs[:, -1, :]
x = x + inputs
x = self.dropout(x)
x = x * mask_pad.transpose(1, 2)
return x, cache


class MultiHeadAttentionCross(MultiHeadedAttentionSANM):
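    """Cross attention between the decoder hidden states (query) and the
    encoder memory, where key == value, as used in the Paraformer decoder.
    """
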
def __init__(self,
n_head,
in_feat,
n_feat,
dropout_rate,
kernel_size,
sanm_shfit=0,
target_size: Optional[int] = None):
super().__init__(n_head, in_feat, n_feat, dropout_rate, kernel_size,
sanm_shfit)
del self.linear_q_k_v
del self.fsmn_block
self.linear_q = nn.Linear(n_feat, n_feat)
self.linear_k_v = nn.Linear(
n_feat if target_size is None else target_size, n_feat * 2)

    def forward_qkv(
self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
# NOTE(Mddct): here value == key
_ = value
x = query
b = x.size(0)
q = self.linear_q(x)
q_h = torch.reshape(q, (b, -1, self.h, self.d_k)).transpose(
1, 2) # (batch, head, time1, d_k)
k_v = self.linear_k_v(key)
k, v = torch.split(k_v, int(self.h * self.d_k), dim=-1)
k_h = torch.reshape(k, (b, -1, self.h, self.d_k)).transpose(
1, 2) # (batch, head, time2, d_k)
v_h = torch.reshape(v, (b, -1, self.h, self.d_k)).transpose(
1, 2) # (batch, head, time2, d_k)
return q_h, k_h, v_h

    def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
pos_emb: torch.Tensor = torch.empty(0),
cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
mask_shfit_chunk: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
q, k, v = self.forward_qkv(query, key, key)
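        # Scale q by 1/sqrt(d_k) up front; equivalent to dividing the scores.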
q = q * self.d_k**(-0.5)
scores = torch.matmul(q, k.transpose(-2, -1))
# TODO(Mddct): support future streaming paraformer
cache: Optional[torch.Tensor] = None
return self.forward_attention(v, scores, mask), cache
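

# Minimal usage sketch (not part of the original module). The shapes and
# hyper-parameters below are illustrative assumptions, not values taken from
# any Paraformer recipe.
if __name__ == "__main__":
    batch, time1, n_feat, n_head = 2, 16, 256, 4
    x = torch.randn(batch, time1, n_feat)
    mask = torch.ones(batch, 1, time1, dtype=torch.bool)

    # Self attention with FSMN memory.
    self_attn = MultiHeadedAttentionSANM(n_head, n_feat, n_feat,
                                         dropout_rate=0.0, kernel_size=11)
    out, _cache = self_attn(x, x, x, mask=mask, mask_pad=mask)
    print(out.shape)  # torch.Size([2, 16, 256])

    # FSMN-only block applied before cross attention.
    dummy = DummyMultiHeadSANM(n_head, n_feat, n_feat,
                               dropout_rate=0.0, kernel_size=11)
    out, _ = dummy(x, x, x, mask=mask, mask_pad=mask)
    print(out.shape)  # torch.Size([2, 16, 256])

    # Cross attention against an encoder memory of a different length.
    memory = torch.randn(batch, 32, n_feat)
    memory_mask = torch.ones(batch, 1, 32, dtype=torch.bool)
    cross_attn = MultiHeadAttentionCross(n_head, n_feat, n_feat,
                                         dropout_rate=0.0, kernel_size=11)
    out, _ = cross_attn(x, memory, memory, mask=memory_mask)
    print(out.shape)  # torch.Size([2, 16, 256])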