feat: added model

36a67ca 23 days ago

No virus

17.2 kB

	import copy
	import math

	import numpy as np
	import scipy
	import torch
	from torch import nn
	from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
	from torch.nn import functional as F
	from torch.nn.utils import remove_weight_norm, weight_norm

	from tools import commons
	from tools.commons import get_padding, init_weights
	from tools.transforms import piecewise_rational_quadratic_transform


	# NOTE(review): by its name this is the negative slope for leaky-ReLU
	# activations. No definition visible in this part of the file reads it, so
	# confirm it is used elsewhere before changing or removing it.
	LRELU_SLOPE = 0.1


	class LayerNorm(nn.Module):
	    """Layer normalisation applied over dim 1 of the input.

	    ``F.layer_norm`` normalises the trailing dimension, so ``forward`` swaps
	    dim 1 with the last dim, normalises, and swaps the two dims back.

	    Args:
	        channels: Size of dim 1 of the tensors given to ``forward``.
	        eps: Stability term forwarded to ``F.layer_norm``.
	    """

	    def __init__(self, channels, eps=1e-5):
	        super().__init__()
	        self.channels = channels
	        self.eps = eps

	        # Learnable scale (initialised to 1) and shift (initialised to 0).
	        self.gamma = nn.Parameter(torch.ones(channels))
	        self.beta = nn.Parameter(torch.zeros(channels))

	    def forward(self, x):
	        """Return ``x`` normalised over dim 1; the shape is unchanged."""
	        channels_last = x.transpose(1, -1).contiguous()
	        normed = F.layer_norm(
	            channels_last,
	            (self.channels,),
	            self.gamma,
	            self.beta,
	            self.eps,
	        )
	        return normed.transpose(1, -1).contiguous()


	class ConvReluNorm(nn.Module):
	    """Stack of Conv1d -> LayerNorm -> ReLU -> Dropout layers with a residual.

	    The first convolution maps ``in_channels`` to ``hidden_channels``; the
	    remaining ``n_layers - 1`` convolutions keep ``hidden_channels``. A final
	    1x1 convolution projects to ``out_channels`` and its result is added to
	    the original input. That projection is zero-initialised, so a freshly
	    constructed module returns ``x * x_mask``.

	    Args:
	        in_channels: Channel count of the input to the first convolution.
	        hidden_channels: Channel count used by every layer in the stack.
	        out_channels: Channel count of the projection; it is summed with the
	            input in ``forward``, so it has to be addable to ``in_channels``.
	        kernel_size: Kernel size of every convolution in the stack (padding
	            is ``kernel_size // 2``).
	        n_layers: Number of conv/norm layers; must be at least 1.
	        p_dropout: Dropout probability applied after each ReLU.

	    Raises:
	        AssertionError: If ``n_layers`` is not positive.
	    """

	    def __init__(
	        self,
	        in_channels,
	        hidden_channels,
	        out_channels,
	        kernel_size,
	        n_layers,
	        p_dropout,
	    ):
	        super().__init__()
	        self.in_channels = in_channels
	        self.hidden_channels = hidden_channels
	        self.out_channels = out_channels
	        self.kernel_size = kernel_size
	        self.n_layers = n_layers
	        self.p_dropout = p_dropout
	        # A single layer is a valid configuration (no extra hidden layers are
	        # added below), which is also what the message has always stated; the
	        # previous ``> 1`` check rejected it.
	        assert n_layers > 0, "Number of layers should be larger than 0."

	        self.conv_layers = nn.ModuleList()
	        self.norm_layers = nn.ModuleList()
	        # Only the first layer sees ``in_channels``; the rest are hidden->hidden.
	        layer_inputs = [in_channels] + [hidden_channels] * (n_layers - 1)
	        for layer_in in layer_inputs:
	            self.conv_layers.append(
	                nn.Conv1d(
	                    layer_in,
	                    hidden_channels,
	                    kernel_size,
	                    padding=kernel_size // 2,
	                )
	            )
	            self.norm_layers.append(LayerNorm(hidden_channels))
	        self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))

	        # Zero-initialised projection: the residual branch starts as a no-op.
	        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
	        self.proj.weight.data.zero_()
	        self.proj.bias.data.zero_()

	    def forward(self, x, x_mask):
	        """Run the stack and return ``(x + proj(stack(x))) * x_mask``.

	        Args:
	            x: Input tensor fed to the first ``Conv1d``.
	            x_mask: Mask multiplied into the input of every convolution and
	                into the final result.
	        """
	        x_org = x
	        for conv, norm in zip(self.conv_layers, self.norm_layers):
	            x = conv(x * x_mask)
	            x = norm(x)
	            x = self.relu_drop(x)
	        x = x_org + self.proj(x)
	        return x * x_mask


	class DDSConv(nn.Module):
	    """Dilated and depth-separable convolution stack.

	    Layer ``i`` applies a depthwise ``Conv1d`` (``groups=channels``) with
	    dilation ``kernel_size ** i``, a GELU, a 1x1 ``Conv1d``, a ``LayerNorm``,
	    another GELU and dropout; the result is added back onto the running
	    tensor.

	    Args:
	        channels: Channel count, kept constant through the stack.
	        kernel_size: Kernel size of the depthwise convolutions; also the base
	            of the per-layer dilation.
	        n_layers: Number of layers in the stack.
	        p_dropout: Dropout probability applied at the end of each layer.
	    """

	    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
	        super().__init__()
	        self.channels = channels
	        self.kernel_size = kernel_size
	        self.n_layers = n_layers
	        self.p_dropout = p_dropout

	        self.drop = nn.Dropout(p_dropout)
	        self.convs_sep = nn.ModuleList()
	        self.convs_1x1 = nn.ModuleList()
	        self.norms_2 = nn.ModuleList()
	        for layer_idx in range(n_layers):
	            dilation = kernel_size**layer_idx
	            # Padding that keeps the length unchanged for this dilation.
	            padding = (kernel_size * dilation - dilation) // 2
	            depthwise = nn.Conv1d(
	                channels,
	                channels,
	                kernel_size,
	                groups=channels,
	                dilation=dilation,
	                padding=padding,
	            )
	            self.convs_sep.append(depthwise)
	            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
	            self.norms_2.append(LayerNorm(channels))

	    def forward(self, x, x_mask, g=None):
	        """Return the masked output of the residual stack.

	        Args:
	            x: Input tensor.
	            x_mask: Mask multiplied into the input of every depthwise
	                convolution and into the final result.
	            g: Optional tensor added to ``x`` before the first layer.
	        """
	        if g is not None:
	            x = x + g
	        for i in range(self.n_layers):
	            branch = self.convs_sep[i](x * x_mask)
	            branch = self.convs_1x1[i](F.gelu(branch))
	            branch = F.gelu(self.norms_2[i](branch))
	            x = x + self.drop(branch)
	        return x * x_mask


	class WN(torch.nn.Module):
	    """Stack of weight-normalised dilated convolutions with gated activations.

	    Layer ``i`` runs a ``Conv1d`` with dilation ``dilation_rate ** i`` that
	    doubles the channel count, combines it with an optional conditioning
	    slice through ``commons.fused_add_tanh_sigmoid_multiply``, and feeds the
	    result to a 1x1 "res/skip" convolution. For every layer but the last,
	    the first ``hidden_channels`` outputs update the running tensor and the
	    rest are accumulated into the output; the last layer only contributes to
	    the output.

	    Args:
	        hidden_channels: Channel count of the input and output.
	        kernel_size: Odd kernel size of the dilated convolutions.
	        dilation_rate: Base of the per-layer dilation.
	        n_layers: Number of layers.
	        gin_channels: Channel count of the conditioning input ``g``; ``0``
	            means no conditioning layer is created.
	        p_dropout: Dropout probability applied to the gated activations.

	    Raises:
	        AssertionError: If ``kernel_size`` is even.
	    """

	    def __init__(
	        self,
	        hidden_channels,
	        kernel_size,
	        dilation_rate,
	        n_layers,
	        gin_channels=0,
	        p_dropout=0,
	    ):
	        super().__init__()
	        assert kernel_size % 2 == 1
	        self.hidden_channels = hidden_channels
	        # Stored as the plain integer; it used to be wrapped in a one-element
	        # tuple by a stray trailing comma.
	        self.kernel_size = kernel_size
	        self.dilation_rate = dilation_rate
	        self.n_layers = n_layers
	        self.gin_channels = gin_channels
	        self.p_dropout = p_dropout

	        self.in_layers = torch.nn.ModuleList()
	        self.res_skip_layers = torch.nn.ModuleList()
	        self.drop = nn.Dropout(p_dropout)

	        if gin_channels != 0:
	            # One 1x1 conv produces the conditioning for all layers at once;
	            # forward() slices out 2 * hidden_channels per layer.
	            cond_layer = torch.nn.Conv1d(
	                gin_channels, 2 * hidden_channels * n_layers, 1
	            )
	            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")

	        for i in range(n_layers):
	            dilation = dilation_rate**i
	            padding = int((kernel_size * dilation - dilation) / 2)

	            in_layer = Conv1d(
	                hidden_channels,
	                2 * hidden_channels,
	                kernel_size,
	                padding=padding,
	                dilation=dilation,
	            )
	            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
	            self.in_layers.append(in_layer)

	            # The last layer has no residual half, only the skip output.
	            if i < n_layers - 1:
	                res_skip_channels = 2 * hidden_channels
	            else:
	                res_skip_channels = hidden_channels

	            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
	            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
	            self.res_skip_layers.append(res_skip_layer)

	    def forward(self, x, x_mask, g=None, **kwargs):
	        """Return the masked sum of the skip outputs of all layers.

	        Args:
	            x: Input tensor with ``hidden_channels`` channels on dim 1.
	            x_mask: Mask multiplied into the running tensor after each
	                residual update and into the final output.
	            g: Optional conditioning tensor; when given it is passed through
	                ``cond_layer`` (only created when ``gin_channels != 0``).
	            **kwargs: Accepted and ignored.
	        """
	        output = torch.zeros_like(x)
	        # Passed through to the fused activation helper.
	        # NOTE(review): presumably tells the helper where to split the
	        # tanh/sigmoid halves -- confirm against tools.commons.
	        n_channels_tensor = torch.IntTensor([self.hidden_channels])

	        if g is not None:
	            g = self.cond_layer(g)

	        for i in range(self.n_layers):
	            x_in = self.in_layers[i](x)
	            if g is not None:
	                cond_offset = i * 2 * self.hidden_channels
	                g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
	            else:
	                g_l = torch.zeros_like(x_in)

	            acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
	            acts = self.drop(acts)

	            res_skip_acts = self.res_skip_layers[i](acts)
	            if i < self.n_layers - 1:
	                res_acts = res_skip_acts[:, : self.hidden_channels, :]
	                x = (x + res_acts) * x_mask
	                output = output + res_skip_acts[:, self.hidden_channels :, :]
	            else:
	                output = output + res_skip_acts
	        return output * x_mask

	    def remove_weight_norm(self):
	        """Strip the weight-norm reparametrisation from every convolution."""
	        if self.gin_channels != 0:
	            torch.nn.utils.remove_weight_norm(self.cond_layer)
	        for layer in self.in_layers:
	            torch.nn.utils.remove_weight_norm(layer)
	        for layer in self.res_skip_layers:
	            torch.nn.utils.remove_weight_norm(layer)


	class Log(nn.Module):
	    """Flow step applying an element-wise logarithm (exponential in reverse)."""

	    def forward(self, x, x_mask, reverse=False, **kwargs):
	        """Apply the masked log transform, or its inverse.

	        Args:
	            x: Input tensor; values are clamped to at least ``1e-5`` before
	                the logarithm is taken.
	            x_mask: Mask multiplied into the result.
	            reverse: When true, return only ``exp(x) * x_mask``.
	            **kwargs: Accepted and ignored.

	        Returns:
	            ``(y, logdet)`` in the forward direction, where ``logdet`` is the
	            sum of ``-y`` over dims 1 and 2; a single tensor in reverse.
	        """
	        if reverse:
	            return torch.exp(x) * x_mask

	        clamped = torch.clamp_min(x, 1e-5)
	        y = torch.log(clamped) * x_mask
	        return y, torch.sum(-y, [1, 2])


	class Flip(nn.Module):
	    """Flow step that reverses the order of dim 1 of its input."""

	    def forward(self, x, args=None, reverse=False, *extra_args, **kwargs):
	        """Flip ``x`` along dim 1.

	        The signature accepts the same call shapes as the other flow layers
	        in this module (``layer(x, x_mask, reverse=..., **extras)``). It
	        previously rejected any keyword other than ``reverse``, so a call
	        such as ``flip(x, x_mask, g=g, reverse=True)`` raised ``TypeError``.

	        Args:
	            x: Tensor to flip.
	            args: Ignored. Occupies the positional slot in which sibling
	                layers receive ``x_mask``.
	            reverse: When true, return only the flipped tensor.
	            *extra_args: Accepted and ignored.
	            **kwargs: Accepted and ignored (e.g. a conditioning ``g``).

	        Returns:
	            ``(flipped, logdet)`` in the forward direction, where ``logdet``
	            is a zero tensor with one entry per element of dim 0 and the
	            dtype/device of ``x``; just ``flipped`` in reverse.
	        """
	        x = torch.flip(x, [1])
	        if reverse:
	            return x
	        logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
	        return x, logdet


	class ElementwiseAffine(nn.Module):
	    """Flow step applying a learnable per-channel shift and log-scale.

	    Args:
	        channels: Number of channels; both parameters have shape
	            ``(channels, 1)`` and start at zero.
	    """

	    def __init__(self, channels):
	        super().__init__()
	        self.channels = channels
	        self.m = nn.Parameter(torch.zeros(channels, 1))
	        self.logs = nn.Parameter(torch.zeros(channels, 1))

	    def forward(self, x, x_mask, reverse=False, **kwargs):
	        """Apply ``m + exp(logs) * x`` (masked), or undo it.

	        Args:
	            x: Input tensor.
	            x_mask: Mask multiplied into the result and into the log-scale
	                before it is summed.
	            reverse: When true, return only ``(x - m) * exp(-logs) * x_mask``.
	            **kwargs: Accepted and ignored.

	        Returns:
	            ``(y, logdet)`` in the forward direction, with ``logdet`` the sum
	            of ``logs * x_mask`` over dims 1 and 2; a single tensor in
	            reverse.
	        """
	        if reverse:
	            return (x - self.m) * torch.exp(-self.logs) * x_mask

	        y = (self.m + torch.exp(self.logs) * x) * x_mask
	        logdet = torch.sum(self.logs * x_mask, [1, 2])
	        return y, logdet


	class ResidualCouplingLayer(nn.Module):
	    """Affine coupling layer whose statistics come from a ``WN`` encoder.

	    The input is split in half along dim 1. The first half passes through
	    unchanged and drives ``pre -> WN -> post`` to produce a shift (and,
	    unless ``mean_only``, a log-scale) that transforms the second half.
	    ``post`` is zero-initialised.

	    Args:
	        channels: Channel count of the input; must be even.
	        hidden_channels: Channel count inside the encoder.
	        kernel_size: Forwarded to ``WN``.
	        dilation_rate: Forwarded to ``WN``.
	        n_layers: Forwarded to ``WN``.
	        p_dropout: Forwarded to ``WN``.
	        gin_channels: Forwarded to ``WN``.
	        mean_only: When true only a shift is predicted and the log-scale is
	            fixed to zero.

	    Raises:
	        AssertionError: If ``channels`` is odd.
	    """

	    def __init__(
	        self,
	        channels,
	        hidden_channels,
	        kernel_size,
	        dilation_rate,
	        n_layers,
	        p_dropout=0,
	        gin_channels=0,
	        mean_only=False,
	    ):
	        assert channels % 2 == 0, "channels should be divisible by 2"
	        super().__init__()
	        self.channels = channels
	        self.hidden_channels = hidden_channels
	        self.kernel_size = kernel_size
	        self.dilation_rate = dilation_rate
	        self.n_layers = n_layers
	        self.half_channels = channels // 2
	        self.mean_only = mean_only

	        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
	        self.enc = WN(
	            hidden_channels,
	            kernel_size,
	            dilation_rate,
	            n_layers,
	            p_dropout=p_dropout,
	            gin_channels=gin_channels,
	        )
	        # One group of channels for the shift, plus one for the log-scale
	        # unless only the mean is predicted.
	        stats_channels = self.half_channels * (2 - mean_only)
	        self.post = nn.Conv1d(hidden_channels, stats_channels, 1)
	        self.post.weight.data.zero_()
	        self.post.bias.data.zero_()

	    def forward(self, x, x_mask, g=None, reverse=False):
	        """Transform the second half of ``x`` conditioned on the first half.

	        Args:
	            x: Input tensor with ``channels`` channels on dim 1.
	            x_mask: Mask multiplied into intermediate and transformed values.
	            g: Optional conditioning tensor forwarded to the encoder.
	            reverse: When true, apply the inverse transform.

	        Returns:
	            ``(x, logdet)`` in the forward direction, with ``logdet`` the sum
	            of the log-scale over dims 1 and 2; just ``x`` in reverse.
	        """
	        halves = [self.half_channels] * 2
	        x0, x1 = torch.split(x, halves, 1)

	        hidden = self.pre(x0) * x_mask
	        hidden = self.enc(hidden, x_mask, g=g)
	        stats = self.post(hidden) * x_mask

	        if self.mean_only:
	            m = stats
	            logs = torch.zeros_like(m)
	        else:
	            m, logs = torch.split(stats, halves, 1)

	        if reverse:
	            x1 = (x1 - m) * torch.exp(-logs) * x_mask
	            return torch.cat([x0, x1], 1)

	        x1 = m + x1 * torch.exp(logs) * x_mask
	        logdet = torch.sum(logs, [1, 2])
	        return torch.cat([x0, x1], 1), logdet


	class ConvFlow(nn.Module):
	    """Coupling layer using a piecewise rational-quadratic transform.

	    The first half of the channels is run through ``pre -> DDSConv -> proj``
	    to produce, per element of the second half, ``num_bins`` unnormalised
	    widths, ``num_bins`` unnormalised heights and ``num_bins - 1``
	    unnormalised derivatives. ``proj`` is zero-initialised.

	    Args:
	        in_channels: Channel count of the input; split in two halves.
	        filter_channels: Channel count inside the ``DDSConv`` stack.
	        kernel_size: Forwarded to ``DDSConv``.
	        n_layers: Forwarded to ``DDSConv``.
	        num_bins: Number of bins of the transform.
	        tail_bound: Forwarded to the transform as ``tail_bound``.
	    """

	    def __init__(
	        self,
	        in_channels,
	        filter_channels,
	        kernel_size,
	        n_layers,
	        num_bins=10,
	        tail_bound=5.0,
	    ):
	        super().__init__()
	        self.in_channels = in_channels
	        self.filter_channels = filter_channels
	        self.kernel_size = kernel_size
	        self.n_layers = n_layers
	        self.num_bins = num_bins
	        self.tail_bound = tail_bound
	        self.half_channels = in_channels // 2

	        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
	        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
	        # widths + heights + derivatives = 3 * num_bins - 1 values per channel.
	        params_per_channel = num_bins * 3 - 1
	        self.proj = nn.Conv1d(
	            filter_channels, self.half_channels * params_per_channel, 1
	        )
	        self.proj.weight.data.zero_()
	        self.proj.bias.data.zero_()

	    def forward(self, x, x_mask, g=None, reverse=False):
	        """Transform the second half of ``x`` conditioned on the first half.

	        Args:
	            x: Input tensor with ``2 * half_channels`` channels on dim 1.
	            x_mask: Mask multiplied into the projected parameters, the
	                output and the log-determinant terms.
	            g: Optional conditioning tensor forwarded to ``DDSConv``.
	            reverse: When true, run the transform with ``inverse=True``.

	        Returns:
	            ``(x, logdet)`` in the forward direction; just ``x`` in reverse.
	        """
	        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)

	        params = self.pre(x0)
	        params = self.convs(params, x_mask, g=g)
	        params = self.proj(params) * x_mask

	        batch, chans, steps = x0.shape
	        # [b, c * k, t] -> [b, c, k, t] -> [b, c, t, k], k = 3 * num_bins - 1
	        params = params.reshape(batch, chans, -1, steps)
	        params = params.permute(0, 1, 3, 2).contiguous()

	        bins = self.num_bins
	        scale = math.sqrt(self.filter_channels)
	        unnormalized_widths = params[..., :bins] / scale
	        unnormalized_heights = params[..., bins : 2 * bins] / scale
	        unnormalized_derivatives = params[..., 2 * bins :]

	        x1, logabsdet = piecewise_rational_quadratic_transform(
	            x1,
	            unnormalized_widths,
	            unnormalized_heights,
	            unnormalized_derivatives,
	            inverse=reverse,
	            tails="linear",
	            tail_bound=self.tail_bound,
	        )

	        x = torch.cat([x0, x1], 1) * x_mask
	        logdet = torch.sum(logabsdet * x_mask, [1, 2])
	        if reverse:
	            return x
	        return x, logdet


	class LinearNorm(nn.Module):
	    """``nn.Linear`` with optional spectral normalisation.

	    NOTE(review): ``LinearNorm`` is defined a second time, with the same
	    body, further down in this module; that later definition is the one the
	    name refers to after import.

	    Args:
	        in_channels: Input feature size of the linear layer.
	        out_channels: Output feature size of the linear layer.
	        bias: Whether the linear layer has a bias term.
	        spectral_norm: Wrap the layer in ``nn.utils.spectral_norm`` when true.
	    """

	    def __init__(
	        self,
	        in_channels,
	        out_channels,
	        bias=True,
	        spectral_norm=False,
	    ):
	        super().__init__()
	        linear = nn.Linear(in_channels, out_channels, bias)
	        self.fc = nn.utils.spectral_norm(linear) if spectral_norm else linear

	    def forward(self, input):
	        """Return the linear projection of ``input``."""
	        return self.fc(input)


	class Mish(nn.Module):
	    """Mish activation: ``x * tanh(softplus(x))``. Holds no parameters."""

	    def forward(self, x):
	        """Return the element-wise Mish activation of ``x``."""
	        gate = torch.tanh(F.softplus(x))
	        return x * gate


	class LinearNorm(nn.Module):
	    """Fully connected layer, optionally spectrally normalised.

	    NOTE(review): this is the second definition of ``LinearNorm`` in this
	    module; it has the same body as the earlier one and replaces it at
	    import time.

	    Args:
	        in_channels: Input feature size.
	        out_channels: Output feature size.
	        bias: Whether to include a bias term.
	        spectral_norm: Apply ``nn.utils.spectral_norm`` to the layer.
	    """

	    def __init__(
	        self,
	        in_channels,
	        out_channels,
	        bias=True,
	        spectral_norm=False,
	    ):
	        super().__init__()
	        fc = nn.Linear(in_channels, out_channels, bias)
	        if spectral_norm:
	            fc = nn.utils.spectral_norm(fc)
	        self.fc = fc

	    def forward(self, input):
	        """Apply the (possibly spectrally normalised) linear layer."""
	        projected = self.fc(input)
	        return projected


	class ConvNorm(nn.Module):
	    """``Conv1d`` with automatic padding and optional spectral normalisation.

	    Args:
	        in_channels: Input channel count.
	        out_channels: Output channel count.
	        kernel_size: Convolution kernel size.
	        stride: Convolution stride.
	        padding: Explicit padding; when ``None`` it is derived from
	            ``dilation`` and ``kernel_size`` (which must then be odd).
	        dilation: Convolution dilation.
	        bias: Whether the convolution has a bias term.
	        spectral_norm: Wrap the convolution in ``nn.utils.spectral_norm``.

	    Raises:
	        AssertionError: If ``padding`` is ``None`` and ``kernel_size`` is even.
	    """

	    def __init__(
	        self,
	        in_channels,
	        out_channels,
	        kernel_size=1,
	        stride=1,
	        padding=None,
	        dilation=1,
	        bias=True,
	        spectral_norm=False,
	    ):
	        super().__init__()

	        if padding is None:
	            assert kernel_size % 2 == 1
	            padding = int(dilation * (kernel_size - 1) / 2)

	        conv = torch.nn.Conv1d(
	            in_channels,
	            out_channels,
	            kernel_size=kernel_size,
	            stride=stride,
	            padding=padding,
	            dilation=dilation,
	            bias=bias,
	        )
	        self.conv = nn.utils.spectral_norm(conv) if spectral_norm else conv

	    def forward(self, input):
	        """Return the convolution of ``input``."""
	        return self.conv(input)


	class MultiHeadAttention(nn.Module):
	    """Multi-head self-attention with a residual connection.

	    Args:
	        n_head: Number of attention heads.
	        d_model: Feature size of the input and output.
	        d_k: Per-head size of queries and keys.
	        d_v: Per-head size of values.
	        dropout: Dropout probability, used both inside the attention and on
	            the output projection.
	        spectral_norm: Wrap all four linear layers in
	            ``nn.utils.spectral_norm`` when true.
	    """

	    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.0, spectral_norm=False):
	        super().__init__()

	        self.n_head = n_head
	        self.d_k = d_k
	        self.d_v = d_v

	        self.w_qs = nn.Linear(d_model, n_head * d_k)
	        self.w_ks = nn.Linear(d_model, n_head * d_k)
	        self.w_vs = nn.Linear(d_model, n_head * d_v)

	        # The attention scores are divided by sqrt(d_model).
	        self.attention = ScaledDotProductAttention(
	            temperature=np.power(d_model, 0.5), dropout=dropout
	        )

	        self.fc = nn.Linear(n_head * d_v, d_model)
	        self.dropout = nn.Dropout(dropout)

	        if spectral_norm:
	            for attr in ("w_qs", "w_ks", "w_vs", "fc"):
	                setattr(self, attr, nn.utils.spectral_norm(getattr(self, attr)))

	    def forward(self, x, mask=None):
	        """Attend over ``x`` and add the result to ``x``.

	        Args:
	            x: Rank-3 tensor ``(batch, length, d_model)``.
	            mask: Optional mask, tiled ``n_head`` times along dim 0 and
	                handed to the attention module.

	        Returns:
	            ``(output, attn)``: the residual output, shaped like ``x``, and
	            the attention weights returned by the attention module.
	        """
	        n_head, d_k, d_v = self.n_head, self.d_k, self.d_v
	        sz_b, len_x, _ = x.size()

	        def to_heads(projected, d_head):
	            # (b, len, n * d) -> (n * b, len, d)
	            per_head = projected.view(sz_b, len_x, n_head, d_head)
	            return per_head.permute(2, 0, 1, 3).contiguous().view(-1, len_x, d_head)

	        q = to_heads(self.w_qs(x), d_k)
	        k = to_heads(self.w_ks(x), d_k)
	        v = to_heads(self.w_vs(x), d_v)

	        slf_mask = None if mask is None else mask.repeat(n_head, 1, 1)
	        heads_out, attn = self.attention(q, k, v, mask=slf_mask)

	        # (n * b, len, dv) -> (b, len, n * dv)
	        heads_out = heads_out.view(n_head, sz_b, len_x, d_v)
	        merged = heads_out.permute(1, 2, 0, 3).contiguous().view(sz_b, len_x, -1)

	        projected = self.fc(merged)
	        return self.dropout(projected) + x, attn


	class ScaledDotProductAttention(nn.Module):
	    """Scaled dot-product attention over batched rank-3 tensors.

	    Args:
	        temperature: Value the raw attention scores are divided by.
	        dropout: Dropout probability applied to the attention weights.
	    """

	    def __init__(self, temperature, dropout):
	        super().__init__()
	        self.temperature = temperature
	        self.softmax = nn.Softmax(dim=2)
	        self.dropout = nn.Dropout(dropout)

	    def forward(self, q, k, v, mask=None):
	        """Compute attention weights and the weighted sum of ``v``.

	        Args:
	            q: Queries, batched for ``torch.bmm``.
	            k: Keys, batched for ``torch.bmm``.
	            v: Values, batched for ``torch.bmm``.
	            mask: Optional boolean mask; positions where it is true are set
	                to ``-inf`` before the softmax.

	        Returns:
	            ``(output, attn)`` where ``attn`` holds the softmax weights
	            before dropout and ``output`` uses the weights after dropout.
	        """
	        keys_t = k.transpose(1, 2).contiguous()
	        scores = torch.bmm(q, keys_t) / self.temperature

	        if mask is not None:
	            scores = scores.masked_fill(mask, -np.inf)

	        attn = self.softmax(scores)
	        output = torch.bmm(self.dropout(attn), v)
	        return output, attn


	class Conv1dGLU(nn.Module):
	    """Conv1d followed by a gated linear unit, with a residual connection.

	    See https://arxiv.org/abs/1612.08083 for the GLU.

	    Args:
	        in_channels: Channel count of the input.
	        out_channels: Channel count of the gated output; the convolution
	            produces twice as many channels, split into value and gate.
	        kernel_size: Kernel size of the convolution.
	        dropout: Dropout probability applied to the gated output before it is
	            added to the input.
	    """

	    def __init__(self, in_channels, out_channels, kernel_size, dropout):
	        super().__init__()
	        self.out_channels = out_channels
	        self.conv1 = ConvNorm(in_channels, 2 * out_channels, kernel_size=kernel_size)
	        self.dropout = nn.Dropout(dropout)

	    def forward(self, x):
	        """Return ``x + dropout(value * sigmoid(gate))``."""
	        conv_out = self.conv1(x)
	        value, gate = torch.split(
	            conv_out, split_size_or_sections=self.out_channels, dim=1
	        )
	        gated = value * torch.sigmoid(gate)
	        return x + self.dropout(gated)