Spaces:

KdaiP
/

StableTTS_en-demo

Running

App Files Files Community

StableTTS_en-demo / vocos_pytorch /models /discriminator.py

KdaiP

Upload 238 files

d358e26 verified 6 months ago

raw

history blame contribute delete

5.97 kB

	from typing import List, Tuple

	import torch
	from torch import nn
	from torch import Tensor
	from torch.nn import Conv2d
	from torch.nn.utils import weight_norm
	from torchaudio.transforms import Spectrogram


	class MultiPeriodDiscriminator(nn.Module):
	    """Bank of ``DiscriminatorP`` modules, one per entry of ``periods``.

	    Args:
	        periods: Period handed to each ``DiscriminatorP``; one sub-discriminator
	            is created per value, in the given order.
	    """

	    def __init__(self, periods: Tuple[int, ...] = (2, 3, 5, 7, 11)):
	        super().__init__()
	        per_period = [DiscriminatorP(period=p) for p in periods]
	        self.discriminators = nn.ModuleList(per_period)

	    def forward(self, y: Tensor, y_hat: Tensor):
	        """Apply every sub-discriminator to ``y`` and to ``y_hat``.

	        Args:
	            y: First input (presumably the reference signal -- confirm with caller).
	            y_hat: Second input (presumably the generated signal -- confirm with caller).

	        Returns:
	            Four lists with one entry per sub-discriminator, in this order:
	            outputs for ``y``, outputs for ``y_hat``, feature maps for ``y``,
	            feature maps for ``y_hat``.
	        """
	        outs_real, outs_gen = [], []
	        feats_real, feats_gen = [], []

	        for disc in self.discriminators:
	            # Each sub-discriminator returns an (output, feature_maps) pair.
	            out_r, feat_r = disc(y)
	            out_g, feat_g = disc(y_hat)
	            outs_real.append(out_r)
	            feats_real.append(feat_r)
	            outs_gen.append(out_g)
	            feats_gen.append(feat_g)

	        return outs_real, outs_gen, feats_real, feats_gen


	class DiscriminatorP(nn.Module):
	    """Discriminator that folds a 1-D signal into rows of ``period`` samples.

	    The folded input goes through five weight-normalised 2-D convolutions whose
	    kernels span only the row axis, followed by a single-channel output
	    convolution.

	    Args:
	        period: Number of samples per row of the folded 2-D view.
	        in_channels: Channel count expected by the first convolution.
	        kernel_size: Kernel extent along the row axis for the five main layers.
	        stride: Row-axis stride of the first four layers (the fifth uses 1).
	        lrelu_slope: Negative slope of the leaky ReLU after each main layer.
	    """

	    def __init__(
	        self,
	        period: int,
	        in_channels: int = 1,
	        kernel_size: int = 5,
	        stride: int = 3,
	        lrelu_slope: float = 0.1,
	    ):
	        super().__init__()
	        self.period = period

	        # Channel progression and per-layer row stride of the main stack.
	        widths = (in_channels, 32, 128, 512, 1024, 1024)
	        row_strides = (stride, stride, stride, stride, 1)
	        kernel = (kernel_size, 1)
	        same_pad = (kernel_size // 2, 0)
	        self.convs = nn.ModuleList(
	            [
	                weight_norm(Conv2d(c_in, c_out, kernel, (row_stride, 1), padding=same_pad))
	                for c_in, c_out, row_stride in zip(widths[:-1], widths[1:], row_strides)
	            ]
	        )

	        self.conv_post = weight_norm(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
	        self.lrelu_slope = lrelu_slope

	    def forward(self, x: Tensor) -> Tuple[Tensor, List[Tensor]]:
	        """Score ``x`` and collect intermediate activations.

	        Args:
	            x: Three-dimensional tensor unpacked as (batch, channels, time).

	        Returns:
	            The output of ``conv_post`` flattened from dim 1 onward, and a list
	            holding the activations of every main layer except the first,
	            followed by the unflattened ``conv_post`` output.
	        """
	        batch, channels, length = x.shape

	        # 1d to 2d: the time axis must be a multiple of the period before folding.
	        leftover = length % self.period
	        if leftover != 0:
	            extra = self.period - leftover
	            x = torch.nn.functional.pad(x, (0, extra), "reflect")
	            length = length + extra
	        x = x.view(batch, channels, length // self.period, self.period)

	        features = []
	        for index, conv in enumerate(self.convs):
	            x = torch.nn.functional.leaky_relu(conv(x), self.lrelu_slope)
	            # The first layer's activation is not recorded.
	            if index > 0:
	                features.append(x)

	        x = self.conv_post(x)
	        features.append(x)
	        return torch.flatten(x, 1, -1), features


	class MultiResolutionDiscriminator(nn.Module):
	    """Bank of ``DiscriminatorR`` modules, one per FFT window length."""

	    def __init__(
	        self,
	        fft_sizes: Tuple[int, ...] = (2048, 1024, 512),
	    ):
	        """
	        Multi-Resolution Discriminator module adapted from https://github.com/descriptinc/descript-audio-codec.

	        Args:
	            fft_sizes (tuple[int]): Window lengths, one ``DiscriminatorR`` is built
	                for each. Defaults to (2048, 1024, 512).
	        """
	        super().__init__()
	        per_resolution = [DiscriminatorR(window_length=size) for size in fft_sizes]
	        self.discriminators = nn.ModuleList(per_resolution)

	    def forward(self, y: Tensor, y_hat: Tensor) -> Tuple[List[Tensor], List[Tensor], List[List[Tensor]], List[List[Tensor]]]:
	        """Apply every sub-discriminator to ``y`` and to ``y_hat``.

	        Returns:
	            Four lists with one entry per sub-discriminator, in this order:
	            outputs for ``y``, outputs for ``y_hat``, feature maps for ``y``,
	            feature maps for ``y_hat``.
	        """
	        outs_real, outs_gen = [], []
	        feats_real, feats_gen = [], []

	        for disc in self.discriminators:
	            # Sub-discriminators are called with the input as keyword ``x``.
	            out_r, feat_r = disc(x=y)
	            out_g, feat_g = disc(x=y_hat)
	            outs_real.append(out_r)
	            feats_real.append(feat_r)
	            outs_gen.append(out_g)
	            feats_gen.append(feat_g)

	        return outs_real, outs_gen, feats_real, feats_gen


	class DiscriminatorR(nn.Module):
	    """Discriminator operating on frequency bands of a spectrogram.

	    The input is transformed by ``Spectrogram`` (built with ``power=None``),
	    split along the frequency axis into ``bands``, and each band is processed
	    by its own stack of weight-normalised 2-D convolutions. The per-band
	    results are concatenated along the last axis and fed to ``conv_post``.

	    Args:
	        window_length: Used as both ``n_fft`` and ``win_length`` of the transform.
	        channels: Channel width of every convolution in the band stacks.
	        hop_factor: Hop length as a fraction of ``window_length``.
	        bands: (low, high) fractions of the ``window_length // 2 + 1`` frequency
	            bins delimiting each band.
	    """

	    def __init__(
	        self,
	        window_length: int,
	        channels: int = 32,
	        hop_factor: float = 0.25,
	        bands: Tuple[Tuple[float, float], ...] = ((0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)),
	    ):
	        super().__init__()
	        self.window_length = window_length
	        self.hop_factor = hop_factor

	        hop_length = int(window_length * hop_factor)
	        self.spec_fn = Spectrogram(
	            n_fft=window_length, hop_length=hop_length, win_length=window_length, power=None
	        )

	        # Turn fractional band edges into integer frequency-bin indices.
	        n_bins = window_length // 2 + 1
	        self.bands = [(int(edge[0] * n_bins), int(edge[1] * n_bins)) for edge in bands]

	        def build_stack() -> nn.ModuleList:
	            # One entry layer (2 -> channels), three layers with stride 2 on the
	            # last axis, and a final 3x3 layer.
	            layers = [weight_norm(nn.Conv2d(2, channels, (3, 9), (1, 1), padding=(1, 4)))]
	            for _ in range(3):
	                layers.append(weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))))
	            layers.append(weight_norm(nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1))))
	            return nn.ModuleList(layers)

	        self.band_convs = nn.ModuleList([build_stack() for _ in self.bands])

	        self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1)))

	    def spectrogram(self, x):
	        """Return the per-band slices of the normalised input's spectrogram.

	        Dim 1 of ``x`` is squeezed, the mean over the last axis is subtracted,
	        and the result is scaled so its absolute peak is about 0.8 before the
	        transform is applied.
	        """
	        wave = x.squeeze(1)
	        # Remove DC offset
	        wave = wave - wave.mean(dim=-1, keepdim=True)
	        # Peak normalize the volume of input audio
	        peak = wave.abs().max(dim=-1, keepdim=True)[0]
	        wave = 0.8 * wave / (peak + 1e-9)

	        spec = torch.view_as_real(self.spec_fn(wave))
	        spec = spec.permute(0, 3, 2, 1)  # b f t c -> b c t f
	        # Split into bands
	        return [spec[..., low:high] for low, high in self.bands]

	    def forward(self, x: Tensor):
	        """Score ``x`` and collect intermediate activations.

	        Returns:
	            The ``conv_post`` output, and a list holding the activations of
	            every band-stack layer except the first of each stack (band by
	            band), followed by the ``conv_post`` output.
	        """
	        features = []
	        band_outputs = []
	        for band, stack in zip(self.spectrogram(x), self.band_convs):
	            for depth, conv in enumerate(stack):
	                band = torch.nn.functional.leaky_relu(conv(band), 0.1)
	                # The first layer's activation is not recorded.
	                if depth > 0:
	                    features.append(band)
	            band_outputs.append(band)

	        merged = self.conv_post(torch.cat(band_outputs, dim=-1))
	        features.append(merged)

	        return merged, features