# Source path: clonar-voz/TTS/tts/layers/generic/time_depth_sep_conv.py
# Uploader avatar caption: "Shadhil's picture"
# Commit 9b2107c: voice-clone with single audio sample input
import torch
from torch import nn
class TimeDepthSeparableConv(nn.Module):
    """Residual time-depth separable convolution layer.

    Follows https://arxiv.org/pdf/1904.02619.pdf, which reports competitive
    results with less computation and a smaller memory footprint.

    The layer chains three stages and adds the input back at the end:

    1. a pointwise (kernel size 1) convolution to ``2 * hid_channels``,
       batch norm, then a GLU over the channel axis (back to ``hid_channels``);
    2. a depthwise convolution (``groups=hid_channels``), batch norm, then
       ``x * sigmoid(x)``;
    3. a pointwise convolution to ``out_channels`` followed by batch norm.

    Args:
        in_channels: number of channels of the input.
        hid_channels: number of channels used by the depthwise stage.
        out_channels: number of channels produced by the last pointwise conv.
        kernel_size: kernel width of the depthwise convolution; it is padded
            with ``(kernel_size - 1) // 2`` on each side.
        bias: whether the three convolutions use a bias term.

    Note:
        The residual sum in ``forward`` adds the layer input to the layer
        output, so both must be broadcast-compatible; nothing here checks it.
    """

    def __init__(self, in_channels, hid_channels, out_channels, kernel_size, bias=True):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hid_channels = hid_channels
        self.kernel_size = kernel_size

        # The GLU halves the channel count, so the first projection is twice as wide.
        gated_channels = 2 * hid_channels
        # Symmetric padding for the depthwise convolution.
        depth_padding = (kernel_size - 1) // 2

        # Stage 1: pointwise expansion feeding the GLU.
        self.time_conv = nn.Conv1d(in_channels, gated_channels, kernel_size=1, bias=bias)
        self.norm1 = nn.BatchNorm1d(gated_channels)

        # Stage 2: one filter per channel along the time axis.
        self.depth_conv = nn.Conv1d(
            hid_channels,
            hid_channels,
            kernel_size=kernel_size,
            padding=depth_padding,
            groups=hid_channels,
            bias=bias,
        )
        self.norm2 = nn.BatchNorm1d(hid_channels)

        # Stage 3: pointwise projection to the output width.
        self.time_conv2 = nn.Conv1d(hid_channels, out_channels, kernel_size=1, bias=bias)
        self.norm3 = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        """Run the three stages on ``x`` and return ``x`` plus the result.

        Args:
            x: input tensor with channels on dim 1 (the GLU splits dim 1).

        Returns:
            Tensor holding ``x + block(x)``.
        """
        residual = x

        gated = nn.functional.glu(self.norm1(self.time_conv(x)), dim=1)

        depth = self.norm2(self.depth_conv(gated))
        activated = depth * torch.sigmoid(depth)

        projected = self.norm3(self.time_conv2(activated))
        return residual + projected
class TimeDepthSeparableConvBlock(nn.Module):
    """Stack of ``TimeDepthSeparableConv`` layers applied to a masked input.

    The first layer maps ``in_channels`` to the hidden width, the last layer
    maps to ``out_channels``, and every layer in between stays at
    ``hid_channels``. With a single layer, that layer maps ``in_channels``
    directly to ``out_channels``.

    Args:
        in_channels: number of channels fed to the first layer.
        hid_channels: hidden width used inside and between layers.
        out_channels: number of channels produced by the last layer.
        num_layers: number of stacked layers; must be at least 1.
        kernel_size: depthwise kernel width; must be odd.
        bias: whether the convolutions of every layer use a bias term.

    Raises:
        AssertionError: if ``kernel_size`` is even or ``num_layers < 1``.
    """

    def __init__(self, in_channels, hid_channels, out_channels, num_layers, kernel_size, bias=True):
        super().__init__()
        assert (kernel_size - 1) % 2 == 0, "kernel_size must be odd"
        # The construction below handles a single layer, so only reject
        # empty stacks (the previous `> 1` guard made that case unreachable).
        assert num_layers >= 1, "num_layers must be at least 1"

        self.layers = nn.ModuleList()
        last_idx = num_layers - 1
        for idx in range(num_layers):
            # Only the first layer sees `in_channels`; only the last emits `out_channels`.
            layer_in = in_channels if idx == 0 else hid_channels
            layer_out = out_channels if idx == last_idx else hid_channels
            self.layers.append(TimeDepthSeparableConv(layer_in, hid_channels, layer_out, kernel_size, bias))

    def forward(self, x, mask):
        """Apply every layer in order, multiplying its input by ``mask`` first.

        Args:
            x: input tensor passed to the first layer.
            mask: tensor multiplied element-wise with the input of each layer;
                presumably a padding mask broadcastable to ``x`` - confirm
                against callers.

        Returns:
            Output of the last layer. Note that no mask is applied after the
            final layer.
        """
        for layer in self.layers:
            x = layer(x * mask)
        return x