Spaces:
Running
on
T4
Running
on
T4
File size: 5,369 Bytes
9e275b8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
"""
MIT Licensed Code
Copyright (c) 2022 Aaron (Yinghao) Li
https://github.com/yl4579/StyleTTS/blob/main/models.py
"""
import math
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn.utils import spectral_norm
class StyleEncoder(nn.Module):
    """Convolutional encoder that maps a 2-D input to a style vector.

    The network is a spectrally-normalised 3x3 stem convolution, four
    ``ResBlk`` stages that each halve the spatial size, a 5x5 convolution
    without padding, global average pooling, and a final linear projection.

    Args:
        dim_in: Channel width produced by the stem convolution.
        style_dim: Size of the returned style vector.
        max_conv_dim: Upper bound on the channel width; the width doubles at
            every stage until it reaches this cap.
    """

    def __init__(self, dim_in=128, style_dim=64, max_conv_dim=384):
        super().__init__()
        num_stages = 4

        # Stem: lift the single input channel to ``dim_in`` channels.
        layers = [spectral_norm(nn.Conv2d(1, dim_in, 3, 1, 1))]

        width = dim_in
        for _ in range(num_stages):
            next_width = min(width * 2, max_conv_dim)
            layers.append(ResBlk(width, next_width, downsample='half'))
            width = next_width

        # Head: the unpadded 5x5 convolution needs a feature map of at
        # least 5x5 after the four halvings.
        layers.extend([
            nn.LeakyReLU(0.2),
            spectral_norm(nn.Conv2d(width, width, 5, 1, 0)),
            nn.AdaptiveAvgPool2d(1),
            nn.LeakyReLU(0.2),
        ])

        self.shared = nn.Sequential(*layers)
        self.unshared = nn.Linear(width, style_dim)

    def forward(self, speech):
        """Encode ``speech`` into a ``(batch, style_dim)`` tensor.

        Args:
            speech: 3-D tensor; a channel axis is inserted at dim 1 before
                the convolutions. Presumably (batch, n_mels, frames) --
                TODO confirm against the caller.
        """
        feats = self.shared(speech.unsqueeze(1))
        # Global pooling leaves a 1x1 map, so flattening yields (batch, C).
        flat = feats.view(feats.size(0), -1)
        return self.unshared(flat)
class ResBlk(nn.Module):
    """Pre-activation residual block with optional down-sampling.

    The residual branch is ``[norm] -> actv -> conv1 -> learned downsample
    -> [norm] -> actv -> conv2``. The shortcut branch is an optional 1x1
    convolution (only when the channel count changes) followed by average
    pooling. The two branches are summed and divided by ``sqrt(2)``.

    Args:
        dim_in: Number of input channels.
        dim_out: Number of output channels.
        actv: Activation module applied before each 3x3 convolution.
        normalize: If true, apply affine ``InstanceNorm2d`` before each
            activation.
        downsample: One of ``'none'``, ``'timepreserve'`` or ``'half'``;
            forwarded to ``DownSample`` and ``LearnedDownSample``.
    """

    def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
                 normalize=False, downsample='none'):
        super().__init__()
        self.actv = actv
        self.normalize = normalize
        # Pooling for the shortcut, strided depthwise conv for the residual.
        self.downsample = DownSample(downsample)
        self.downsample_res = LearnedDownSample(downsample, dim_in)
        # A projection is only needed when the channel count changes.
        self.learned_sc = dim_in != dim_out
        self._build_weights(dim_in, dim_out)

    def _build_weights(self, dim_in, dim_out):
        """Create the convolutions and, when enabled, the norm layers."""
        self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1))
        self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1))
        if self.normalize:
            # Both norms act on ``dim_in`` channels: they run before conv2.
            self.norm1 = nn.InstanceNorm2d(dim_in, affine=True)
            self.norm2 = nn.InstanceNorm2d(dim_in, affine=True)
        if self.learned_sc:
            self.conv1x1 = spectral_norm(
                nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False)
            )

    def _shortcut(self, x):
        """Return the skip-connection branch for ``x``."""
        skip = self.conv1x1(x) if self.learned_sc else x
        # ``DownSample`` defines no ``__bool__``/``__len__``, so this guard
        # passes whenever the attribute holds a module.
        return self.downsample(skip) if self.downsample else skip

    def _residual(self, x):
        """Return the residual branch for ``x``."""
        h = x
        if self.normalize:
            h = self.norm1(h)
        h = self.conv1(self.actv(h))
        h = self.downsample_res(h)
        if self.normalize:
            h = self.norm2(h)
        return self.conv2(self.actv(h))

    def forward(self, x):
        """Sum both branches and rescale by ``1 / sqrt(2)``."""
        skip = self._shortcut(x)
        branch = self._residual(x)
        return (skip + branch) / math.sqrt(2)  # unit variance
class LearnedDownSample(nn.Module):
    """Down-sample with a learned, spectrally-normalised depthwise conv.

    Args:
        layer_type: ``'none'`` (identity), ``'timepreserve'`` (stride 2 on
            the second-to-last axis only) or ``'half'`` (stride 2 on both
            spatial axes).
        dim_in: Number of channels; the convolution is depthwise
            (``groups=dim_in``), so the channel count is unchanged.

    Raises:
        RuntimeError: If ``layer_type`` is not one of the supported values.
    """

    def __init__(self, layer_type, dim_in):
        super().__init__()
        self.layer_type = layer_type

        if self.layer_type == 'none':
            self.conv = nn.Identity()
        elif self.layer_type == 'timepreserve':
            # Kernel 3 / stride 2 / padding 1 on the strided axis maps a
            # length n to ceil(n / 2).
            self.conv = spectral_norm(nn.Conv2d(
                dim_in, dim_in,
                kernel_size=(3, 1), stride=(2, 1),
                groups=dim_in, padding=(1, 0),
            ))
        elif self.layer_type == 'half':
            self.conv = spectral_norm(nn.Conv2d(
                dim_in, dim_in,
                kernel_size=(3, 3), stride=(2, 2),
                groups=dim_in, padding=1,
            ))
        else:
            # The message previously misspelled "downsample" as "donwsample".
            raise RuntimeError(
                'Got unexpected downsampletype %s, expected is '
                '[none, timepreserve, half]' % self.layer_type
            )

    def forward(self, x):
        """Apply the configured down-sampling layer to ``x``."""
        return self.conv(x)
class LearnedUpSample(nn.Module):
    """Up-sample with a learned depthwise transposed convolution.

    Args:
        layer_type: ``'none'`` (identity), ``'timepreserve'`` (doubles the
            second-to-last axis only) or ``'half'`` (doubles both spatial
            axes).
        dim_in: Number of channels; the transposed convolution is depthwise
            (``groups=dim_in``), so the channel count is unchanged.

    Raises:
        RuntimeError: If ``layer_type`` is not one of the supported values.
    """

    def __init__(self, layer_type, dim_in):
        super().__init__()
        self.layer_type = layer_type

        if layer_type == 'none':
            self.conv = nn.Identity()
            return

        # Kernel 3 / stride 2 / padding 1 / output_padding 1 maps a length n
        # to exactly 2 * n along each strided axis.
        if layer_type == 'timepreserve':
            geometry = dict(kernel_size=(3, 1), stride=(2, 1),
                            output_padding=(1, 0), padding=(1, 0))
        elif layer_type == 'half':
            geometry = dict(kernel_size=(3, 3), stride=(2, 2),
                            output_padding=1, padding=1)
        else:
            raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)

        self.conv = nn.ConvTranspose2d(dim_in, dim_in, groups=dim_in, **geometry)

    def forward(self, x):
        """Apply the configured up-sampling layer to ``x``."""
        return self.conv(x)
class DownSample(nn.Module):
    """Parameter-free down-sampling by average pooling.

    Args:
        layer_type: ``'none'`` (identity), ``'timepreserve'`` (pool by 2 on
            the second-to-last axis only) or ``'half'`` (pool by 2 on both
            spatial axes). The value is validated in ``forward``, not here.
    """

    def __init__(self, layer_type):
        super().__init__()
        self.layer_type = layer_type

    def forward(self, x):
        """Return ``x`` pooled according to ``layer_type``.

        Raises:
            RuntimeError: If ``layer_type`` is not one of the supported
                values.
        """
        if self.layer_type == 'none':
            return x
        if self.layer_type == 'timepreserve':
            return F.avg_pool2d(x, (2, 1))
        if self.layer_type == 'half':
            if x.shape[-1] % 2 != 0:
                # Repeat the last column so an odd width rounds up instead
                # of losing its final column. Only the last axis is padded;
                # an odd second-to-last axis is still floored by the pool.
                x = torch.cat([x, x[..., -1:]], dim=-1)
            return F.avg_pool2d(x, 2)
        # The message previously misspelled "downsample" as "donwsample".
        raise RuntimeError(
            'Got unexpected downsampletype %s, expected is '
            '[none, timepreserve, half]' % self.layer_type
        )
class UpSample(nn.Module):
    """Parameter-free up-sampling by nearest-neighbour interpolation.

    Args:
        layer_type: ``'none'`` (identity), ``'timepreserve'`` (doubles the
            second-to-last axis only) or ``'half'`` (doubles both spatial
            axes). The value is validated in ``forward``, not here.
    """

    def __init__(self, layer_type):
        super().__init__()
        self.layer_type = layer_type

    def forward(self, x):
        """Return ``x`` enlarged according to ``layer_type``.

        Raises:
            RuntimeError: If ``layer_type`` is not one of the supported
                values.
        """
        mode = self.layer_type
        if mode == 'none':
            return x

        # Pick the per-axis scale, then share one interpolate call.
        if mode == 'timepreserve':
            factor = (2, 1)
        elif mode == 'half':
            factor = 2
        else:
            raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)

        return F.interpolate(x, scale_factor=factor, mode='nearest')
|