Spaces:
Build error
Build error
File size: 5,610 Bytes
9206300 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import torch
from torch import nn
from text_to_speech.modules.commons.layers import LayerNorm
import torch.nn.functional as F
class DurationPredictor(torch.nn.Module):
def __init__(self, idim, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0):
super(DurationPredictor, self).__init__()
self.offset = offset
self.conv = torch.nn.ModuleList()
self.kernel_size = kernel_size
for idx in range(n_layers):
in_chans = idim if idx == 0 else n_chans
self.conv += [torch.nn.Sequential(
torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2),
torch.nn.ReLU(),
LayerNorm(n_chans, dim=1),
torch.nn.Dropout(dropout_rate)
)]
self.linear = nn.Sequential(torch.nn.Linear(n_chans, 1), nn.Softplus())
def forward(self, x, x_padding=None):
x = x.transpose(1, -1) # (B, idim, Tmax)
for f in self.conv:
x = f(x) # (B, C, Tmax)
if x_padding is not None:
x = x * (1 - x_padding.float())[:, None, :]
x = self.linear(x.transpose(1, -1)) # [B, T, C]
x = x * (1 - x_padding.float())[:, :, None] # (B, T, C)
x = x[..., 0] # (B, Tmax)
return x
class SyntaDurationPredictor(torch.nn.Module):
def __init__(self, idim, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0):
super(SyntaDurationPredictor, self).__init__()
from text_to_speech.modules.tts.syntaspeech.syntactic_graph_encoder import GraphAuxEnc
self.graph_encoder = GraphAuxEnc(in_dim=idim, hid_dim=idim, out_dim=idim)
self.offset = offset
self.conv = torch.nn.ModuleList()
self.kernel_size = kernel_size
for idx in range(n_layers):
in_chans = idim if idx == 0 else n_chans
self.conv += [torch.nn.Sequential(
torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2),
torch.nn.ReLU(),
LayerNorm(n_chans, dim=1),
torch.nn.Dropout(dropout_rate)
)]
self.linear = nn.Sequential(torch.nn.Linear(n_chans, 1), nn.Softplus())
def forward(self, x, x_padding=None, ph2word=None, graph_lst=None, etypes_lst=None):
x = x.transpose(1, -1) # (B, idim, Tmax)
assert ph2word is not None and graph_lst is not None and etypes_lst is not None
x_graph = self.graph_encoder(graph_lst, x, ph2word, etypes_lst)
x = x + x_graph * 1.
for f in self.conv:
x = f(x) # (B, C, Tmax)
if x_padding is not None:
x = x * (1 - x_padding.float())[:, None, :]
x = self.linear(x.transpose(1, -1)) # [B, T, C]
x = x * (1 - x_padding.float())[:, :, None] # (B, T, C)
x = x[..., 0] # (B, Tmax)
return x
class LengthRegulator(torch.nn.Module):
def __init__(self, pad_value=0.0):
super(LengthRegulator, self).__init__()
self.pad_value = pad_value
def forward(self, dur, dur_padding=None, alpha=1.0):
"""
Example (no batch dim version):
1. dur = [2,2,3]
2. token_idx = [[1],[2],[3]], dur_cumsum = [2,4,7], dur_cumsum_prev = [0,2,4]
3. token_mask = [[1,1,0,0,0,0,0],
[0,0,1,1,0,0,0],
[0,0,0,0,1,1,1]]
4. token_idx * token_mask = [[1,1,0,0,0,0,0],
[0,0,2,2,0,0,0],
[0,0,0,0,3,3,3]]
5. (token_idx * token_mask).sum(0) = [1,1,2,2,3,3,3]
:param dur: Batch of durations of each frame (B, T_txt)
:param dur_padding: Batch of padding of each frame (B, T_txt)
:param alpha: duration rescale coefficient
:return:
mel2ph (B, T_speech)
assert alpha > 0
"""
dur = torch.round(dur.float() * alpha).long()
if dur_padding is not None:
dur = dur * (1 - dur_padding.long())
token_idx = torch.arange(1, dur.shape[1] + 1)[None, :, None].to(dur.device)
dur_cumsum = torch.cumsum(dur, 1)
dur_cumsum_prev = F.pad(dur_cumsum, [1, -1], mode='constant', value=0)
pos_idx = torch.arange(dur.sum(-1).max())[None, None].to(dur.device)
token_mask = (pos_idx >= dur_cumsum_prev[:, :, None]) & (pos_idx < dur_cumsum[:, :, None])
mel2token = (token_idx * token_mask.long()).sum(1)
return mel2token
class PitchPredictor(torch.nn.Module):
def __init__(self, idim, n_layers=5, n_chans=384, odim=2, kernel_size=5, dropout_rate=0.1):
super(PitchPredictor, self).__init__()
self.conv = torch.nn.ModuleList()
self.kernel_size = kernel_size
for idx in range(n_layers):
in_chans = idim if idx == 0 else n_chans
self.conv += [torch.nn.Sequential(
torch.nn.Conv1d(in_chans, n_chans, kernel_size, padding=kernel_size // 2),
torch.nn.ReLU(),
LayerNorm(n_chans, dim=1),
torch.nn.Dropout(dropout_rate)
)]
self.linear = torch.nn.Linear(n_chans, odim)
def forward(self, x):
"""
:param x: [B, T, H]
:return: [B, T, H]
"""
x = x.transpose(1, -1) # (B, idim, Tmax)
for f in self.conv:
x = f(x) # (B, C, Tmax)
x = self.linear(x.transpose(1, -1)) # (B, Tmax, H)
return x
class EnergyPredictor(PitchPredictor):
pass
|