"""Encoder modules: an LSTM speaker encoder, a WN-based encoder and an
attention-based text encoder."""

import torch
from torch import nn

from so_vits_svc_fork.modules import attentions as attentions
from so_vits_svc_fork.modules import commons as commons
from so_vits_svc_fork.modules import modules as modules


class SpeakerEncoder(torch.nn.Module):
    """LSTM that maps a mel-spectrogram sequence to a unit-length embedding.

    The final hidden state of the last LSTM layer is projected by a linear
    layer, passed through a ReLU and L2-normalized.

    Args:
        mel_n_channels: Feature size of each input frame.
        model_num_layers: Number of stacked LSTM layers.
        model_hidden_size: Hidden size of the LSTM.
        model_embedding_size: Size of the returned embedding.
    """

    def __init__(
        self,
        mel_n_channels=80,
        model_num_layers=3,
        model_hidden_size=256,
        model_embedding_size=256,
    ):
        super().__init__()
        self.lstm = nn.LSTM(
            mel_n_channels, model_hidden_size, model_num_layers, batch_first=True
        )
        self.linear = nn.Linear(model_hidden_size, model_embedding_size)
        self.relu = nn.ReLU()

    def forward(self, mels):
        """Embed a batch of mel sequences.

        Args:
            mels: Tensor laid out as (batch, frames, mel_n_channels); the
                LSTM is built with ``batch_first=True``.

        Returns:
            Tensor of shape (batch, model_embedding_size) whose rows are
            L2-normalized.
        """
        self.lstm.flatten_parameters()
        _, (hidden, _) = self.lstm(mels)
        # hidden[-1] is the final hidden state of the last LSTM layer.
        embeds_raw = self.relu(self.linear(hidden[-1]))
        # The ReLU can zero out a whole row; a bare division by the norm
        # would then produce NaN. normalize() clamps the norm by eps, so an
        # all-zero row stays zero and every other row is unchanged.
        return torch.nn.functional.normalize(embeds_raw, p=2.0, dim=1)

    def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
        """Return index tensors for fixed-size windows over an utterance.

        Windows start at ``0, partial_hop, 2 * partial_hop, ...`` for every
        start strictly below ``total_frames - partial_frames``; a window
        starting exactly at that bound is therefore not included.

        Args:
            total_frames: Number of frames in the utterance.
            partial_frames: Number of frames per window.
            partial_hop: Step between consecutive window starts.

        Returns:
            List of 1-D index tensors, each of length ``partial_frames``.
        """
        return [
            torch.arange(start, start + partial_frames)
            for start in range(0, total_frames - partial_frames, partial_hop)
        ]

    def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
        """Embed one utterance, averaging over windows when it is long.

        If the utterance has more than ``partial_frames`` frames it is cut
        into windows (see ``compute_partial_slices``), plus one extra window
        holding the last ``partial_frames`` frames; each window is embedded
        and the embeddings are averaged. Otherwise the utterance is embedded
        directly. The forward pass always runs under ``torch.no_grad()``.

        Args:
            mel: Mel tensor with frames on dim 1; presumably shaped
                (1, frames, mel_n_channels) -- TODO confirm with callers.
            partial_frames: Window length in frames.
            partial_hop: Step between window starts in frames.

        Returns:
            Embedding tensor; (1, model_embedding_size) in the windowed case.
        """
        mel_len = mel.size(1)
        last_mel = mel[:, -partial_frames:]

        if mel_len <= partial_frames:
            # Short utterance: a single forward pass over what is available.
            with torch.no_grad():
                return self(last_mel)

        mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
        mels = [mel[:, s] for s in mel_slices]
        # The regular windows can stop short of the end; always cover the tail.
        mels.append(last_mel)
        # Stack the windows on a new leading dim, then drop the size-1 dim 1.
        mels = torch.stack(mels, dim=0).squeeze(1)
        with torch.no_grad():
            partial_embeds = self(mels)
        # NOTE: the averaged embedding is not re-normalized.
        return torch.mean(partial_embeds, dim=0).unsqueeze(0)


class Encoder(nn.Module):
    """Encoder built from a 1x1 conv, a ``modules.WN`` stack and a 1x1
    projection that yields a mean and a log-scale, from which a sample is
    drawn.

    Args:
        in_channels: Channels of the input.
        out_channels: Channels of each of ``m``, ``logs`` and ``z``.
        hidden_channels: Channels used inside the WN stack.
        kernel_size: Passed to ``modules.WN``.
        dilation_rate: Passed to ``modules.WN``.
        n_layers: Passed to ``modules.WN``.
        gin_channels: Passed to ``modules.WN`` as ``gin_channels``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Encode ``x`` and draw a sample from the predicted distribution.

        Args:
            x: Input of shape (batch, in_channels, time).
            x_lengths: Per-item lengths handed to ``commons.sequence_mask``.
            g: Optional conditioning forwarded to the WN stack.

        Returns:
            Tuple ``(z, m, logs, x_mask)`` where ``z = m + noise * exp(logs)``
            with ``noise`` drawn by ``torch.randn_like`` and masked by
            ``x_mask``.
        """
        # Mask with a singleton channel dim, cast so it can multiply x.
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.pre(x) * x_mask
        x = self.enc(x, x_mask, g=g)
        stats = self.proj(x) * x_mask
        # The projection holds the mean and log-scale stacked on the channel dim.
        m, logs = torch.split(stats, self.out_channels, dim=1)
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask


class TextEncoder(nn.Module):
    """Attention-based encoder that adds an f0 embedding to its input.

    Args:
        out_channels: Channels of each of ``m``, ``logs`` and ``z``.
        hidden_channels: Channels of the input and of the attention encoder.
        kernel_size: Passed to ``attentions.Encoder``.
        n_layers: Passed to ``attentions.Encoder``.
        gin_channels: Stored on the instance; not otherwise used by this class.
        filter_channels: Passed to ``attentions.Encoder``.
        n_heads: Passed to ``attentions.Encoder``.
        p_dropout: Passed to ``attentions.Encoder``.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        kernel_size,
        n_layers,
        gin_channels=0,
        filter_channels=None,
        n_heads=None,
        p_dropout=None,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
        # f0 is looked up as an index into a 256-entry table.
        self.f0_emb = nn.Embedding(256, hidden_channels)

        self.enc_ = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )

    def forward(self, x, x_mask, f0=None, noice_scale=1):
        """Encode ``x`` conditioned on ``f0`` and draw a scaled-noise sample.

        Args:
            x: Input of shape (batch, hidden_channels, time).
            x_mask: Mask multiplied into the activations.
            f0: Integer index tensor for the f0 embedding table (256 entries).
                Although it defaults to ``None``, the lookup is unconditional,
                so a tensor must always be supplied.
            noice_scale: Multiplier applied to the sampled noise (parameter
                name kept as-is for caller compatibility).

        Returns:
            Tuple ``(z, m, logs, x_mask)`` where
            ``z = m + noise * exp(logs) * noice_scale``, masked by ``x_mask``.
        """
        # The embedding puts features last; move them to the channel dim of x.
        x = x + self.f0_emb(f0).transpose(1, 2)
        x = self.enc_(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)

        z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask

        return z, m, logs, x_mask