Commit 1373f78 (unverified) by jonluca · Parent: f5915fd

add precomputed voices, reformat code, remove unused code
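The headline change is in app.py: each voice's computed style embedding is now cached on disk, so later launches load a precomputed .npy file instead of re-running the style encoder. A minimal sketch of that pattern, using the names that appear in the diff below (the wrapper function itself is hypothetical, not part of the commit):

    import os

    import numpy as np
    import torch
    from styletts2importable import compute_style  # repo module used in the diff

    def load_or_compute_style(wav_path):
        # Precomputed styles live next to the reference wav, e.g. voices/f-us-1.wav.npy
        cache_path = wav_path + ".npy"
        if os.path.exists(cache_path):
            return torch.from_numpy(np.load(cache_path))
        style = compute_style(wav_path)             # expensive: runs the style encoder
        np.save(cache_path, style.cpu().numpy())    # cache for the next launch
        return style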
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
+ # Editor-based HTTP Client requests
+ /httpRequests/
+ # Datasource local storage ignored files
+ /dataSources/
+ /dataSources.local.xml
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,81 @@
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
5
+ <inspection_tool class="PyCompatibilityInspection" enabled="true" level="ERROR" enabled_by_default="true" editorAttributes="ERRORS_ATTRIBUTES">
6
+ <option name="ourVersions">
7
+ <value>
8
+ <list size="1">
9
+ <item index="0" class="java.lang.String" itemvalue="3.10" />
10
+ </list>
11
+ </value>
12
+ </option>
13
+ </inspection_tool>
14
+ <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
15
+ <option name="ignoredPackages">
16
+ <value>
17
+ <list size="58">
18
+ <item index="0" class="java.lang.String" itemvalue="pandas" />
19
+ <item index="1" class="java.lang.String" itemvalue="fastapi" />
20
+ <item index="2" class="java.lang.String" itemvalue="pydantic" />
21
+ <item index="3" class="java.lang.String" itemvalue="clickhouse-connect" />
22
+ <item index="4" class="java.lang.String" itemvalue="uvicorn" />
23
+ <item index="5" class="java.lang.String" itemvalue="requests" />
24
+ <item index="6" class="java.lang.String" itemvalue="posthog" />
25
+ <item index="7" class="java.lang.String" itemvalue="numba" />
26
+ <item index="8" class="java.lang.String" itemvalue="faiss-cpu" />
27
+ <item index="9" class="java.lang.String" itemvalue="llvmlite" />
28
+ <item index="10" class="java.lang.String" itemvalue="tokenizers" />
29
+ <item index="11" class="java.lang.String" itemvalue="scipy" />
30
+ <item index="12" class="java.lang.String" itemvalue="transformers" />
31
+ <item index="13" class="java.lang.String" itemvalue="tornado" />
32
+ <item index="14" class="java.lang.String" itemvalue="threadpoolctl" />
33
+ <item index="15" class="java.lang.String" itemvalue="unidecode" />
34
+ <item index="16" class="java.lang.String" itemvalue="py-cpuinfo" />
35
+ <item index="17" class="java.lang.String" itemvalue="nbconvert" />
36
+ <item index="18" class="java.lang.String" itemvalue="tqdm" />
37
+ <item index="19" class="java.lang.String" itemvalue="appdirs" />
38
+ <item index="20" class="java.lang.String" itemvalue="rotary_embedding_torch" />
39
+ <item index="21" class="java.lang.String" itemvalue="deepspeed" />
40
+ <item index="22" class="java.lang.String" itemvalue="progressbar" />
41
+ <item index="23" class="java.lang.String" itemvalue="inflect" />
42
+ <item index="24" class="java.lang.String" itemvalue="librosa" />
43
+ <item index="25" class="java.lang.String" itemvalue="ffmpeg" />
44
+ <item index="26" class="java.lang.String" itemvalue="hjson" />
45
+ <item index="27" class="java.lang.String" itemvalue="einops" />
46
+ <item index="28" class="java.lang.String" itemvalue="torchaudio" />
47
+ <item index="29" class="java.lang.String" itemvalue="pyinstaller" />
48
+ <item index="30" class="java.lang.String" itemvalue="pytorch-lightning" />
49
+ <item index="31" class="java.lang.String" itemvalue="bitarray" />
50
+ <item index="32" class="java.lang.String" itemvalue="pyright" />
51
+ <item index="33" class="java.lang.String" itemvalue="yt-dlp" />
52
+ <item index="34" class="java.lang.String" itemvalue="torch" />
53
+ <item index="35" class="java.lang.String" itemvalue="torchvision" />
54
+ <item index="36" class="java.lang.String" itemvalue="sacrebleu" />
55
+ <item index="37" class="java.lang.String" itemvalue="aioshutil" />
56
+ <item index="38" class="java.lang.String" itemvalue="absl-py" />
57
+ <item index="39" class="java.lang.String" itemvalue="gradio" />
58
+ <item index="40" class="java.lang.String" itemvalue="matplotlib-inline" />
59
+ <item index="41" class="java.lang.String" itemvalue="Werkzeug" />
60
+ <item index="42" class="java.lang.String" itemvalue="fairseq" />
61
+ <item index="43" class="java.lang.String" itemvalue="json5" />
62
+ <item index="44" class="java.lang.String" itemvalue="torchfcpe" />
63
+ <item index="45" class="java.lang.String" itemvalue="numpy" />
64
+ <item index="46" class="java.lang.String" itemvalue="pyasn1" />
65
+ <item index="47" class="java.lang.String" itemvalue="torchcrepe" />
66
+ <item index="48" class="java.lang.String" itemvalue="pyasn1-modules" />
67
+ <item index="49" class="java.lang.String" itemvalue="tensorboard" />
68
+ <item index="50" class="java.lang.String" itemvalue="av" />
69
+ <item index="51" class="java.lang.String" itemvalue="matplotlib" />
70
+ <item index="52" class="java.lang.String" itemvalue="tensorboardX" />
71
+ <item index="53" class="java.lang.String" itemvalue="uc-micro-py" />
72
+ <item index="54" class="java.lang.String" itemvalue="ffmpy" />
73
+ <item index="55" class="java.lang.String" itemvalue="pyworld" />
74
+ <item index="56" class="java.lang.String" itemvalue="Markdown" />
75
+ <item index="57" class="java.lang.String" itemvalue="praat-parselmouth" />
76
+ </list>
77
+ </value>
78
+ </option>
79
+ </inspection_tool>
80
+ </profile>
81
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+ <component name="InspectionProjectProfileManager">
+ <settings>
+ <option name="USE_PROJECT_PROFILE" value="false" />
+ <version value="1.0" />
+ </settings>
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,7 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+ <component name="Black">
+ <option name="sdkName" value="styletts2-personal" />
+ </component>
+ <component name="ProjectRootManager" version="2" project-jdk-name="styletts2-personal" project-jdk-type="Python SDK" />
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+ <component name="ProjectModuleManager">
+ <modules>
+ <module fileurl="file://$PROJECT_DIR$/.idea/styletts2-personal.iml" filepath="$PROJECT_DIR$/.idea/styletts2-personal.iml" />
+ </modules>
+ </component>
+ </project>
.idea/ruff.xml ADDED
@@ -0,0 +1,11 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+ <component name="RuffConfigService">
+ <option name="disableOnSaveOutsideOfProject" value="false" />
+ <option name="globalRuffExecutablePath" value="/opt/homebrew/bin/ruff" />
+ <option name="globalRuffLspExecutablePath" value="/opt/homebrew/bin/ruff" />
+ <option name="projectRuffLspExecutablePath" value="/opt/homebrew/Caskroom/miniconda/base/envs/styletts2-personal/bin/ruff" />
+ <option name="runRuffOnSave" value="true" />
+ <option name="useRuffFormat" value="true" />
+ </component>
+ </project>
.idea/styletts2-personal.iml ADDED
@@ -0,0 +1,12 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+ <component name="NewModuleRootManager">
+ <content url="file://$MODULE_DIR$" />
+ <orderEntry type="jdk" jdkName="styletts2-personal" jdkType="Python SDK" />
+ <orderEntry type="sourceFolder" forTests="false" />
+ </component>
+ <component name="PyDocumentationSettings">
+ <option name="format" value="PLAIN" />
+ <option name="myDocStringFormat" value="Plain" />
+ </component>
+ </module>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+ <component name="VcsDirectoryMappings">
+ <mapping directory="" vcs="Git" />
+ </component>
+ </project>
Modules/diffusion/diffusion.py CHANGED
@@ -1,11 +1,5 @@
- from math import pi
- from random import randint
- from typing import Any, Optional, Sequence, Tuple, Union

- import torch
- from einops import rearrange
  from torch import Tensor, nn
- from tqdm import tqdm

  from .utils import *
  from .sampler import *
Modules/diffusion/modules.py CHANGED
@@ -1,7 +1,7 @@
- from math import floor, log, pi
- from typing import Any, List, Optional, Sequence, Tuple, Union
+ from math import log, pi
+ from typing import Optional

- from .utils import *
+ import torch.nn.functional as F

  import torch
  import torch.nn as nn
@@ -10,6 +10,7 @@ from einops.layers.torch import Rearrange
  from einops_exts import rearrange_many
  from torch import Tensor, einsum

+ from Modules.diffusion.utils import default, exists, rand_bool

  """
  Utils
Modules/diffusion/sampler.py CHANGED
@@ -1,10 +1,10 @@
  from math import atan, cos, pi, sin, sqrt
  from typing import Any, Callable, List, Optional, Tuple, Type

- import torch
+
  import torch.nn as nn
  import torch.nn.functional as F
- from einops import rearrange, reduce
+ from einops import rearrange
  from torch import Tensor

  from .utils import *
@@ -437,7 +437,11 @@ class KarrasSampler(Sampler):
  # Denoise to sample
  for i in range(num_steps - 1):
  x = self.step(
- x, fn=fn, sigma=sigmas[i], sigma_next=sigmas[i + 1], gamma=gammas[i] # type: ignore # noqa
+ x,
+ fn=fn,
+ sigma=sigmas[i],
+ sigma_next=sigmas[i + 1],
+ gamma=gammas[i], # type: ignore # noqa
  )

  return x
Modules/diffusion/utils.py CHANGED
@@ -1,12 +1,9 @@
  from functools import reduce
  from inspect import isfunction
- from math import ceil, floor, log2, pi
+ from math import ceil, floor, log2
  from typing import Callable, Dict, List, Optional, Sequence, Tuple, TypeVar, Union

  import torch
- import torch.nn.functional as F
- from einops import rearrange
- from torch import Generator, Tensor
  from typing_extensions import TypeGuard

  T = TypeVar("T")
Modules/discriminators.py CHANGED
@@ -1,8 +1,9 @@
  import torch
  import torch.nn.functional as F
  import torch.nn as nn
- from torch.nn import Conv1d, AvgPool1d, Conv2d
- from torch.nn.utils import weight_norm, spectral_norm
+ from torch.nn import Conv1d, Conv2d
+ from torch.nn.utils import spectral_norm
+ from torch.nn.utils.parametrizations import weight_norm

  from .utils import get_padding

@@ -21,8 +22,8 @@ def stft(x, fft_size, hop_size, win_length, window):
  Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
  """
  x_stft = torch.stft(x, fft_size, hop_size, win_length, window, return_complex=True)
- real = x_stft[..., 0]
- imag = x_stft[..., 1]
+ x_stft[..., 0]
+ x_stft[..., 1]

  return torch.abs(x_stft).transpose(2, 1)

@@ -39,7 +40,7 @@ class SpecDiscriminator(nn.Module):
  use_spectral_norm=False,
  ):
  super(SpecDiscriminator, self).__init__()
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm
  self.fft_size = fft_size
  self.shift_size = shift_size
  self.win_length = win_length
@@ -123,7+124,7 @@ class DiscriminatorP(torch.nn.Module):
  def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
  super(DiscriminatorP, self).__init__()
  self.period = period
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm
  self.convs = nn.ModuleList(
  [
  norm_f(
@@ -225,7+226,7 @@ class WavLMDiscriminator(nn.Module):
  self, slm_hidden=768, slm_layers=13, initial_channel=64, use_spectral_norm=False
  ):
  super(WavLMDiscriminator, self).__init__()
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm
  self.pre = norm_f(
  Conv1d(slm_hidden * slm_layers, initial_channel, 1, 1, padding=0)
  )
Modules/hifigan.py CHANGED
@@ -1,12 +1,12 @@
1
  import torch
2
  import torch.nn.functional as F
3
  import torch.nn as nn
4
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
5
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 
6
  from .utils import init_weights, get_padding
7
 
8
  import math
9
- import random
10
  import numpy as np
11
 
12
  LRELU_SLOPE = 0.1
@@ -269,7 +269,7 @@ class SineGen(torch.nn.Module):
269
  output sine_tensor: tensor(batchsize=1, length, dim)
270
  output uv: tensor(batchsize=1, length, 1)
271
  """
272
- f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
273
  # fundamental component
274
  fn = torch.multiply(
275
  f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device)
@@ -515,6 +515,7 @@ class AdainResBlk1d(nn.Module):
515
  self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
516
  self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
517
  self.norm1 = AdaIN1d(style_dim, dim_in)
 
518
  self.norm2 = AdaIN1d(style_dim, dim_out)
519
  if self.learned_sc:
520
  self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
@@ -581,6 +582,8 @@ class Decoder(nn.Module):
581
  nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1)
582
  )
583
 
 
 
584
  self.N_conv = weight_norm(
585
  nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1)
586
  )
@@ -599,30 +602,6 @@ class Decoder(nn.Module):
599
  )
600
 
601
  def forward(self, asr, F0_curve, N, s):
602
- if self.training:
603
- downlist = [0, 3, 7]
604
- F0_down = downlist[random.randint(0, 2)]
605
- downlist = [0, 3, 7, 15]
606
- N_down = downlist[random.randint(0, 3)]
607
- if F0_down:
608
- F0_curve = (
609
- nn.functional.conv1d(
610
- F0_curve.unsqueeze(1),
611
- torch.ones(1, 1, F0_down).to("cuda"),
612
- padding=F0_down // 2,
613
- ).squeeze(1)
614
- / F0_down
615
- )
616
- if N_down:
617
- N = (
618
- nn.functional.conv1d(
619
- N.unsqueeze(1),
620
- torch.ones(1, 1, N_down).to("cuda"),
621
- padding=N_down // 2,
622
- ).squeeze(1)
623
- / N_down
624
- )
625
-
626
  F0 = self.F0_conv(F0_curve.unsqueeze(1))
627
  N = self.N_conv(N.unsqueeze(1))
628
 
 
1
  import torch
2
  import torch.nn.functional as F
3
  import torch.nn as nn
4
+ from torch.nn import Conv1d, ConvTranspose1d
5
+ from torch.nn.utils import remove_weight_norm
6
+ from torch.nn.utils.parametrizations import weight_norm
7
  from .utils import init_weights, get_padding
8
 
9
  import math
 
10
  import numpy as np
11
 
12
  LRELU_SLOPE = 0.1
 
269
  output sine_tensor: tensor(batchsize=1, length, dim)
270
  output uv: tensor(batchsize=1, length, 1)
271
  """
272
+ torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
273
  # fundamental component
274
  fn = torch.multiply(
275
  f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device)
 
515
  self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
516
  self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
517
  self.norm1 = AdaIN1d(style_dim, dim_in)
518
+ # self.norm1 = torch.compile(self.norm1)
519
  self.norm2 = AdaIN1d(style_dim, dim_out)
520
  if self.learned_sc:
521
  self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
 
582
  nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1)
583
  )
584
 
585
+ # self.F0_conv = torch.compile(self.F0_conv)
586
+
587
  self.N_conv = weight_norm(
588
  nn.Conv1d(1, 1, kernel_size=3, stride=2, groups=1, padding=1)
589
  )
 
602
  )
603
 
604
  def forward(self, asr, F0_curve, N, s):
605
  F0 = self.F0_conv(F0_curve.unsqueeze(1))
606
  N = self.N_conv(N.unsqueeze(1))
607
 
Modules/istftnet.py CHANGED
@@ -1,8 +1,9 @@
  import torch
  import torch.nn.functional as F
  import torch.nn as nn
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+ from torch.nn import Conv1d, ConvTranspose1d
+ from torch.nn.utils import remove_weight_norm
+ from torch.nn.utils.parametrizations import weight_norm
  from .utils import init_weights, get_padding

  import math
@@ -313,7 +314,7 @@ class SineGen(torch.nn.Module):
  output sine_tensor: tensor(batchsize=1, length, dim)
  output uv: tensor(batchsize=1, length, 1)
  """
- f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
+ torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
  # fundamental component
  fn = torch.multiply(
  f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device)
Modules/slmadv.py CHANGED
@@ -67,7 +67,7 @@ class SLMAdversarialLoss(torch.nn.Module):
  ).squeeze(1)

  s_dur = s_preds[:, 128:]
- s = s_preds[:, :128]
+ s_preds[:, :128]

  d, _ = self.model.predictor(
  d_en,
@@ -138,8 +138,6 @@ class SLMAdversarialLoss(torch.nn.Module):
  p_en = []
  sp = []

- F0_fakes = []
- N_fakes = []

  wav = []

Utils/ASR/layers.py CHANGED
@@ -1,10 +1,6 @@
- import math
  import torch
  from torch import nn
- from typing import Optional, Any
- from torch import Tensor
  import torch.nn.functional as F
- import torchaudio
  import torchaudio.functional as audio_F

  import random
Utils/ASR/models.py CHANGED
@@ -1,7 +1,6 @@
  import math
  import torch
  from torch import nn
- from torch.nn import TransformerEncoder
  import torch.nn.functional as F
  from .layers import MFCC, Attention, LinearNorm, ConvNorm, ConvBlock

Utils/JDC/model.py CHANGED
@@ -84,7 +84,7 @@ class JDCNet(nn.Module):
  self.apply(self.init_weights)

  def get_feature_GAN(self, x):
- seq_len = x.shape[-2]
+ x.shape[-2]
  x = x.float().transpose(-1, -2)

  convblock_out = self.conv_block(x)
@@ -98,7 +98,7 @@ class JDCNet(nn.Module):
  return poolblock_out.transpose(-1, -2)

  def get_feature(self, x):
- seq_len = x.shape[-2]
+ x.shape[-2]
  x = x.float().transpose(-1, -2)

  convblock_out = self.conv_block(x)
Utils/PLBERT/util.py CHANGED
@@ -20,7 +20,6 @@ def load_plbert(log_dir):
  albert_base_configuration = AlbertConfig(**plbert_config["model_params"])
  bert = CustomAlbert(albert_base_configuration)

- files = os.listdir(log_dir)
  ckpts = []
  for f in os.listdir(log_dir):
  if f.startswith("step_"):
_run.py CHANGED
@@ -23,11 +23,8 @@ np.random.seed(0)
  import time
  import random
  import yaml
- from munch import Munch
  import numpy as np
  import torch
- from torch import nn
- import torch.nn.functional as F
  import torchaudio
  import librosa
  from nltk.tokenize import word_tokenize
@@ -305,7 +302,7 @@ def STinference(text, ref_s, ref_text, alpha = 0.3, beta = 0.7, diffusion_steps=

  ref_input_lengths = torch.LongTensor([ref_tokens.shape[-1]]).to(device)
  ref_text_mask = length_to_mask(ref_input_lengths).to(device)
- ref_bert_dur = model.bert(ref_tokens, attention_mask=(~ref_text_mask).int())
+ model.bert(ref_tokens, attention_mask=(~ref_text_mask).int())
  s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),
  embedding=bert_dur,
  embedding_scale=embedding_scale,
app.py CHANGED
@@ -1,48 +1,48 @@
1
- INTROTXT = """# StyleTTS 2
2
-
3
- [Paper](https://arxiv.org/abs/2306.07691) - [Samples](https://styletts2.github.io/) - [Code](https://github.com/yl4579/StyleTTS2) - [Discord](https://discord.gg/ha8sxdG2K4)
4
-
5
- A free demo of StyleTTS 2. **I am not affiliated with the StyleTTS 2 Authors.**
6
-
7
- **Before using this demo, you agree to inform the listeners that the speech samples are synthesized by the pre-trained models, unless you have the permission to use the voice you synthesize. That is, you agree to only use voices whose speakers grant the permission to have their voice cloned, either directly or by license before making synthesized voices public, or you have to publicly announce that these voices are synthesized if you do not have the permission to use these voices.**
8
-
9
- Is there a long queue on this space? Duplicate it and add a more powerful GPU to skip the wait! **Note: Thank you to Hugging Face for their generous GPU grant program!**
10
 
11
- **NOTE: StyleTTS 2 does better on longer texts.** For example, making it say "hi" will produce a lower-quality result than making it say a longer phrase.
12
- """
13
  import gradio as gr
14
- import styletts2importable
15
- import ljspeechimportable
16
  import torch
17
- import os
18
  from txtsplit import txtsplit
19
  import numpy as np
20
- import pickle
 
 
21
  theme = gr.themes.Base(
22
- font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
 
 
 
 
 
23
  )
24
- voicelist = ['f-us-1', 'f-us-2', 'f-us-3', 'f-us-4', 'm-us-1', 'm-us-2', 'm-us-3', 'm-us-4']
 
 
 
 
 
 
 
 
 
25
  voices = {}
26
- import phonemizer
27
- global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
28
- # todo: cache computed style, load using pickle
29
- # if os.path.exists('voices.pkl'):
30
- # with open('voices.pkl', 'rb') as f:
31
- # voices = pickle.load(f)
32
  # else:
33
  for v in voicelist:
34
- voices[v] = styletts2importable.compute_style(f'voices/{v}.wav')
35
- # def synthesize(text, voice, multispeakersteps):
36
- # if text.strip() == "":
37
- # raise gr.Error("You must enter some text")
38
- # # if len(global_phonemizer.phonemize([text])) > 300:
39
- # if len(text) > 300:
40
- # raise gr.Error("Text must be under 300 characters")
41
- # v = voice.lower()
42
- # # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1))
43
- # return (24000, styletts2importable.inference(text, voices[v], alpha=0.3, beta=0.7, diffusion_steps=multispeakersteps, embedding_scale=1))
44
- if not torch.cuda.is_available(): INTROTXT += "\n\n### You are on a CPU-only system, inference will be much slower.\n\nYou can use the [online demo](https://huggingface.co/spaces/styletts2/styletts2) for fast inference."
45
- def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
46
  if text.strip() == "":
47
  raise gr.Error("You must enter some text")
48
  if len(text) > 50000:
@@ -53,123 +53,81 @@ def synthesize(text, voice, lngsteps, password, progress=gr.Progress()):
53
  texts = txtsplit(text)
54
  v = voice.lower()
55
  audios = []
56
- for t in progress.tqdm(texts):
57
- audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1))
58
- return (24000, np.concatenate(audios))
59
- # def longsynthesize(text, voice, lngsteps, password, progress=gr.Progress()):
60
- # if password == os.environ['ACCESS_CODE']:
61
- # if text.strip() == "":
62
- # raise gr.Error("You must enter some text")
63
- # if lngsteps > 25:
64
- # raise gr.Error("Max 25 steps")
65
- # if lngsteps < 5:
66
- # raise gr.Error("Min 5 steps")
67
- # texts = split_and_recombine_text(text)
68
- # v = voice.lower()
69
- # audios = []
70
- # for t in progress.tqdm(texts):
71
- # audios.append(styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1))
72
- # return (24000, np.concatenate(audios))
73
- # else:
74
- # raise gr.Error('Wrong access code')
75
- def clsynthesize(text, voice, vcsteps, progress=gr.Progress()):
76
- # if text.strip() == "":
77
- # raise gr.Error("You must enter some text")
78
- # # if global_phonemizer.phonemize([text]) > 300:
79
- # if len(text) > 400:
80
- # raise gr.Error("Text must be under 400 characters")
81
- # # return (24000, styletts2importable.inference(text, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=20, embedding_scale=1))
82
- # return (24000, styletts2importable.inference(text, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=vcsteps, embedding_scale=1))
83
- if text.strip() == "":
84
- raise gr.Error("You must enter some text")
85
- if len(text) > 50000:
86
- raise gr.Error("Text must be <50k characters")
87
- print("*** saying ***")
88
- print(text)
89
- print("*** end ***")
90
- texts = txtsplit(text)
91
- audios = []
92
- for t in progress.tqdm(texts):
93
- audios.append(styletts2importable.inference(t, styletts2importable.compute_style(voice), alpha=0.3, beta=0.7, diffusion_steps=vcsteps, embedding_scale=1))
94
- return (24000, np.concatenate(audios))
95
- def ljsynthesize(text, steps, progress=gr.Progress()):
96
- # if text.strip() == "":
97
- # raise gr.Error("You must enter some text")
98
- # # if global_phonemizer.phonemize([text]) > 300:
99
- # if len(text) > 400:
100
- # raise gr.Error("Text must be under 400 characters")
101
- noise = torch.randn(1,1,256).to('cuda' if torch.cuda.is_available() else 'cpu')
102
- # return (24000, ljspeechimportable.inference(text, noise, diffusion_steps=7, embedding_scale=1))
103
- if text.strip() == "":
104
- raise gr.Error("You must enter some text")
105
- if len(text) > 150000:
106
- raise gr.Error("Text must be <150k characters")
107
- print("*** saying ***")
108
- print(text)
109
- print("*** end ***")
110
- texts = txtsplit(text)
111
- audios = []
112
- for t in progress.tqdm(texts):
113
- audios.append(ljspeechimportable.inference(t, noise, diffusion_steps=steps, embedding_scale=1))
114
  return (24000, np.concatenate(audios))
115
 
116
 
117
  with gr.Blocks() as vctk:
118
  with gr.Row():
119
  with gr.Column(scale=1):
120
- inp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
121
- voice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-2', interactive=True)
122
- multispeakersteps = gr.Slider(minimum=3, maximum=15, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
123
  # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
124
  with gr.Column(scale=1):
125
  btn = gr.Button("Synthesize", variant="primary")
126
- audio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
127
- btn.click(synthesize, inputs=[inp, voice, multispeakersteps], outputs=[audio], concurrency_limit=4)
128
- with gr.Blocks() as clone:
129
- with gr.Row():
130
- with gr.Column(scale=1):
131
- clinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
132
- clvoice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=300, waveform_options={'waveform_progress_color': '#3C82F6'})
133
- vcsteps = gr.Slider(minimum=3, maximum=20, value=20, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
134
- with gr.Column(scale=1):
135
- clbtn = gr.Button("Synthesize", variant="primary")
136
- claudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
137
- clbtn.click(clsynthesize, inputs=[clinp, clvoice, vcsteps], outputs=[claudio], concurrency_limit=4)
138
- # with gr.Blocks() as longText:
139
- # with gr.Row():
140
- # with gr.Column(scale=1):
141
- # lnginp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
142
- # lngvoice = gr.Dropdown(voicelist, label="Voice", info="Select a default voice.", value='m-us-1', interactive=True)
143
- # lngsteps = gr.Slider(minimum=5, maximum=25, value=10, step=1, label="Diffusion Steps", info="Higher = better quality, but slower", interactive=True)
144
- # lngpwd = gr.Textbox(label="Access code", info="This feature is in beta. You need an access code to use it as it uses more resources and we would like to prevent abuse")
145
- # with gr.Column(scale=1):
146
- # lngbtn = gr.Button("Synthesize", variant="primary")
147
- # lngaudio = gr.Audio(interactive=False, label="Synthesized Audio")
148
- # lngbtn.click(longsynthesize, inputs=[lnginp, lngvoice, lngsteps, lngpwd], outputs=[lngaudio], concurrency_limit=4)
149
- with gr.Blocks() as lj:
150
- with gr.Row():
151
- with gr.Column(scale=1):
152
- ljinp = gr.Textbox(label="Text", info="What would you like StyleTTS 2 to read? It works better on full sentences.", interactive=True)
153
- ljsteps = gr.Slider(minimum=3, maximum=20, value=3, step=1, label="Diffusion Steps", info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster", interactive=True)
154
- with gr.Column(scale=1):
155
- ljbtn = gr.Button("Synthesize", variant="primary")
156
- ljaudio = gr.Audio(interactive=False, label="Synthesized Audio", waveform_options={'waveform_progress_color': '#3C82F6'})
157
- ljbtn.click(ljsynthesize, inputs=[ljinp, ljsteps], outputs=[ljaudio], concurrency_limit=4)
158
- with gr.Blocks(title="StyleTTS 2", css="footer{display:none !important}", theme=theme) as demo:
159
- gr.Markdown(INTROTXT)
160
- gr.DuplicateButton("Duplicate Space")
161
- # gr.TabbedInterface([vctk, clone, lj, longText], ['Multi-Voice', 'Voice Cloning', 'LJSpeech', 'Long Text [Beta]'])
162
- gr.TabbedInterface([vctk, clone, lj], ['Multi-Voice', 'Voice Cloning', 'LJSpeech', 'Long Text [Beta]'])
163
- gr.Markdown("""
164
- Demo by [mrfakename](https://twitter.com/realmrfakename). I am not affiliated with the StyleTTS 2 authors.
165
 
166
- Run this demo locally using Docker:
167
-
168
- ```bash
169
- docker run -it -p 7860:7860 --platform=linux/amd64 --gpus all registry.hf.space/styletts2-styletts2:latest python app.py
170
- ```
171
- """) # Please do not remove this line.
172
  if __name__ == "__main__":
173
  # demo.queue(api_open=False, max_size=15).launch(show_api=False)
174
- demo.queue(api_open=False, max_size=15).launch(show_api=False)
175
-
1
+ import os
 
 
 
 
 
 
 
 
2
 
 
 
3
  import gradio as gr
 
 
4
  import torch
5
+ from styletts2importable import compute_style, inference
6
  from txtsplit import txtsplit
7
  import numpy as np
8
+ import phonemizer
9
+
10
+
11
  theme = gr.themes.Base(
12
+ font=[
13
+ gr.themes.GoogleFont("Libre Franklin"),
14
+ gr.themes.GoogleFont("Public Sans"),
15
+ "system-ui",
16
+ "sans-serif",
17
+ ],
18
  )
19
+ voicelist = [
20
+ "f-us-1",
21
+ "f-us-2",
22
+ "f-us-3",
23
+ "f-us-4",
24
+ "m-us-1",
25
+ "m-us-2",
26
+ "m-us-3",
27
+ "m-us-4",
28
+ ]
29
  voices = {}
30
+
31
+ global_phonemizer = phonemizer.backend.EspeakBackend(
32
+ language="en-us", preserve_punctuation=True, with_stress=True
33
+ )
 
 
34
  # else:
35
  for v in voicelist:
36
+ cache_path = f"voices/{v}.wav.npy"
37
+ if os.path.exists(cache_path):
38
+ voices[v] = torch.from_numpy(np.load(cache_path))
39
+ else:
40
+ style = compute_style(f"voices/{v}.wav")
41
+ voices[v] = style
42
+ np.save(cache_path, style.cpu().numpy())
43
+
44
+
45
+ def synthesize(text, voice, lngsteps):
 
 
46
  if text.strip() == "":
47
  raise gr.Error("You must enter some text")
48
  if len(text) > 50000:
 
53
  texts = txtsplit(text)
54
  v = voice.lower()
55
  audios = []
56
+ for t in texts:
57
+ audios.append(
58
+ inference(
59
+ t,
60
+ voices[v],
61
+ alpha=0.3,
62
+ beta=0.7,
63
+ diffusion_steps=lngsteps,
64
+ embedding_scale=1,
65
+ )
66
+ )
 
67
  return (24000, np.concatenate(audios))
68
 
69
 
70
  with gr.Blocks() as vctk:
71
  with gr.Row():
72
  with gr.Column(scale=1):
73
+ inp = gr.Textbox(
74
+ label="Text",
75
+ info="What would you like StyleTTS 2 to read? It works better on full sentences.",
76
+ interactive=True,
77
+ )
78
+ voice = gr.Dropdown(
79
+ voicelist,
80
+ label="Voice",
81
+ info="Select a default voice.",
82
+ value="m-us-2",
83
+ interactive=True,
84
+ )
85
+ multispeakersteps = gr.Slider(
86
+ minimum=3,
87
+ maximum=15,
88
+ value=3,
89
+ step=1,
90
+ label="Diffusion Steps",
91
+ info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster",
92
+ interactive=True,
93
+ )
94
  # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
95
  with gr.Column(scale=1):
96
  btn = gr.Button("Synthesize", variant="primary")
97
+ audio = gr.Audio(
98
+ interactive=False,
99
+ label="Synthesized Audio",
100
+ waveform_options={"waveform_progress_color": "#3C82F6"},
101
+ )
102
+ btn.click(
103
+ synthesize,
104
+ inputs=[inp, voice, multispeakersteps],
105
+ outputs=[audio],
106
+ concurrency_limit=4,
107
+ )
108
 
109
+ with gr.Blocks(
110
+ title="StyleTTS 2", css="footer{display:none !important}", theme=theme
111
+ ) as demo:
112
+ gr.TabbedInterface(
113
+ [vctk], ["Multi-Voice", "Voice Cloning", "LJSpeech", "Long Text [Beta]"]
114
+ )
115
  if __name__ == "__main__":
116
  # demo.queue(api_open=False, max_size=15).launch(show_api=False)
117
+ print("Launching")
118
+ # start_time = time.time()
119
+ # synthesize(
120
+ # "defines how the endpoint appears in the API docs. Can be a string, None, or False. If set to a string, the endpoint will be exposed in the API docs with the given name. If None (default), the name of the function will be used as the API endpoint. If False, the endpoint will not be exposed in the API docs and downstream apps (including those that gr.load this app) will not be able to use this event.",
121
+ # "m-us-2",
122
+ # 3,
123
+ # )
124
+ # print(f"Launched in {time.time() - start_time} seconds")
125
+ # second_start_time = time.time()
126
+ # synthesize(
127
+ # "defines how the endpoint appears in the API docs. Can be a string, None, or False. If set to a string, the endpoint will be exposed in the API docs with the given name. If None (default), the name of the function will be used as the API endpoint. If False, the endpoint will not be exposed in the API docs and downstream apps (including those that gr.load this app) will not be able to use this event.",
128
+ # "m-us-2",
129
+ # 3,
130
+ # )
131
+ # print(f"Launched in {time.time() - second_start_time} seconds")
132
+ demo.queue(api_open=True, max_size=None).launch(show_api=False)
133
+ print("Launched")
compute.py CHANGED
@@ -5,7 +5,6 @@ print("NLTK")
5
  import nltk
6
  nltk.download('punkt')
7
  print("SCIPY")
8
- from scipy.io.wavfile import write
9
  print("TORCH STUFF")
10
  import torch
11
  print("START")
@@ -20,17 +19,12 @@ import numpy as np
20
  np.random.seed(0)
21
 
22
  # load packages
23
- import time
24
  import random
25
  import yaml
26
- from munch import Munch
27
  import numpy as np
28
  import torch
29
- from torch import nn
30
- import torch.nn.functional as F
31
  import torchaudio
32
  import librosa
33
- from nltk.tokenize import word_tokenize
34
 
35
  from models import *
36
  from utils import *
 
5
  import nltk
6
  nltk.download('punkt')
7
  print("SCIPY")
 
8
  print("TORCH STUFF")
9
  import torch
10
  print("START")
 
19
  np.random.seed(0)
20
 
21
  # load packages
 
22
  import random
23
  import yaml
 
24
  import numpy as np
25
  import torch
 
 
26
  import torchaudio
27
  import librosa
 
28
 
29
  from models import *
30
  from utils import *
ljspeechimportable.py DELETED
@@ -1,225 +0,0 @@
1
- from cached_path import cached_path
2
-
3
-
4
- import torch
5
- torch.manual_seed(0)
6
- torch.backends.cudnn.benchmark = False
7
- torch.backends.cudnn.deterministic = True
8
-
9
- import random
10
- random.seed(0)
11
-
12
- import numpy as np
13
- np.random.seed(0)
14
-
15
- import nltk
16
- nltk.download('punkt')
17
-
18
- # load packages
19
- import time
20
- import random
21
- import yaml
22
- from munch import Munch
23
- import numpy as np
24
- import torch
25
- from torch import nn
26
- import torch.nn.functional as F
27
- import torchaudio
28
- import librosa
29
- from nltk.tokenize import word_tokenize
30
-
31
- from models import *
32
- from utils import *
33
- from text_utils import TextCleaner
34
- textclenaer = TextCleaner()
35
-
36
-
37
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
38
-
39
- to_mel = torchaudio.transforms.MelSpectrogram(
40
- n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
41
- mean, std = -4, 4
42
-
43
- def length_to_mask(lengths):
44
- mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
45
- mask = torch.gt(mask+1, lengths.unsqueeze(1))
46
- return mask
47
-
48
- def preprocess(wave):
49
- wave_tensor = torch.from_numpy(wave).float()
50
- mel_tensor = to_mel(wave_tensor)
51
- mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
52
- return mel_tensor
53
-
54
- def compute_style(ref_dicts):
55
- reference_embeddings = {}
56
- for key, path in ref_dicts.items():
57
- wave, sr = librosa.load(path, sr=24000)
58
- audio, index = librosa.effects.trim(wave, top_db=30)
59
- if sr != 24000:
60
- audio = librosa.resample(audio, sr, 24000)
61
- mel_tensor = preprocess(audio).to(device)
62
-
63
- with torch.no_grad():
64
- ref = model.style_encoder(mel_tensor.unsqueeze(1))
65
- reference_embeddings[key] = (ref.squeeze(1), audio)
66
-
67
- return reference_embeddings
68
-
69
- # load phonemizer
70
- import phonemizer
71
- global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True, words_mismatch='ignore')
72
-
73
- # phonemizer = Phonemizer.from_checkpoint(str(cached_path('https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_ipa_forward.pt')))
74
-
75
-
76
- config = yaml.safe_load(open(str(cached_path('hf://yl4579/StyleTTS2-LJSpeech/Models/LJSpeech/config.yml'))))
77
-
78
- # load pretrained ASR model
79
- ASR_config = config.get('ASR_config', False)
80
- ASR_path = config.get('ASR_path', False)
81
- text_aligner = load_ASR_models(ASR_path, ASR_config)
82
-
83
- # load pretrained F0 model
84
- F0_path = config.get('F0_path', False)
85
- pitch_extractor = load_F0_models(F0_path)
86
-
87
- # load BERT model
88
- from Utils.PLBERT.util import load_plbert
89
- BERT_path = config.get('PLBERT_dir', False)
90
- plbert = load_plbert(BERT_path)
91
-
92
- model = build_model(recursive_munch(config['model_params']), text_aligner, pitch_extractor, plbert)
93
- _ = [model[key].eval() for key in model]
94
- _ = [model[key].to(device) for key in model]
95
-
96
- # params_whole = torch.load("Models/LJSpeech/epoch_2nd_00100.pth", map_location='cpu')
97
- params_whole = torch.load(str(cached_path('hf://yl4579/StyleTTS2-LJSpeech/Models/LJSpeech/epoch_2nd_00100.pth')), map_location='cpu')
98
- params = params_whole['net']
99
-
100
- for key in model:
101
- if key in params:
102
- print('%s loaded' % key)
103
- try:
104
- model[key].load_state_dict(params[key])
105
- except:
106
- from collections import OrderedDict
107
- state_dict = params[key]
108
- new_state_dict = OrderedDict()
109
- for k, v in state_dict.items():
110
- name = k[7:] # remove `module.`
111
- new_state_dict[name] = v
112
- # load params
113
- model[key].load_state_dict(new_state_dict, strict=False)
114
- # except:
115
- # _load(params[key], model[key])
116
- _ = [model[key].eval() for key in model]
117
-
118
- from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule
119
-
120
- sampler = DiffusionSampler(
121
- model.diffusion.diffusion,
122
- sampler=ADPM2Sampler(),
123
- sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters
124
- clamp=False
125
- )
126
-
127
- def inference(text, noise, diffusion_steps=5, embedding_scale=1):
128
- text = text.strip()
129
- text = text.replace('"', '')
130
- ps = global_phonemizer.phonemize([text])
131
- ps = word_tokenize(ps[0])
132
- ps = ' '.join(ps)
133
-
134
- tokens = textclenaer(ps)
135
- tokens.insert(0, 0)
136
- tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
137
-
138
- with torch.no_grad():
139
- input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)
140
- text_mask = length_to_mask(input_lengths).to(tokens.device)
141
-
142
- t_en = model.text_encoder(tokens, input_lengths, text_mask)
143
- bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
144
- d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
145
-
146
- s_pred = sampler(noise,
147
- embedding=bert_dur[0].unsqueeze(0), num_steps=diffusion_steps,
148
- embedding_scale=embedding_scale).squeeze(0)
149
-
150
- s = s_pred[:, 128:]
151
- ref = s_pred[:, :128]
152
-
153
- d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
154
-
155
- x, _ = model.predictor.lstm(d)
156
- duration = model.predictor.duration_proj(x)
157
- duration = torch.sigmoid(duration).sum(axis=-1)
158
- pred_dur = torch.round(duration.squeeze()).clamp(min=1)
159
-
160
- pred_dur[-1] += 5
161
-
162
- pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
163
- c_frame = 0
164
- for i in range(pred_aln_trg.size(0)):
165
- pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
166
- c_frame += int(pred_dur[i].data)
167
-
168
- # encode prosody
169
- en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
170
- F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
171
- out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)),
172
- F0_pred, N_pred, ref.squeeze().unsqueeze(0))
173
-
174
- return out.squeeze().cpu().numpy()
175
-
176
- def LFinference(text, s_prev, noise, alpha=0.7, diffusion_steps=5, embedding_scale=1):
177
- text = text.strip()
178
- text = text.replace('"', '')
179
- ps = global_phonemizer.phonemize([text])
180
- ps = word_tokenize(ps[0])
181
- ps = ' '.join(ps)
182
-
183
- tokens = textclenaer(ps)
184
- tokens.insert(0, 0)
185
- tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
186
-
187
- with torch.no_grad():
188
- input_lengths = torch.LongTensor([tokens.shape[-1]]).to(tokens.device)
189
- text_mask = length_to_mask(input_lengths).to(tokens.device)
190
-
191
- t_en = model.text_encoder(tokens, input_lengths, text_mask)
192
- bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
193
- d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
194
-
195
- s_pred = sampler(noise,
196
- embedding=bert_dur[0].unsqueeze(0), num_steps=diffusion_steps,
197
- embedding_scale=embedding_scale).squeeze(0)
198
-
199
- if s_prev is not None:
200
- # convex combination of previous and current style
201
- s_pred = alpha * s_prev + (1 - alpha) * s_pred
202
-
203
- s = s_pred[:, 128:]
204
- ref = s_pred[:, :128]
205
-
206
- d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
207
-
208
- x, _ = model.predictor.lstm(d)
209
- duration = model.predictor.duration_proj(x)
210
- duration = torch.sigmoid(duration).sum(axis=-1)
211
- pred_dur = torch.round(duration.squeeze()).clamp(min=1)
212
-
213
- pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
214
- c_frame = 0
215
- for i in range(pred_aln_trg.size(0)):
216
- pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
217
- c_frame += int(pred_dur[i].data)
218
-
219
- # encode prosody
220
- en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
221
- F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
222
- out = model.decoder((t_en @ pred_aln_trg.unsqueeze(0).to(device)),
223
- F0_pred, N_pred, ref.squeeze().unsqueeze(0))
224
-
225
- return out.squeeze().cpu().numpy(), s_pred
losses.py CHANGED
@@ -1,5 +1,4 @@
  import torch
- from torch import nn
  import torch.nn.functional as F
  import torchaudio
  from transformers import AutoModel
meldataset.py CHANGED
@@ -1,7 +1,5 @@
1
  # coding: utf-8
2
- import os
3
  import os.path as osp
4
- import time
5
  import random
6
  import numpy as np
7
  import random
@@ -9,8 +7,6 @@ import soundfile as sf
9
  import librosa
10
 
11
  import torch
12
- from torch import nn
13
- import torch.nn.functional as F
14
  import torchaudio
15
  from torch.utils.data import DataLoader
16
 
@@ -79,8 +75,6 @@ class FilePathDataset(torch.utils.data.Dataset):
79
  OOD_data="Data/OOD_texts.txt",
80
  min_length=50,
81
  ):
82
- spect_params = SPECT_PARAMS
83
- mel_params = MEL_PARAMS
84
 
85
  _data_list = [l[:-1].split("|") for l in data_list]
86
  self.data_list = [data if len(data) == 3 else (*data, 0) for data in _data_list]
 
1
  # coding: utf-8
 
2
  import os.path as osp
 
3
  import random
4
  import numpy as np
5
  import random
 
7
  import librosa
8
 
9
  import torch
 
 
10
  import torchaudio
11
  from torch.utils.data import DataLoader
12
 
 
75
  OOD_data="Data/OOD_texts.txt",
76
  min_length=50,
77
  ):
 
 
78
 
79
  _data_list = [l[:-1].split("|") for l in data_list]
80
  self.data_list = [data if len(data) == 3 else (*data, 0) for data in _data_list]
models.py CHANGED
@@ -1,22 +1,16 @@
1
  # coding:utf-8
2
-
3
- import os
4
- import os.path as osp
5
-
6
- import copy
7
  import math
8
 
9
- import numpy as np
10
  import torch
11
  import torch.nn as nn
12
  import torch.nn.functional as F
13
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
14
-
15
  from Utils.ASR.models import ASRCNN
16
  from Utils.JDC.model import JDCNet
17
 
18
  from Modules.diffusion.sampler import KDiffusion, LogNormalDistribution
19
- from Modules.diffusion.modules import Transformer1d, StyleTransformer1d
20
  from Modules.diffusion.diffusion import AudioDiffusionConditional
21
 
22
  from Modules.discriminators import (
@@ -27,6 +21,7 @@ from Modules.discriminators import (
27
 
28
  from munch import Munch
29
  import yaml
 
30
 
31
 
32
  class LearnedDownSample(nn.Module):
@@ -589,8 +584,8 @@ class ProsodyPredictor(nn.Module):
589
  def forward(self, texts, style, text_lengths, alignment, m):
590
  d = self.text_encoder(texts, style, text_lengths, m)
591
 
592
- batch_size = d.shape[0]
593
- text_size = d.shape[1]
594
 
595
  # predict duration
596
  input_lengths = text_lengths.cpu().numpy()
@@ -750,37 +745,19 @@ def load_ASR_models(ASR_MODEL_PATH, ASR_MODEL_CONFIG):
750
  return asr_model
751
 
752
 
753
- def build_model(args, text_aligner, pitch_extractor, bert):
754
- assert args.decoder.type in ["istftnet", "hifigan"], "Decoder type unknown"
755
-
756
- if args.decoder.type == "istftnet":
757
- from Modules.istftnet import Decoder
 
 
 
 
 
 
758
 
759
- decoder = Decoder(
760
- dim_in=args.hidden_dim,
761
- style_dim=args.style_dim,
762
- dim_out=args.n_mels,
763
- resblock_kernel_sizes=args.decoder.resblock_kernel_sizes,
764
- upsample_rates=args.decoder.upsample_rates,
765
- upsample_initial_channel=args.decoder.upsample_initial_channel,
766
- resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
767
- upsample_kernel_sizes=args.decoder.upsample_kernel_sizes,
768
- gen_istft_n_fft=args.decoder.gen_istft_n_fft,
769
- gen_istft_hop_size=args.decoder.gen_istft_hop_size,
770
- )
771
- else:
772
- from Modules.hifigan import Decoder
773
-
774
- decoder = Decoder(
775
- dim_in=args.hidden_dim,
776
- style_dim=args.style_dim,
777
- dim_out=args.n_mels,
778
- resblock_kernel_sizes=args.decoder.resblock_kernel_sizes,
779
- upsample_rates=args.decoder.upsample_rates,
780
- upsample_initial_channel=args.decoder.upsample_initial_channel,
781
- resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
782
- upsample_kernel_sizes=args.decoder.upsample_kernel_sizes,
783
- )
784
 
785
  text_encoder = TextEncoder(
786
  channels=args.hidden_dim,
@@ -804,20 +781,12 @@ def build_model(args, text_aligner, pitch_extractor, bert):
804
  dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim
805
  ) # prosodic style encoder
806
 
807
- # define diffusion model
808
- if args.multispeaker:
809
- transformer = StyleTransformer1d(
810
- channels=args.style_dim * 2,
811
- context_embedding_features=bert.config.hidden_size,
812
- context_features=args.style_dim * 2,
813
- **args.diffusion.transformer
814
- )
815
- else:
816
- transformer = Transformer1d(
817
- channels=args.style_dim * 2,
818
- context_embedding_features=bert.config.hidden_size,
819
- **args.diffusion.transformer
820
- )
821
 
822
  diffusion = AudioDiffusionConditional(
823
  in_channels=1,
@@ -839,6 +808,8 @@ def build_model(args, text_aligner, pitch_extractor, bert):
839
  diffusion.diffusion.net = transformer
840
  diffusion.unet = transformer
841
 
 
 
842
  nets = Munch(
843
  bert=bert,
844
  bert_encoder=nn.Linear(bert.config.hidden_size, args.hidden_dim),
@@ -848,8 +819,6 @@ def build_model(args, text_aligner, pitch_extractor, bert):
848
  predictor_encoder=predictor_encoder,
849
  style_encoder=style_encoder,
850
  diffusion=diffusion,
851
- text_aligner=text_aligner,
852
- pitch_extractor=pitch_extractor,
853
  mpd=MultiPeriodDiscriminator(),
854
  msd=MultiResSpecDiscriminator(),
855
  # slm discriminator head
 
1
  # coding:utf-8
 
 
 
 
 
2
  import math
3
 
 
4
  import torch
5
  import torch.nn as nn
6
  import torch.nn.functional as F
7
+ from torch.nn.utils import spectral_norm
8
+ from torch.nn.utils.parametrizations import weight_norm
9
  from Utils.ASR.models import ASRCNN
10
  from Utils.JDC.model import JDCNet
11
 
12
  from Modules.diffusion.sampler import KDiffusion, LogNormalDistribution
13
+ from Modules.diffusion.modules import StyleTransformer1d
14
  from Modules.diffusion.diffusion import AudioDiffusionConditional
15
 
16
  from Modules.discriminators import (
 
21
 
22
  from munch import Munch
23
  import yaml
24
+ from Modules.hifigan import Decoder
25
 
26
 
27
  class LearnedDownSample(nn.Module):
 
584
  def forward(self, texts, style, text_lengths, alignment, m):
585
  d = self.text_encoder(texts, style, text_lengths, m)
586
 
587
+ d.shape[0]
588
+ d.shape[1]
589
 
590
  # predict duration
591
  input_lengths = text_lengths.cpu().numpy()
 
745
  return asr_model
746
 
747
 
748
+ def build_model(args, bert):
749
+ decoder = Decoder(
750
+ dim_in=args.hidden_dim,
751
+ style_dim=args.style_dim,
752
+ dim_out=args.n_mels,
753
+ resblock_kernel_sizes=args.decoder.resblock_kernel_sizes,
754
+ upsample_rates=args.decoder.upsample_rates,
755
+ upsample_initial_channel=args.decoder.upsample_initial_channel,
756
+ resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
757
+ upsample_kernel_sizes=args.decoder.upsample_kernel_sizes,
758
+ )
759
 
760
+ # decoder = torch.compile(decoder, dynamic=True, backend="aot_eager")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
761
 
762
  text_encoder = TextEncoder(
763
  channels=args.hidden_dim,
 
781
  dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim
782
  ) # prosodic style encoder
783
 
784
+ transformer = StyleTransformer1d(
785
+ channels=args.style_dim * 2,
786
+ context_embedding_features=bert.config.hidden_size,
787
+ context_features=args.style_dim * 2,
788
+ **args.diffusion.transformer,
789
+ )
 
 
 
 
 
 
 
 
790
 
791
  diffusion = AudioDiffusionConditional(
792
  in_channels=1,
 
808
  diffusion.diffusion.net = transformer
809
  diffusion.unet = transformer
810
 
811
+ # predictor = torch.compile(predictor)
812
+
813
  nets = Munch(
814
  bert=bert,
815
  bert_encoder=nn.Linear(bert.config.hidden_size, args.hidden_dim),
 
819
  predictor_encoder=predictor_encoder,
820
  style_encoder=style_encoder,
821
  diffusion=diffusion,
 
 
822
  mpd=MultiPeriodDiscriminator(),
823
  msd=MultiResSpecDiscriminator(),
824
  # slm discriminator head
optimizers.py CHANGED
@@ -1,10 +1,5 @@
1
  # coding:utf-8
2
- import os, sys
3
- import os.path as osp
4
- import numpy as np
5
  import torch
6
- from torch import nn
7
- from torch.optim import Optimizer
8
  from functools import reduce
9
  from torch.optim import AdamW
10
 
 
1
  # coding:utf-8
 
 
 
2
  import torch
 
 
3
  from functools import reduce
4
  from torch.optim import AdamW
5
 
requirements.txt CHANGED
@@ -1,13 +1,15 @@
1
  SoundFile
2
  torchaudio
3
  munch
4
- torch
5
  pydub
6
  pyyaml
7
  librosa
8
  nltk
9
  matplotlib
10
  accelerate
 
 
11
  transformers
12
  einops
13
  einops-exts
@@ -20,5 +22,4 @@ phonemizer
20
  cached-path
21
  gradio
22
  gruut
23
- #tortoise-tts
24
  txtsplit
 
1
  SoundFile
2
  torchaudio
3
  munch
4
+ torch>=2.2.0
5
  pydub
6
  pyyaml
7
  librosa
8
  nltk
9
  matplotlib
10
  accelerate
11
+ tokenizers>=0.14
12
+ bottleneck>=1.3.6
13
  transformers
14
  einops
15
  einops-exts
 
22
  cached-path
23
  gradio
24
  gruut
 
25
  txtsplit
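requirements.txt now pins torch>=2.2.0 alongside the new tokenizers and bottleneck floors. A quick runtime check, offered as a sketch rather than part of the repo:

    import torch

    major, minor = (int(x) for x in torch.__version__.split("+")[0].split(".")[:2])
    assert (major, minor) >= (2, 2), "this branch expects torch>=2.2.0 (see requirements.txt)"
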
styletts2importable.py CHANGED
@@ -1,60 +1,57 @@
 
 
 
 
1
  from cached_path import cached_path
2
- # print("GRUUT")
3
- # from gruut_phonemize import gphonemize
4
-
5
- # from dp.phonemizer import Phonemizer
6
- print("NLTK")
7
  import nltk
8
- nltk.download('punkt')
9
- print("SCIPY")
10
- from scipy.io.wavfile import write
11
- print("TORCH STUFF")
12
- import torch
13
- print("START")
 
 
 
 
 
14
  torch.manual_seed(0)
15
  torch.backends.cudnn.benchmark = False
16
  torch.backends.cudnn.deterministic = True
17
 
18
- import random
19
- random.seed(0)
20
-
21
- import numpy as np
22
- np.random.seed(0)
23
 
24
- # load packages
25
- import time
26
- import random
27
- import yaml
28
- from munch import Munch
29
- import numpy as np
30
- import torch
31
- from torch import nn
32
- import torch.nn.functional as F
33
- import torchaudio
34
- import librosa
35
- from nltk.tokenize import word_tokenize
36
 
37
- from models import *
38
- from utils import *
39
- from text_utils import TextCleaner
40
- textclenaer = TextCleaner()
41
 
42
 
43
  to_mel = torchaudio.transforms.MelSpectrogram(
44
- n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
 
45
  mean, std = -4, 4
46
 
 
47
  def length_to_mask(lengths):
48
- mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
49
- mask = torch.gt(mask+1, lengths.unsqueeze(1))
 
 
 
 
 
50
  return mask
51
 
 
52
  def preprocess(wave):
53
  wave_tensor = torch.from_numpy(wave).float()
54
  mel_tensor = to_mel(wave_tensor)
55
  mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
56
  return mel_tensor
57
 
 
58
  def compute_style(path):
59
  wave, sr = librosa.load(path, sr=24000)
60
  audio, index = librosa.effects.trim(wave, top_db=30)
@@ -68,55 +65,151 @@ def compute_style(path):
68
 
69
  return torch.cat([ref_s, ref_p], dim=1)
70
 
71
- device = 'cpu'
 
72
  if torch.cuda.is_available():
73
- device = 'cuda'
74
  elif torch.backends.mps.is_available():
75
  print("MPS would be available but cannot be used rn")
76
- # device = 'mps'
77
-
78
- import phonemizer
79
- global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
80
- # phonemizer = Phonemizer.from_checkpoint(str(cached_path('https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_ipa_forward.pt')))
81
-
82
 
83
  # config = yaml.safe_load(open("Models/LibriTTS/config.yml"))
84
- config = yaml.safe_load(open(str(cached_path("hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/config.yml"))))
85
-
86
- # load pretrained ASR model
87
- ASR_config = config.get('ASR_config', False)
88
- ASR_path = config.get('ASR_path', False)
89
- text_aligner = load_ASR_models(ASR_path, ASR_config)
90
-
91
- # load pretrained F0 model
92
- F0_path = config.get('F0_path', False)
93
- pitch_extractor = load_F0_models(F0_path)
94
-
95
- # load BERT model
96
- from Utils.PLBERT.util import load_plbert
97
- BERT_path = config.get('PLBERT_dir', False)
 
98
  plbert = load_plbert(BERT_path)
99
 
100
- model_params = recursive_munch(config['model_params'])
101
- model = build_model(model_params, text_aligner, pitch_extractor, plbert)
 
102
  _ = [model[key].eval() for key in model]
103
  _ = [model[key].to(device) for key in model]
104
 
105
- # params_whole = torch.load("Models/LibriTTS/epochs_2nd_00020.pth", map_location='cpu')
106
- params_whole = torch.load(str(cached_path("hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth")), map_location='cpu')
107
- params = params_whole['net']
 
108
 
109
  for key in model:
110
  if key in params:
111
- print('%s loaded' % key)
112
  try:
113
  model[key].load_state_dict(params[key])
114
  except:
115
  from collections import OrderedDict
 
116
  state_dict = params[key]
117
  new_state_dict = OrderedDict()
118
  for k, v in state_dict.items():
119
- name = k[7:] # remove `module.`
120
  new_state_dict[name] = v
121
  # load params
122
  model[key].load_state_dict(new_state_dict, strict=False)
@@ -124,181 +217,34 @@ for key in model:
124
  # _load(params[key], model[key])
125
  _ = [model[key].eval() for key in model]
126
 
127
- from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule
128
 
129
  sampler = DiffusionSampler(
130
  model.diffusion.diffusion,
131
  sampler=ADPM2Sampler(),
132
- sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters
133
- clamp=False
 
 
134
  )
135
 
136
- def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1, use_gruut=False):
137
- text = text.strip()
138
- ps = global_phonemizer.phonemize([text])
139
- ps = word_tokenize(ps[0])
140
- ps = ' '.join(ps)
141
- tokens = textclenaer(ps)
142
- tokens.insert(0, 0)
143
- tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
144
-
145
- with torch.no_grad():
146
- input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
147
- text_mask = length_to_mask(input_lengths).to(device)
148
-
149
- t_en = model.text_encoder(tokens, input_lengths, text_mask)
150
- bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
151
- d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
152
-
153
- s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),
154
- embedding=bert_dur,
155
- embedding_scale=embedding_scale,
156
- features=ref_s, # reference from the same speaker as the embedding
157
- num_steps=diffusion_steps).squeeze(1)
158
-
159
-
160
- s = s_pred[:, 128:]
161
- ref = s_pred[:, :128]
162
-
163
- ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
164
- s = beta * s + (1 - beta) * ref_s[:, 128:]
165
-
166
- d = model.predictor.text_encoder(d_en,
167
- s, input_lengths, text_mask)
168
-
169
- x, _ = model.predictor.lstm(d)
170
- duration = model.predictor.duration_proj(x)
171
 
172
- duration = torch.sigmoid(duration).sum(axis=-1)
173
- pred_dur = torch.round(duration.squeeze()).clamp(min=1)
174
-
175
-
176
- pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
177
- c_frame = 0
178
- for i in range(pred_aln_trg.size(0)):
179
- pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
180
- c_frame += int(pred_dur[i].data)
181
-
182
- # encode prosody
183
- en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
184
- if model_params.decoder.type == "hifigan":
185
- asr_new = torch.zeros_like(en)
186
- asr_new[:, :, 0] = en[:, :, 0]
187
- asr_new[:, :, 1:] = en[:, :, 0:-1]
188
- en = asr_new
189
-
190
- F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
191
-
192
- asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
193
- if model_params.decoder.type == "hifigan":
194
- asr_new = torch.zeros_like(asr)
195
- asr_new[:, :, 0] = asr[:, :, 0]
196
- asr_new[:, :, 1:] = asr[:, :, 0:-1]
197
- asr = asr_new
198
-
199
- out = model.decoder(asr,
200
- F0_pred, N_pred, ref.squeeze().unsqueeze(0))
201
-
202
-
203
- return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later
204
-
205
- def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion_steps=5, embedding_scale=1, use_gruut=False):
206
- text = text.strip()
207
- ps = global_phonemizer.phonemize([text])
208
- ps = word_tokenize(ps[0])
209
- ps = ' '.join(ps)
210
- ps = ps.replace('``', '"')
211
- ps = ps.replace("''", '"')
212
-
213
- tokens = textclenaer(ps)
214
- tokens.insert(0, 0)
215
- tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
216
-
217
- with torch.no_grad():
218
- input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
219
- text_mask = length_to_mask(input_lengths).to(device)
220
-
221
- t_en = model.text_encoder(tokens, input_lengths, text_mask)
222
- bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
223
- d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
224
-
225
- s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),
226
- embedding=bert_dur,
227
- embedding_scale=embedding_scale,
228
- features=ref_s, # reference from the same speaker as the embedding
229
- num_steps=diffusion_steps).squeeze(1)
230
-
231
- if s_prev is not None:
232
- # convex combination of previous and current style
233
- s_pred = t * s_prev + (1 - t) * s_pred
234
-
235
- s = s_pred[:, 128:]
236
- ref = s_pred[:, :128]
237
-
238
- ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
239
- s = beta * s + (1 - beta) * ref_s[:, 128:]
240
-
241
- s_pred = torch.cat([ref, s], dim=-1)
242
-
243
- d = model.predictor.text_encoder(d_en,
244
- s, input_lengths, text_mask)
245
-
246
- x, _ = model.predictor.lstm(d)
247
- duration = model.predictor.duration_proj(x)
248
-
249
- duration = torch.sigmoid(duration).sum(axis=-1)
250
- pred_dur = torch.round(duration.squeeze()).clamp(min=1)
251
-
252
-
253
- pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
254
- c_frame = 0
255
- for i in range(pred_aln_trg.size(0)):
256
- pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
257
- c_frame += int(pred_dur[i].data)
258
-
259
- # encode prosody
260
- en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
261
- if model_params.decoder.type == "hifigan":
262
- asr_new = torch.zeros_like(en)
263
- asr_new[:, :, 0] = en[:, :, 0]
264
- asr_new[:, :, 1:] = en[:, :, 0:-1]
265
- en = asr_new
266
-
267
- F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
268
-
269
- asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
270
- if model_params.decoder.type == "hifigan":
271
- asr_new = torch.zeros_like(asr)
272
- asr_new[:, :, 0] = asr[:, :, 0]
273
- asr_new[:, :, 1:] = asr[:, :, 0:-1]
274
- asr = asr_new
275
-
276
- out = model.decoder(asr,
277
- F0_pred, N_pred, ref.squeeze().unsqueeze(0))
278
-
279
-
280
- return out.squeeze().cpu().numpy()[..., :-100], s_pred # weird pulse at the end of the model, need to be fixed later
281
-
282
- def STinference(text, ref_s, ref_text, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1, use_gruut=False):
283
  text = text.strip()
284
  ps = global_phonemizer.phonemize([text])
285
  ps = word_tokenize(ps[0])
286
- ps = ' '.join(ps)
287
-
288
- tokens = textclenaer(ps)
289
  tokens.insert(0, 0)
290
  tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
291
 
292
- ref_text = ref_text.strip()
293
- ps = global_phonemizer.phonemize([ref_text])
294
- ps = word_tokenize(ps[0])
295
- ps = ' '.join(ps)
296
-
297
- ref_tokens = textclenaer(ps)
298
- ref_tokens.insert(0, 0)
299
- ref_tokens = torch.LongTensor(ref_tokens).to(device).unsqueeze(0)
300
-
301
-
302
  with torch.no_grad():
303
  input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
304
  text_mask = length_to_mask(input_lengths).to(device)
@@ -307,24 +253,21 @@ def STinference(text, ref_s, ref_text, alpha = 0.3, beta = 0.7, diffusion_steps=
307
  bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
308
  d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
309
 
310
- ref_input_lengths = torch.LongTensor([ref_tokens.shape[-1]]).to(device)
311
- ref_text_mask = length_to_mask(ref_input_lengths).to(device)
312
- ref_bert_dur = model.bert(ref_tokens, attention_mask=(~ref_text_mask).int())
313
- s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device),
314
- embedding=bert_dur,
315
- embedding_scale=embedding_scale,
316
- features=ref_s, # reference from the same speaker as the embedding
317
- num_steps=diffusion_steps).squeeze(1)
318
-
319
 
320
  s = s_pred[:, 128:]
321
  ref = s_pred[:, :128]
322
 
323
- ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
324
- s = beta * s + (1 - beta) * ref_s[:, 128:]
325
 
326
- d = model.predictor.text_encoder(d_en,
327
- s, input_lengths, text_mask)
328
 
329
  x, _ = model.predictor.lstm(d)
330
  duration = model.predictor.duration_proj(x)
@@ -332,32 +275,29 @@ def STinference(text, ref_s, ref_text, alpha = 0.3, beta = 0.7, diffusion_steps=
332
  duration = torch.sigmoid(duration).sum(axis=-1)
333
  pred_dur = torch.round(duration.squeeze()).clamp(min=1)
334
 
335
-
336
  pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
337
  c_frame = 0
338
  for i in range(pred_aln_trg.size(0)):
339
- pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
340
  c_frame += int(pred_dur[i].data)
341
 
342
  # encode prosody
343
- en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))
344
- if model_params.decoder.type == "hifigan":
345
- asr_new = torch.zeros_like(en)
346
- asr_new[:, :, 0] = en[:, :, 0]
347
- asr_new[:, :, 1:] = en[:, :, 0:-1]
348
- en = asr_new
349
 
350
  F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
351
 
352
- asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))
353
- if model_params.decoder.type == "hifigan":
354
- asr_new = torch.zeros_like(asr)
355
- asr_new[:, :, 0] = asr[:, :, 0]
356
- asr_new[:, :, 1:] = asr[:, :, 0:-1]
357
- asr = asr_new
358
-
359
- out = model.decoder(asr,
360
- F0_pred, N_pred, ref.squeeze().unsqueeze(0))
361
 
 
362
 
363
- return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+ import torch
4
+ import torchaudio
5
  from cached_path import cached_path
6
+ import random
 
 
 
 
7
  import nltk
8
+ from models import build_model
9
+ from text_utils import TextCleaner
10
+ from nltk.tokenize import word_tokenize
11
+ import phonemizer
12
+ from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule
13
+ from utils import recursive_munch
14
+ from Utils.PLBERT.util import load_plbert
15
+
16
+ nltk.download("punkt")
17
+ np.random.seed(0)
18
+ random.seed(0)
19
  torch.manual_seed(0)
20
  torch.backends.cudnn.benchmark = False
21
  torch.backends.cudnn.deterministic = True
22
 
23
+ global_phonemizer = phonemizer.backend.EspeakBackend(
24
+ language="en-us", preserve_punctuation=True, with_stress=True
25
+ )
 
 
26
 
27
 
28
+ textcleaner = TextCleaner()
 
 
 
29
 
30
 
31
  to_mel = torchaudio.transforms.MelSpectrogram(
32
+ n_mels=80, n_fft=2048, win_length=1200, hop_length=300
33
+ )
34
  mean, std = -4, 4
35
 
36
+
37
  def length_to_mask(lengths):
38
+ mask = (
39
+ torch.arange(lengths.max())
40
+ .unsqueeze(0)
41
+ .expand(lengths.shape[0], -1)
42
+ .type_as(lengths)
43
+ )
44
+ mask = torch.gt(mask + 1, lengths.unsqueeze(1))
45
  return mask
46
 
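length_to_mask is only reformatted here, not changed: it builds a boolean padding mask where True marks positions past each sequence length. A small illustrative example:

    lengths = torch.LongTensor([3, 5])
    length_to_mask(lengths)
    # tensor([[False, False, False,  True,  True],
    #         [False, False, False, False, False]])
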
47
+
48
  def preprocess(wave):
49
  wave_tensor = torch.from_numpy(wave).float()
50
  mel_tensor = to_mel(wave_tensor)
51
  mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
52
  return mel_tensor
53
 
54
+
55
  def compute_style(path):
56
  wave, sr = librosa.load(path, sr=24000)
57
  audio, index = librosa.effects.trim(wave, top_db=30)
 
65
 
66
  return torch.cat([ref_s, ref_p], dim=1)
67
 
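compute_style still returns the concatenated acoustic and prosodic reference styles for a wav file, which is what the precomputed voices/*.wav.npy files added in this commit appear to hold (a 1x256 float32 array, matching their 1152-byte size). A hedged sketch of precomputing one, assuming a matching reference wav is available locally:

    import numpy as np

    ref_s = compute_style("voices/f-us-1.wav")              # tensor of shape (1, 256)
    np.save("voices/f-us-1.wav.npy", ref_s.cpu().numpy())   # cache it so later runs can skip the audio pass
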
68
+
69
+ device = "cpu"
70
  if torch.cuda.is_available():
71
+ device = "cuda"
72
  elif torch.backends.mps.is_available():
73
  print("MPS would be available but cannot be used rn")
74
+ # device = "mps"
 
 
 
 
 
75
 
76
  # config = yaml.safe_load(open("Models/LibriTTS/config.yml"))
77
+ config = {
78
+ "ASR_config": "Utils/ASR/config.yml",
79
+ "ASR_path": "Utils/ASR/epoch_00080.pth",
80
+ "F0_path": "Utils/JDC/bst.t7",
81
+ "PLBERT_dir": "Utils/PLBERT/",
82
+ "batch_size": 8,
83
+ "data_params": {
84
+ "OOD_data": "Data/OOD_texts.txt",
85
+ "min_length": 50,
86
+ "root_path": "",
87
+ "train_data": "Data/train_list.txt",
88
+ "val_data": "Data/val_list.txt",
89
+ },
90
+ "device": "cuda",
91
+ "epochs_1st": 40,
92
+ "epochs_2nd": 25,
93
+ "first_stage_path": "first_stage.pth",
94
+ "load_only_params": False,
95
+ "log_dir": "Models/LibriTTS",
96
+ "log_interval": 10,
97
+ "loss_params": {
98
+ "TMA_epoch": 4,
99
+ "diff_epoch": 0,
100
+ "joint_epoch": 0,
101
+ "lambda_F0": 1.0,
102
+ "lambda_ce": 20.0,
103
+ "lambda_diff": 1.0,
104
+ "lambda_dur": 1.0,
105
+ "lambda_gen": 1.0,
106
+ "lambda_mel": 5.0,
107
+ "lambda_mono": 1.0,
108
+ "lambda_norm": 1.0,
109
+ "lambda_s2s": 1.0,
110
+ "lambda_slm": 1.0,
111
+ "lambda_sty": 1.0,
112
+ },
113
+ "max_len": 300,
114
+ "model_params": {
115
+ "decoder": {
116
+ "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
117
+ "resblock_kernel_sizes": [3, 7, 11],
118
+ "type": "hifigan",
119
+ "upsample_initial_channel": 512,
120
+ "upsample_kernel_sizes": [20, 10, 6, 4],
121
+ "upsample_rates": [10, 5, 3, 2],
122
+ },
123
+ "diffusion": {
124
+ "dist": {
125
+ "estimate_sigma_data": True,
126
+ "mean": -3.0,
127
+ "sigma_data": 0.19926648961191362,
128
+ "std": 1.0,
129
+ },
130
+ "embedding_mask_proba": 0.1,
131
+ "transformer": {
132
+ "head_features": 64,
133
+ "multiplier": 2,
134
+ "num_heads": 8,
135
+ "num_layers": 3,
136
+ },
137
+ },
138
+ "dim_in": 64,
139
+ "dropout": 0,
140
+ "hidden_dim": 512,
141
+ "max_conv_dim": 512,
142
+ "max_dur": 50,
143
+ "multispeaker": True,
144
+ "n_layer": 3,
145
+ "n_mels": 80,
146
+ "n_token": 178,
147
+ "slm": {
148
+ "hidden": 768,
149
+ "initial_channel": 64,
150
+ "model": "microsoft/wavlm-base-plus",
151
+ "nlayers": 13,
152
+ "sr": 16000,
153
+ },
154
+ "style_dim": 128,
155
+ },
156
+ "optimizer_params": {"bert_lr": 1e-05, "ft_lr": 1e-05, "lr": 0.0001},
157
+ "preprocess_params": {
158
+ "spect_params": {"hop_length": 300, "n_fft": 2048, "win_length": 1200},
159
+ "sr": 24000,
160
+ },
161
+ "pretrained_model": "Models/LibriTTS/epoch_2nd_00002.pth",
162
+ "save_freq": 1,
163
+ "second_stage_load_pretrained": True,
164
+ "slmadv_params": {
165
+ "batch_percentage": 0.5,
166
+ "iter": 20,
167
+ "max_len": 500,
168
+ "min_len": 400,
169
+ "scale": 0.01,
170
+ "sig": 1.5,
171
+ "thresh": 5,
172
+ },
173
+ }
174
+
175
+
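The LibriTTS config is now inlined as a plain dict instead of being fetched from the Hub; recursive_munch below turns its nested dicts into attribute-accessible Munch objects, e.g.:

    model_params = recursive_munch(config["model_params"])
    model_params.decoder.type                     # "hifigan"
    model_params.diffusion.transformer.num_heads  # 8
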
176
+ BERT_path = config.get("PLBERT_dir", False)
177
  plbert = load_plbert(BERT_path)
178
 
179
+
180
+ model_params = recursive_munch(config["model_params"])
181
+ model = build_model(model_params, plbert)
182
  _ = [model[key].eval() for key in model]
183
  _ = [model[key].to(device) for key in model]
184
 
185
+ # for key in model:
186
+ # print(f"Compiling {key}")
187
+ # model[key] = torch.compile(model[key])
188
+ # print(f"Compiled {key}")
189
+
190
+
191
+ params_whole = torch.load(
192
+ str(
193
+ cached_path(
194
+ "hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth"
195
+ )
196
+ ),
197
+ map_location="cpu",
198
+ )
199
+ params = params_whole["net"]
200
 
201
  for key in model:
202
  if key in params:
203
+ print("%s loaded" % key)
204
  try:
205
  model[key].load_state_dict(params[key])
206
  except:
207
  from collections import OrderedDict
208
+
209
  state_dict = params[key]
210
  new_state_dict = OrderedDict()
211
  for k, v in state_dict.items():
212
+ name = k[7:] # remove `module.`
213
  new_state_dict[name] = v
214
  # load params
215
  model[key].load_state_dict(new_state_dict, strict=False)
 
217
  # _load(params[key], model[key])
218
  _ = [model[key].eval() for key in model]
219
 
 
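The fallback branch above strips the "module." prefix that torch.nn.DataParallel adds to checkpoint keys before retrying the load. An equivalent, more explicit form of that loop (a sketch, not part of the diff; str.removeprefix needs Python 3.9+):

    new_state_dict = {k.removeprefix("module."): v for k, v in params[key].items()}
    model[key].load_state_dict(new_state_dict, strict=False)
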
220
 
221
  sampler = DiffusionSampler(
222
  model.diffusion.diffusion,
223
  sampler=ADPM2Sampler(),
224
+ sigma_schedule=KarrasSchedule(
225
+ sigma_min=0.0001, sigma_max=3.0, rho=9.0
226
+ ), # empirical parameters
227
+ clamp=False,
228
  )
229
 
230
 
231
+ def inference(
232
+ text,
233
+ ref_s,
234
+ alpha=0.3,
235
+ beta=0.7,
236
+ diffusion_steps=5,
237
+ embedding_scale=1,
238
+ use_gruut=False,
239
+ ):
 
240
  text = text.strip()
241
  ps = global_phonemizer.phonemize([text])
242
  ps = word_tokenize(ps[0])
243
+ ps = " ".join(ps)
244
+ tokens = textcleaner(ps)
 
245
  tokens.insert(0, 0)
246
  tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
247
 
248
  with torch.no_grad():
249
  input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
250
  text_mask = length_to_mask(input_lengths).to(device)
 
253
  bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
254
  d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
255
 
256
+ s_pred = sampler(
257
+ noise=torch.randn((1, 256)).unsqueeze(1).to(device),
258
+ embedding=bert_dur,
259
+ embedding_scale=embedding_scale,
260
+ features=ref_s, # reference from the same speaker as the embedding
261
+ num_steps=diffusion_steps,
262
+ ).squeeze(1)
 
 
263
 
264
  s = s_pred[:, 128:]
265
  ref = s_pred[:, :128]
266
 
267
+ ref = alpha * ref + (1 - alpha) * ref_s[:, :128]
268
+ s = beta * s + (1 - beta) * ref_s[:, 128:]
269
 
270
+ d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
 
271
 
272
  x, _ = model.predictor.lstm(d)
273
  duration = model.predictor.duration_proj(x)
 
275
  duration = torch.sigmoid(duration).sum(axis=-1)
276
  pred_dur = torch.round(duration.squeeze()).clamp(min=1)
277
 
 
278
  pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
279
  c_frame = 0
280
  for i in range(pred_aln_trg.size(0)):
281
+ pred_aln_trg[i, c_frame : c_frame + int(pred_dur[i].data)] = 1
282
  c_frame += int(pred_dur[i].data)
283
 
284
  # encode prosody
285
+ en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
286
+ asr_new = torch.zeros_like(en)
287
+ asr_new[:, :, 0] = en[:, :, 0]
288
+ asr_new[:, :, 1:] = en[:, :, 0:-1]
289
+ en = asr_new
 
290
 
291
  F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
292
 
293
+ asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
294
+ asr_new = torch.zeros_like(asr)
295
+ asr_new[:, :, 0] = asr[:, :, 0]
296
+ asr_new[:, :, 1:] = asr[:, :, 0:-1]
297
+ asr = asr_new
 
 
 
 
298
 
299
+ out = model.decoder(asr, F0_pred, N_pred, ref.squeeze().unsqueeze(0))
300
 
301
+ return (
302
+ out.squeeze().cpu().numpy()[..., :-50]
303
+ ) # weird pulse at the end of the model, need to be fixed later
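End-to-end, the refactored module can be driven roughly like this; a hedged sketch assuming espeak-ng is installed for the phonemizer backend and that a 24 kHz reference wav exists at the given path:

    import soundfile as sf
    from styletts2importable import compute_style, inference

    ref_s = compute_style("voices/m-us-1.wav")
    wav = inference("StyleTTS 2 running from the refactored module.", ref_s,
                    alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)
    sf.write("out.wav", wav, 24000)  # inference returns a float numpy array at 24 kHz
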
train_finetune.py CHANGED
@@ -7,8 +7,6 @@ import numpy as np
7
  import torch
8
  from torch import nn
9
  import torch.nn.functional as F
10
- import torchaudio
11
- import librosa
12
  import click
13
  import shutil
14
  import warnings
@@ -18,8 +16,6 @@ from torch.utils.tensorboard import SummaryWriter
18
 
19
  from meldataset import build_dataloader
20
 
21
- from Utils.ASR.models import ASRCNN
22
- from Utils.JDC.model import JDCNet
23
  from Utils.PLBERT.util import load_plbert
24
 
25
  from models import *
@@ -75,7 +71,7 @@ def main(config_path):
75
  epochs = config.get("epochs", 200)
76
  save_freq = config.get("save_freq", 2)
77
  log_interval = config.get("log_interval", 10)
78
- saving_epoch = config.get("save_freq", 2)
79
 
80
  data_params = config.get("data_params", None)
81
  sr = config["preprocess_params"].get("sr", 24000)
@@ -245,11 +241,11 @@ def main(config_path):
245
  n_down = model.text_aligner.n_down
246
 
247
  best_loss = float("inf") # best test loss
248
- loss_train_record = list([])
249
- loss_test_record = list([])
250
  iters = 0
251
 
252
- criterion = nn.L1Loss() # F0 loss (regression)
253
  torch.cuda.empty_cache()
254
 
255
  stft_loss = MultiResolutionSTFTLoss().to(device)
@@ -257,7 +253,6 @@ def main(config_path):
257
  print("BERT", optimizer.optimizers["bert"])
258
  print("decoder", optimizer.optimizers["decoder"])
259
 
260
- start_ds = False
261
 
262
  running_std = []
263
 
@@ -302,7 +297,7 @@ def main(config_path):
302
  ) = batch
303
  with torch.no_grad():
304
  mask = length_to_mask(mel_input_length // (2**n_down)).to(device)
305
- mel_mask = length_to_mask(mel_input_length).to(device)
306
  text_mask = length_to_mask(input_lengths).to(texts.device)
307
 
308
  # compute reference styles
 
7
  import torch
8
  from torch import nn
9
  import torch.nn.functional as F
 
 
10
  import click
11
  import shutil
12
  import warnings
 
16
 
17
  from meldataset import build_dataloader
18
 
 
 
19
  from Utils.PLBERT.util import load_plbert
20
 
21
  from models import *
 
71
  epochs = config.get("epochs", 200)
72
  save_freq = config.get("save_freq", 2)
73
  log_interval = config.get("log_interval", 10)
74
+ config.get("save_freq", 2)
75
 
76
  data_params = config.get("data_params", None)
77
  sr = config["preprocess_params"].get("sr", 24000)
 
241
  n_down = model.text_aligner.n_down
242
 
243
  best_loss = float("inf") # best test loss
244
+ list([])
245
+ list([])
246
  iters = 0
247
 
248
+ nn.L1Loss() # F0 loss (regression)
249
  torch.cuda.empty_cache()
250
 
251
  stft_loss = MultiResolutionSTFTLoss().to(device)
 
253
  print("BERT", optimizer.optimizers["bert"])
254
  print("decoder", optimizer.optimizers["decoder"])
255
 
 
256
 
257
  running_std = []
258
 
 
297
  ) = batch
298
  with torch.no_grad():
299
  mask = length_to_mask(mel_input_length // (2**n_down)).to(device)
300
+ length_to_mask(mel_input_length).to(device)
301
  text_mask = length_to_mask(input_lengths).to(texts.device)
302
 
303
  # compute reference styles
train_first.py CHANGED
@@ -1,7 +1,5 @@
1
  import os
2
  import os.path as osp
3
- import re
4
- import sys
5
  import yaml
6
  import shutil
7
  import numpy as np
@@ -17,10 +15,7 @@ import yaml
17
  from munch import Munch
18
  import numpy as np
19
  import torch
20
- from torch import nn
21
  import torch.nn.functional as F
22
- import torchaudio
23
- import librosa
24
 
25
  from models import *
26
  from meldataset import build_dataloader
@@ -30,7 +25,6 @@ from optimizers import build_optimizer
30
  import time
31
 
32
  from accelerate import Accelerator
33
- from accelerate.utils import LoggerType
34
  from accelerate import DistributedDataParallelKwargs
35
 
36
  from torch.utils.tensorboard import SummaryWriter
@@ -69,7 +63,7 @@ def main(config_path):
69
  device = accelerator.device
70
 
71
  epochs = config.get("epochs_1st", 200)
72
- save_freq = config.get("save_freq", 2)
73
  log_interval = config.get("log_interval", 10)
74
  saving_epoch = config.get("save_freq", 2)
75
 
@@ -137,8 +131,8 @@ def main(config_path):
137
  model = build_model(model_params, text_aligner, pitch_extractor, plbert)
138
 
139
  best_loss = float("inf") # best test loss
140
- loss_train_record = list([])
141
- loss_test_record = list([])
142
 
143
  loss_params = Munch(config["loss_params"])
144
  TMA_epoch = loss_params.TMA_epoch
 
1
  import os
2
  import os.path as osp
 
 
3
  import yaml
4
  import shutil
5
  import numpy as np
 
15
  from munch import Munch
16
  import numpy as np
17
  import torch
 
18
  import torch.nn.functional as F
 
 
19
 
20
  from models import *
21
  from meldataset import build_dataloader
 
25
  import time
26
 
27
  from accelerate import Accelerator
 
28
  from accelerate import DistributedDataParallelKwargs
29
 
30
  from torch.utils.tensorboard import SummaryWriter
 
63
  device = accelerator.device
64
 
65
  epochs = config.get("epochs_1st", 200)
66
+ config.get("save_freq", 2)
67
  log_interval = config.get("log_interval", 10)
68
  saving_epoch = config.get("save_freq", 2)
69
 
 
131
  model = build_model(model_params, text_aligner, pitch_extractor, plbert)
132
 
133
  best_loss = float("inf") # best test loss
134
+ list([])
135
+ list([])
136
 
137
  loss_params = Munch(config["loss_params"])
138
  TMA_epoch = loss_params.TMA_epoch
train_second.py CHANGED
@@ -1,5 +1,4 @@
1
  # load packages
2
- import random
3
  import yaml
4
  import time
5
  from munch import Munch
@@ -7,8 +6,6 @@ import numpy as np
7
  import torch
8
  from torch import nn
9
  import torch.nn.functional as F
10
- import torchaudio
11
- import librosa
12
  import click
13
  import shutil
14
  import warnings
@@ -18,8 +15,6 @@ from torch.utils.tensorboard import SummaryWriter
18
 
19
  from meldataset import build_dataloader
20
 
21
- from Utils.ASR.models import ASRCNN
22
- from Utils.JDC.model import JDCNet
23
  from Utils.PLBERT.util import load_plbert
24
 
25
  from models import *
@@ -73,7 +68,7 @@ def main(config_path):
73
  batch_size = config.get("batch_size", 10)
74
 
75
  epochs = config.get("epochs_2nd", 200)
76
- save_freq = config.get("save_freq", 2)
77
  log_interval = config.get("log_interval", 10)
78
  saving_epoch = config.get("save_freq", 2)
79
 
@@ -245,11 +240,11 @@ def main(config_path):
245
  n_down = model.text_aligner.n_down
246
 
247
  best_loss = float("inf") # best test loss
248
- loss_train_record = list([])
249
- loss_test_record = list([])
250
  iters = 0
251
 
252
- criterion = nn.L1Loss() # F0 loss (regression)
253
  torch.cuda.empty_cache()
254
 
255
  stft_loss = MultiResolutionSTFTLoss().to(device)
@@ -303,7 +298,7 @@ def main(config_path):
303
 
304
  with torch.no_grad():
305
  mask = length_to_mask(mel_input_length // (2**n_down)).to(device)
306
- mel_mask = length_to_mask(mel_input_length).to(device)
307
  text_mask = length_to_mask(input_lengths).to(texts.device)
308
 
309
  try:
@@ -445,7 +440,7 @@ def main(config_path):
445
  F0_real, _, F0 = model.pitch_extractor(gt.unsqueeze(1))
446
  F0 = F0.reshape(F0.shape[0], F0.shape[1] * 2, F0.shape[2], 1).squeeze()
447
 
448
- asr_real = model.text_aligner.get_feature(gt)
449
 
450
  N_real = log_norm(gt.unsqueeze(1)).squeeze(1)
451
 
 
1
  # load packages
 
2
  import yaml
3
  import time
4
  from munch import Munch
 
6
  import torch
7
  from torch import nn
8
  import torch.nn.functional as F
 
 
9
  import click
10
  import shutil
11
  import warnings
 
15
 
16
  from meldataset import build_dataloader
17
 
 
 
18
  from Utils.PLBERT.util import load_plbert
19
 
20
  from models import *
 
68
  batch_size = config.get("batch_size", 10)
69
 
70
  epochs = config.get("epochs_2nd", 200)
71
+ config.get("save_freq", 2)
72
  log_interval = config.get("log_interval", 10)
73
  saving_epoch = config.get("save_freq", 2)
74
 
 
240
  n_down = model.text_aligner.n_down
241
 
242
  best_loss = float("inf") # best test loss
243
+ list([])
244
+ list([])
245
  iters = 0
246
 
247
+ nn.L1Loss() # F0 loss (regression)
248
  torch.cuda.empty_cache()
249
 
250
  stft_loss = MultiResolutionSTFTLoss().to(device)
 
298
 
299
  with torch.no_grad():
300
  mask = length_to_mask(mel_input_length // (2**n_down)).to(device)
301
+ length_to_mask(mel_input_length).to(device)
302
  text_mask = length_to_mask(input_lengths).to(texts.device)
303
 
304
  try:
 
440
  F0_real, _, F0 = model.pitch_extractor(gt.unsqueeze(1))
441
  F0 = F0.reshape(F0.shape[0], F0.shape[1] * 2, F0.shape[2], 1).squeeze()
442
 
443
+ model.text_aligner.get_feature(gt)
444
 
445
  N_real = log_norm(gt.unsqueeze(1)).squeeze(1)
446
 
utils.py CHANGED
@@ -1,13 +1,6 @@
1
- from monotonic_align import maximum_path
2
- from monotonic_align import mask_from_lens
3
  from monotonic_align.core import maximum_path_c
4
  import numpy as np
5
  import torch
6
- import copy
7
- from torch import nn
8
- import torch.nn.functional as F
9
- import torchaudio
10
- import librosa
11
  import matplotlib.pyplot as plt
12
  from munch import Munch
13
 
 
 
 
1
  from monotonic_align.core import maximum_path_c
2
  import numpy as np
3
  import torch
 
 
 
 
 
4
  import matplotlib.pyplot as plt
5
  from munch import Munch
6
 
voices/f-us-1.wav.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24012e9cefccf44b9187cb7e61907eac7120e96115f77b922c74c0e36b5b45f6
3
+ size 1152
voices/f-us-2.wav.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25df85a2fb487a2e55189fbd02173c7b84028b0cbdb056aaa61d0f853136ebba
3
+ size 1152
voices/f-us-3.wav.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ad8ce27bfbe1d967d3b5f5f0894b6f3d899c1f23dec5a85157318ecb719eab7
3
+ size 1152
voices/f-us-4.wav.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:397ead679dbd859550cfe2a2635d5ddc78c0b400cd434fdd0dd41cac88ceb667
3
+ size 1152
voices/m-us-1.wav.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d22a0041c44675a8e2d8b9830a3261f5359e31a8418ea5ef19f9ba76bda2c13
3
+ size 1152
voices/m-us-2.wav.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e09851ffb127fbd3a17329b8d68a27dec5f5920fd5f1aaa7b871046552a2c902
3
+ size 1152
voices/m-us-3.wav.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98ff10843ecb12f0fe4c31c9309da6f594ed2dd7248d163c927fda05dd608336
3
+ size 1152
voices/m-us-4.wav.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:275b5b956cef2e6eff4258fabb8fffdab130d5e858fd826258fd76e46296263d
3
+ size 1152
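The eight new voices/*.wav.npy blobs are tiny (1152 bytes each), consistent with one 256-dimensional style vector per speaker. A final hedged sketch of consuming one directly, assuming it stores what compute_style would have returned for that voice:

    import numpy as np
    import torch
    from styletts2importable import device, inference

    voice = torch.from_numpy(np.load("voices/f-us-2.wav.npy")).float().to(device)
    if voice.dim() == 1:
        voice = voice.unsqueeze(0)   # inference expects shape (1, 256)
    wav = inference("Hello from a precomputed voice.", voice)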