Flux9665 committed on
Commit
6a79837
·
1 Parent(s): b5805eb

update to current version

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. InferenceInterfaces/ControllableInterface.py +52 -7
  2. InferenceInterfaces/ToucanTTSInterface.py +7 -7
  3. InferenceInterfaces/UtteranceCloner.py +4 -4
  4. Models/ToucanTTS_Meta/best.pt +0 -3
  5. Models/Vocoder/best.pt +0 -3
  6. {Architectures → Modules}/Aligner/Aligner.py +0 -0
  7. {Architectures → Modules}/Aligner/CodecAlignerDataset.py +0 -0
  8. {Architectures → Modules}/Aligner/README.md +0 -0
  9. {Architectures → Modules}/Aligner/Reconstructor.py +0 -0
  10. {Architectures → Modules}/Aligner/__init__.py +0 -0
  11. {Architectures → Modules}/Aligner/autoaligner_train_loop.py +2 -2
  12. {Architectures → Modules}/ControllabilityGAN/GAN.py +1 -1
  13. {Architectures → Modules}/ControllabilityGAN/__init__.py +0 -0
  14. {Architectures → Modules}/ControllabilityGAN/dataset/__init__.py +0 -0
  15. {Architectures → Modules}/ControllabilityGAN/dataset/speaker_embeddings_dataset.py +0 -0
  16. {Architectures → Modules}/ControllabilityGAN/wgan/__init__.py +0 -0
  17. {Architectures → Modules}/ControllabilityGAN/wgan/init_weights.py +0 -0
  18. {Architectures → Modules}/ControllabilityGAN/wgan/init_wgan.py +2 -2
  19. {Architectures → Modules}/ControllabilityGAN/wgan/resnet_1.py +0 -0
  20. {Architectures → Modules}/ControllabilityGAN/wgan/resnet_init.py +4 -4
  21. {Architectures → Modules}/ControllabilityGAN/wgan/wgan_qc.py +0 -0
  22. {Architectures → Modules}/EmbeddingModel/GST.py +1 -1
  23. {Architectures → Modules}/EmbeddingModel/README.md +0 -0
  24. {Architectures → Modules}/EmbeddingModel/StyleEmbedding.py +2 -2
  25. {Architectures → Modules}/EmbeddingModel/StyleTTSEncoder.py +0 -0
  26. {Architectures → Modules}/EmbeddingModel/__init__.py +0 -0
  27. {Architectures → Modules}/GeneralLayers/Attention.py +0 -0
  28. {Architectures → Modules}/GeneralLayers/ConditionalLayerNorm.py +0 -1
  29. {Architectures → Modules}/GeneralLayers/Conformer.py +27 -17
  30. {Architectures → Modules}/GeneralLayers/Convolution.py +0 -0
  31. {Architectures → Modules}/GeneralLayers/DurationPredictor.py +3 -3
  32. {Architectures → Modules}/GeneralLayers/EncoderLayer.py +1 -1
  33. {Architectures → Modules}/GeneralLayers/LayerNorm.py +0 -0
  34. {Architectures → Modules}/GeneralLayers/LengthRegulator.py +0 -0
  35. {Architectures → Modules}/GeneralLayers/MultiLayeredConv1d.py +0 -0
  36. {Architectures → Modules}/GeneralLayers/MultiSequential.py +0 -0
  37. {Architectures → Modules}/GeneralLayers/PositionalEncoding.py +0 -0
  38. {Architectures → Modules}/GeneralLayers/PositionwiseFeedForward.py +0 -0
  39. {Architectures → Modules}/GeneralLayers/README.md +0 -0
  40. {Architectures → Modules}/GeneralLayers/ResidualBlock.py +0 -0
  41. {Architectures → Modules}/GeneralLayers/ResidualStack.py +0 -0
  42. {Architectures → Modules}/GeneralLayers/STFT.py +0 -0
  43. {Architectures → Modules}/GeneralLayers/Swish.py +0 -0
  44. {Architectures → Modules}/GeneralLayers/VariancePredictor.py +3 -3
  45. {Architectures → Modules}/GeneralLayers/__init__.py +0 -0
  46. {Architectures → Modules}/README.md +0 -0
  47. {Architectures → Modules}/ToucanTTS/CodecDiscriminator.py +0 -0
  48. {Architectures → Modules}/ToucanTTS/CodecRefinementTransformer.py +2 -2
  49. {Architectures → Modules}/ToucanTTS/DurationCalculator.py +0 -0
  50. {Architectures → Modules}/ToucanTTS/EnergyCalculator.py +1 -1
InferenceInterfaces/ControllableInterface.py CHANGED
@@ -2,8 +2,8 @@ import os
2
 
3
  import torch
4
 
5
- from Architectures.ControllabilityGAN.GAN import GanWrapper
6
  from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
 
7
  from Utility.storage_config import MODELS_DIR
8
 
9
 
@@ -16,14 +16,18 @@ class ControllableInterface:
16
  os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
17
  os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
18
  self.device = "cuda" if gpu_id != "cpu" else "cpu"
19
- self.model = ToucanTTSInterface(device=self.device, tts_model_path="Meta", language="eng")
20
  self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device=self.device)
21
  self.generated_speaker_embeds = list()
22
  self.available_artificial_voices = available_artificial_voices
 
 
23
 
24
  def read(self,
25
  prompt,
26
- audio,
 
 
27
  voice_seed,
28
  prosody_creativity,
29
  duration_scaling_factor,
@@ -38,7 +42,15 @@ class ControllableInterface:
38
  emb_slider_6,
39
  loudness_in_db
40
  ):
41
- if audio is None:
 
 
 
 
 
 
 
 
42
  self.wgan.set_latent(voice_seed)
43
  controllability_vector = torch.tensor([emb_slider_1,
44
  emb_slider_2,
@@ -49,13 +61,46 @@ class ControllableInterface:
49
  embedding = self.wgan.modify_embed(controllability_vector)
50
  self.model.set_utterance_embedding(embedding=embedding)
51
  else:
52
- self.model.set_utterance_embedding(path_to_reference_audio=audio)
53
 
54
  phones = self.model.text2phone.get_phone_string(prompt)
55
  if len(phones) > 1800:
56
- prompt = "Your input was too long. Please try either a shorter text or split it into several parts."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
- print(prompt)
59
  wav, sr, fig = self.model(prompt,
60
  input_is_phones=False,
61
  duration_scaling_factor=duration_scaling_factor,
 
2
 
3
  import torch
4
 
 
5
  from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
6
+ from Modules.ControllabilityGAN.GAN import GanWrapper
7
  from Utility.storage_config import MODELS_DIR
8
 
9
 
 
16
  os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
17
  os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
18
  self.device = "cuda" if gpu_id != "cpu" else "cpu"
19
+ self.model = ToucanTTSInterface(device=self.device, tts_model_path="Meta")
20
  self.wgan = GanWrapper(os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"), device=self.device)
21
  self.generated_speaker_embeds = list()
22
  self.available_artificial_voices = available_artificial_voices
23
+ self.current_language = ""
24
+ self.current_accent = ""
25
 
26
  def read(self,
27
  prompt,
28
+ reference_audio,
29
+ language,
30
+ accent,
31
  voice_seed,
32
  prosody_creativity,
33
  duration_scaling_factor,
 
42
  emb_slider_6,
43
  loudness_in_db
44
  ):
45
+ if self.current_language != language:
46
+ self.model.set_phonemizer_language(language)
47
+ print(f"switched phonemizer language to {language}")
48
+ self.current_language = language
49
+ if self.current_accent != accent:
50
+ self.model.set_accent_language(accent)
51
+ print(f"switched accent language to {accent}")
52
+ self.current_accent = accent
53
+ if reference_audio is None:
54
  self.wgan.set_latent(voice_seed)
55
  controllability_vector = torch.tensor([emb_slider_1,
56
  emb_slider_2,
 
61
  embedding = self.wgan.modify_embed(controllability_vector)
62
  self.model.set_utterance_embedding(embedding=embedding)
63
  else:
64
+ self.model.set_utterance_embedding(reference_audio)
65
 
66
  phones = self.model.text2phone.get_phone_string(prompt)
67
  if len(phones) > 1800:
68
+ if language == "deu":
69
+ prompt = "Deine Eingabe war zu lang. Bitte versuche es entweder mit einem kürzeren Text oder teile ihn in mehrere Teile auf."
70
+ elif language == "ell":
71
+ prompt = "Η εισήγησή σας ήταν πολύ μεγάλη. Παρακαλώ δοκιμάστε είτε ένα μικρότερο κείμενο είτε χωρίστε το σε διάφορα μέρη."
72
+ elif language == "spa":
73
+ prompt = "Su entrada es demasiado larga. Por favor, intente un texto más corto o divídalo en varias partes."
74
+ elif language == "fin":
75
+ prompt = "Vastauksesi oli liian pitkä. Kokeile joko lyhyempää tekstiä tai jaa se useampaan osaan."
76
+ elif language == "rus":
77
+ prompt = "Ваш текст слишком длинный. Пожалуйста, попробуйте либо сократить текст, либо разделить его на несколько частей."
78
+ elif language == "hun":
79
+ prompt = "Túl hosszú volt a bevitele. Kérjük, próbáljon meg rövidebb szöveget írni, vagy ossza több részre."
80
+ elif language == "nld":
81
+ prompt = "Uw input was te lang. Probeer een kortere tekst of splits het in verschillende delen."
82
+ elif language == "fra":
83
+ prompt = "Votre saisie était trop longue. Veuillez essayer un texte plus court ou le diviser en plusieurs parties."
84
+ elif language == 'pol':
85
+ prompt = "Twój wpis był zbyt długi. Spróbuj skrócić tekst lub podzielić go na kilka części."
86
+ elif language == 'por':
87
+ prompt = "O seu contributo foi demasiado longo. Por favor, tente um texto mais curto ou divida-o em várias partes."
88
+ elif language == 'ita':
89
+ prompt = "Il tuo input era troppo lungo. Per favore, prova un testo più corto o dividilo in più parti."
90
+ elif language == 'cmn':
91
+ prompt = "你的输入太长了。请尝试使用较短的文本或将其拆分为多个部分。"
92
+ elif language == 'vie':
93
+ prompt = "Đầu vào của bạn quá dài. Vui lòng thử một văn bản ngắn hơn hoặc chia nó thành nhiều phần."
94
+ else:
95
+ prompt = "Your input was too long. Please try either a shorter text or split it into several parts."
96
+ if self.current_language != "eng":
97
+ self.model.set_phonemizer_language("eng")
98
+ self.current_language = "eng"
99
+ if self.current_accent != "eng":
100
+ self.model.set_accent_language("eng")
101
+ self.current_accent = "eng"
102
 
103
+ print(prompt + "\n\n")
104
  wav, sr, fig = self.model(prompt,
105
  input_is_phones=False,
106
  duration_scaling_factor=duration_scaling_factor,
InferenceInterfaces/ToucanTTSInterface.py CHANGED
@@ -10,8 +10,8 @@ import torch
10
  from speechbrain.pretrained import EncoderClassifier
11
  from torchaudio.transforms import Resample
12
 
13
- from Architectures.ToucanTTS.InferenceToucanTTS import ToucanTTS
14
- from Architectures.Vocoder.HiFiGAN_Generator import HiFiGAN
15
  from Preprocessing.AudioPreprocessor import AudioPreprocessor
16
  from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
17
  from Preprocessing.TextFrontend import get_language_id
@@ -109,7 +109,7 @@ class ToucanTTSInterface(torch.nn.Module):
109
  self.text2phone = ArticulatoryCombinedTextFrontend(language=lang_id, add_silence_to_end=True, device=self.device)
110
 
111
  def set_accent_language(self, lang_id):
112
- if lang_id in ['ajp', 'ajt', 'lak', 'lno', 'nul', 'pii', 'plj', 'slq', 'smd', 'snb', 'tpw', 'wya', 'zua', 'en-us', 'en-sc', 'fr-be', 'fr-sw', 'pt-br', 'spa-lat', 'vi-ctr', 'vi-so']:
113
  if lang_id == 'vi-so' or lang_id == 'vi-ctr':
114
  lang_id = 'vie'
115
  elif lang_id == 'spa-lat':
@@ -121,7 +121,7 @@ class ToucanTTSInterface(torch.nn.Module):
121
  elif lang_id == 'en-sc' or lang_id == 'en-us':
122
  lang_id = 'eng'
123
  else:
124
- # no clue where these others are even coming from, they are not in ISO 639-2
125
  lang_id = 'eng'
126
 
127
  self.lang_id = get_language_id(lang_id).to(self.device)
@@ -139,7 +139,7 @@ class ToucanTTSInterface(torch.nn.Module):
139
  input_is_phones=False,
140
  return_plot_as_filepath=False,
141
  loudness_in_db=-24.0,
142
- prosody_creativity=0.5):
143
  """
144
  duration_scaling_factor: reasonable values are 0.8 < scale < 1.2.
145
  1.0 means no scaling happens, higher values increase durations for the whole
@@ -241,7 +241,7 @@ class ToucanTTSInterface(torch.nn.Module):
241
  dur_list=None,
242
  pitch_list=None,
243
  energy_list=None,
244
- prosody_creativity=0.5):
245
  """
246
  Args:
247
  silent: Whether to be verbose about the process
@@ -299,7 +299,7 @@ class ToucanTTSInterface(torch.nn.Module):
299
  pitch_variance_scale=1.0,
300
  energy_variance_scale=1.0,
301
  blocking=False,
302
- prosody_creativity=0.5):
303
  if text.strip() == "":
304
  return
305
  wav, sr = self(text,
 
10
  from speechbrain.pretrained import EncoderClassifier
11
  from torchaudio.transforms import Resample
12
 
13
+ from Modules.ToucanTTS.InferenceToucanTTS import ToucanTTS
14
+ from Modules.Vocoder.HiFiGAN_Generator import HiFiGAN
15
  from Preprocessing.AudioPreprocessor import AudioPreprocessor
16
  from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
17
  from Preprocessing.TextFrontend import get_language_id
 
109
  self.text2phone = ArticulatoryCombinedTextFrontend(language=lang_id, add_silence_to_end=True, device=self.device)
110
 
111
  def set_accent_language(self, lang_id):
112
+ if lang_id in {'ajp', 'ajt', 'lak', 'lno', 'nul', 'pii', 'plj', 'slq', 'smd', 'snb', 'tpw', 'wya', 'zua', 'en-us', 'en-sc', 'fr-be', 'fr-sw', 'pt-br', 'spa-lat', 'vi-ctr', 'vi-so'}:
113
  if lang_id == 'vi-so' or lang_id == 'vi-ctr':
114
  lang_id = 'vie'
115
  elif lang_id == 'spa-lat':
 
121
  elif lang_id == 'en-sc' or lang_id == 'en-us':
122
  lang_id = 'eng'
123
  else:
124
+ # no clue where these others are even coming from, they are not in ISO 639-3
125
  lang_id = 'eng'
126
 
127
  self.lang_id = get_language_id(lang_id).to(self.device)
 
139
  input_is_phones=False,
140
  return_plot_as_filepath=False,
141
  loudness_in_db=-24.0,
142
+ prosody_creativity=0.1):
143
  """
144
  duration_scaling_factor: reasonable values are 0.8 < scale < 1.2.
145
  1.0 means no scaling happens, higher values increase durations for the whole
 
241
  dur_list=None,
242
  pitch_list=None,
243
  energy_list=None,
244
+ prosody_creativity=0.1):
245
  """
246
  Args:
247
  silent: Whether to be verbose about the process
 
299
  pitch_variance_scale=1.0,
300
  energy_variance_scale=1.0,
301
  blocking=False,
302
+ prosody_creativity=0.1):
303
  if text.strip() == "":
304
  return
305
  wav, sr = self(text,
InferenceInterfaces/UtteranceCloner.py CHANGED
@@ -4,11 +4,11 @@ import numpy
4
  import soundfile as sf
5
  import torch
6
 
7
- from Architectures.Aligner.Aligner import Aligner
8
- from Architectures.ToucanTTS.DurationCalculator import DurationCalculator
9
- from Architectures.ToucanTTS.EnergyCalculator import EnergyCalculator
10
- from Architectures.ToucanTTS.PitchCalculator import Parselmouth
11
  from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
 
 
 
 
12
  from Preprocessing.AudioPreprocessor import AudioPreprocessor
13
  from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
14
  from Preprocessing.articulatory_features import get_feature_to_index_lookup
 
4
  import soundfile as sf
5
  import torch
6
 
 
 
 
 
7
  from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
8
+ from Modules.Aligner.Aligner import Aligner
9
+ from Modules.ToucanTTS.DurationCalculator import DurationCalculator
10
+ from Modules.ToucanTTS.EnergyCalculator import EnergyCalculator
11
+ from Modules.ToucanTTS.PitchCalculator import Parselmouth
12
  from Preprocessing.AudioPreprocessor import AudioPreprocessor
13
  from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
14
  from Preprocessing.articulatory_features import get_feature_to_index_lookup
Models/ToucanTTS_Meta/best.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f1f562f9473f227b4425938c80dec1808d0cd3a54fd3629b327613dae3be694
3
- size 112081651
 
 
 
 
Models/Vocoder/best.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:68a4db7d7d96a554eab75c5d8b79267760d7d4c7af65504947ab807ab18d680b
3
- size 56113099
 
 
 
 
{Architectures → Modules}/Aligner/Aligner.py RENAMED
File without changes
{Architectures → Modules}/Aligner/CodecAlignerDataset.py RENAMED
File without changes
{Architectures → Modules}/Aligner/README.md RENAMED
File without changes
{Architectures → Modules}/Aligner/Reconstructor.py RENAMED
File without changes
{Architectures → Modules}/Aligner/__init__.py RENAMED
File without changes
{Architectures → Modules}/Aligner/autoaligner_train_loop.py RENAMED
@@ -8,8 +8,8 @@ from torch.optim import RAdam
8
  from torch.utils.data.dataloader import DataLoader
9
  from tqdm import tqdm
10
 
11
- from Architectures.Aligner.Aligner import Aligner
12
- from Architectures.Aligner.Reconstructor import Reconstructor
13
  from Preprocessing.AudioPreprocessor import AudioPreprocessor
14
  from Preprocessing.EnCodecAudioPreprocessor import CodecAudioPreprocessor
15
 
 
8
  from torch.utils.data.dataloader import DataLoader
9
  from tqdm import tqdm
10
 
11
+ from Modules.Aligner.Aligner import Aligner
12
+ from Modules.Aligner.Reconstructor import Reconstructor
13
  from Preprocessing.AudioPreprocessor import AudioPreprocessor
14
  from Preprocessing.EnCodecAudioPreprocessor import CodecAudioPreprocessor
15
 
{Architectures → Modules}/ControllabilityGAN/GAN.py RENAMED
@@ -1,6 +1,6 @@
1
  import torch
2
 
3
- from Architectures.ControllabilityGAN.wgan.init_wgan import create_wgan
4
 
5
 
6
  class GanWrapper:
 
1
  import torch
2
 
3
+ from Modules.ControllabilityGAN.wgan.init_wgan import create_wgan
4
 
5
 
6
  class GanWrapper:
{Architectures → Modules}/ControllabilityGAN/__init__.py RENAMED
File without changes
{Architectures → Modules}/ControllabilityGAN/dataset/__init__.py RENAMED
File without changes
{Architectures → Modules}/ControllabilityGAN/dataset/speaker_embeddings_dataset.py RENAMED
File without changes
{Architectures → Modules}/ControllabilityGAN/wgan/__init__.py RENAMED
File without changes
{Architectures → Modules}/ControllabilityGAN/wgan/init_weights.py RENAMED
File without changes
{Architectures → Modules}/ControllabilityGAN/wgan/init_wgan.py RENAMED
@@ -1,7 +1,7 @@
1
  import torch
2
 
3
- from Architectures.ControllabilityGAN.wgan.resnet_init import init_resnet
4
- from Architectures.ControllabilityGAN.wgan.wgan_qc import WassersteinGanQuadraticCost
5
 
6
 
7
  def create_wgan(parameters, device, optimizer='adam'):
 
1
  import torch
2
 
3
+ from Modules.ControllabilityGAN.wgan.resnet_init import init_resnet
4
+ from Modules.ControllabilityGAN.wgan.wgan_qc import WassersteinGanQuadraticCost
5
 
6
 
7
  def create_wgan(parameters, device, optimizer='adam'):
{Architectures → Modules}/ControllabilityGAN/wgan/resnet_1.py RENAMED
File without changes
{Architectures → Modules}/ControllabilityGAN/wgan/resnet_init.py RENAMED
@@ -1,7 +1,7 @@
1
- from Architectures.ControllabilityGAN.wgan.init_weights import weights_init_D
2
- from Architectures.ControllabilityGAN.wgan.init_weights import weights_init_G
3
- from Architectures.ControllabilityGAN.wgan.resnet_1 import ResNet_D
4
- from Architectures.ControllabilityGAN.wgan.resnet_1 import ResNet_G
5
 
6
 
7
  def init_resnet(parameters):
 
1
+ from Modules.ControllabilityGAN.wgan.init_weights import weights_init_D
2
+ from Modules.ControllabilityGAN.wgan.init_weights import weights_init_G
3
+ from Modules.ControllabilityGAN.wgan.resnet_1 import ResNet_D
4
+ from Modules.ControllabilityGAN.wgan.resnet_1 import ResNet_G
5
 
6
 
7
  def init_resnet(parameters):
{Architectures → Modules}/ControllabilityGAN/wgan/wgan_qc.py RENAMED
File without changes
{Architectures → Modules}/EmbeddingModel/GST.py RENAMED
@@ -3,7 +3,7 @@
3
 
4
  import torch
5
 
6
- from Architectures.GeneralLayers.Attention import MultiHeadedAttention as BaseMultiHeadedAttention
7
 
8
 
9
  class GSTStyleEncoder(torch.nn.Module):
 
3
 
4
  import torch
5
 
6
+ from Modules.GeneralLayers.Attention import MultiHeadedAttention as BaseMultiHeadedAttention
7
 
8
 
9
  class GSTStyleEncoder(torch.nn.Module):
{Architectures → Modules}/EmbeddingModel/README.md RENAMED
File without changes
{Architectures → Modules}/EmbeddingModel/StyleEmbedding.py RENAMED
@@ -1,7 +1,7 @@
1
  import torch
2
 
3
- from Architectures.EmbeddingModel.GST import GSTStyleEncoder
4
- from Architectures.EmbeddingModel.StyleTTSEncoder import StyleEncoder as StyleTTSEncoder
5
 
6
 
7
  class StyleEmbedding(torch.nn.Module):
 
1
  import torch
2
 
3
+ from Modules.EmbeddingModel.GST import GSTStyleEncoder
4
+ from Modules.EmbeddingModel.StyleTTSEncoder import StyleEncoder as StyleTTSEncoder
5
 
6
 
7
  class StyleEmbedding(torch.nn.Module):
{Architectures → Modules}/EmbeddingModel/StyleTTSEncoder.py RENAMED
File without changes
{Architectures → Modules}/EmbeddingModel/__init__.py RENAMED
File without changes
{Architectures → Modules}/GeneralLayers/Attention.py RENAMED
File without changes
{Architectures → Modules}/GeneralLayers/ConditionalLayerNorm.py RENAMED
@@ -112,7 +112,6 @@ class AdaIN1d(nn.Module):
112
  self.fc = nn.Linear(style_dim, num_features * 2)
113
 
114
  def forward(self, x, s):
115
- s = torch.nn.functional.normalize(s)
116
  h = self.fc(s)
117
  h = h.view(h.size(0), h.size(1), 1)
118
  gamma, beta = torch.chunk(h, chunks=2, dim=1)
 
112
  self.fc = nn.Linear(style_dim, num_features * 2)
113
 
114
  def forward(self, x, s):
 
115
  h = self.fc(s)
116
  h = h.view(h.size(0), h.size(1), 1)
117
  gamma, beta = torch.chunk(h, chunks=2, dim=1)
{Architectures → Modules}/GeneralLayers/Conformer.py RENAMED
@@ -4,16 +4,16 @@ Taken from ESPNet, but heavily modified
4
 
5
  import torch
6
 
7
- from Architectures.GeneralLayers.Attention import RelPositionMultiHeadedAttention
8
- from Architectures.GeneralLayers.ConditionalLayerNorm import AdaIN1d
9
- from Architectures.GeneralLayers.ConditionalLayerNorm import ConditionalLayerNorm
10
- from Architectures.GeneralLayers.Convolution import ConvolutionModule
11
- from Architectures.GeneralLayers.EncoderLayer import EncoderLayer
12
- from Architectures.GeneralLayers.LayerNorm import LayerNorm
13
- from Architectures.GeneralLayers.MultiLayeredConv1d import MultiLayeredConv1d
14
- from Architectures.GeneralLayers.MultiSequential import repeat
15
- from Architectures.GeneralLayers.PositionalEncoding import RelPositionalEncoding
16
- from Architectures.GeneralLayers.Swish import Swish
17
  from Utility.utils import integrate_with_utt_embed
18
 
19
 
@@ -88,6 +88,8 @@ class Conformer(torch.nn.Module):
88
  self.language_embedding_projection = lambda x: x
89
  else:
90
  self.language_embedding_projection = torch.nn.Linear(lang_emb_size, attention_dim)
 
 
91
  # self-attention module definition
92
  encoder_selfattn_layer = RelPositionMultiHeadedAttention
93
  encoder_selfattn_layer_args = (attention_heads, attention_dim, attention_dropout_rate, zero_triu)
@@ -130,6 +132,7 @@ class Conformer(torch.nn.Module):
130
  if lang_ids is not None:
131
  lang_embs = self.language_embedding(lang_ids)
132
  projected_lang_embs = self.language_embedding_projection(lang_embs).unsqueeze(-1).transpose(1, 2)
 
133
  xs = xs + projected_lang_embs # offset phoneme representation by language specific offset
134
 
135
  xs = self.pos_enc(xs)
@@ -139,21 +142,28 @@ class Conformer(torch.nn.Module):
139
  if isinstance(xs, tuple):
140
  x, pos_emb = xs[0], xs[1]
141
  if self.conformer_type != "encoder":
142
- x = integrate_with_utt_embed(hs=x, utt_embeddings=utterance_embedding, projection=self.decoder_embedding_projections[encoder_index], embedding_training=self.use_conditional_layernorm_embedding_integration)
 
 
 
143
  xs = (x, pos_emb)
144
  else:
145
  if self.conformer_type != "encoder":
146
- xs = integrate_with_utt_embed(hs=xs, utt_embeddings=utterance_embedding, projection=self.decoder_embedding_projections[encoder_index], embedding_training=self.use_conditional_layernorm_embedding_integration)
 
 
 
147
  xs, masks = encoder(xs, masks)
148
 
149
  if isinstance(xs, tuple):
150
  xs = xs[0]
151
 
152
- if self.use_output_norm and not (self.utt_embed and self.conformer_type == "encoder"):
153
- xs = self.output_norm(xs)
154
-
155
  if self.utt_embed and self.conformer_type == "encoder":
156
- xs = integrate_with_utt_embed(hs=xs, utt_embeddings=utterance_embedding,
157
- projection=self.encoder_embedding_projection, embedding_training=self.use_conditional_layernorm_embedding_integration)
 
 
 
 
158
 
159
  return xs, masks
 
4
 
5
  import torch
6
 
7
+ from Modules.GeneralLayers.Attention import RelPositionMultiHeadedAttention
8
+ from Modules.GeneralLayers.ConditionalLayerNorm import AdaIN1d
9
+ from Modules.GeneralLayers.ConditionalLayerNorm import ConditionalLayerNorm
10
+ from Modules.GeneralLayers.Convolution import ConvolutionModule
11
+ from Modules.GeneralLayers.EncoderLayer import EncoderLayer
12
+ from Modules.GeneralLayers.LayerNorm import LayerNorm
13
+ from Modules.GeneralLayers.MultiLayeredConv1d import MultiLayeredConv1d
14
+ from Modules.GeneralLayers.MultiSequential import repeat
15
+ from Modules.GeneralLayers.PositionalEncoding import RelPositionalEncoding
16
+ from Modules.GeneralLayers.Swish import Swish
17
  from Utility.utils import integrate_with_utt_embed
18
 
19
 
 
88
  self.language_embedding_projection = lambda x: x
89
  else:
90
  self.language_embedding_projection = torch.nn.Linear(lang_emb_size, attention_dim)
91
+ self.language_emb_norm = LayerNorm(attention_dim)
92
+
93
  # self-attention module definition
94
  encoder_selfattn_layer = RelPositionMultiHeadedAttention
95
  encoder_selfattn_layer_args = (attention_heads, attention_dim, attention_dropout_rate, zero_triu)
 
132
  if lang_ids is not None:
133
  lang_embs = self.language_embedding(lang_ids)
134
  projected_lang_embs = self.language_embedding_projection(lang_embs).unsqueeze(-1).transpose(1, 2)
135
+ projected_lang_embs = self.language_emb_norm(projected_lang_embs)
136
  xs = xs + projected_lang_embs # offset phoneme representation by language specific offset
137
 
138
  xs = self.pos_enc(xs)
 
142
  if isinstance(xs, tuple):
143
  x, pos_emb = xs[0], xs[1]
144
  if self.conformer_type != "encoder":
145
+ x = integrate_with_utt_embed(hs=x,
146
+ utt_embeddings=utterance_embedding,
147
+ projection=self.decoder_embedding_projections[encoder_index],
148
+ embedding_training=self.use_conditional_layernorm_embedding_integration)
149
  xs = (x, pos_emb)
150
  else:
151
  if self.conformer_type != "encoder":
152
+ xs = integrate_with_utt_embed(hs=xs,
153
+ utt_embeddings=utterance_embedding,
154
+ projection=self.decoder_embedding_projections[encoder_index],
155
+ embedding_training=self.use_conditional_layernorm_embedding_integration)
156
  xs, masks = encoder(xs, masks)
157
 
158
  if isinstance(xs, tuple):
159
  xs = xs[0]
160
 
 
 
 
161
  if self.utt_embed and self.conformer_type == "encoder":
162
+ xs = integrate_with_utt_embed(hs=xs,
163
+ utt_embeddings=utterance_embedding,
164
+ projection=self.encoder_embedding_projection,
165
+ embedding_training=self.use_conditional_layernorm_embedding_integration)
166
+ elif self.use_output_norm:
167
+ xs = self.output_norm(xs)
168
 
169
  return xs, masks
{Architectures → Modules}/GeneralLayers/Convolution.py RENAMED
File without changes
{Architectures → Modules}/GeneralLayers/DurationPredictor.py RENAMED
@@ -5,9 +5,9 @@
5
 
6
  import torch
7
 
8
- from Architectures.GeneralLayers.ConditionalLayerNorm import AdaIN1d
9
- from Architectures.GeneralLayers.ConditionalLayerNorm import ConditionalLayerNorm
10
- from Architectures.GeneralLayers.LayerNorm import LayerNorm
11
  from Utility.utils import integrate_with_utt_embed
12
 
13
 
 
5
 
6
  import torch
7
 
8
+ from Modules.GeneralLayers.ConditionalLayerNorm import AdaIN1d
9
+ from Modules.GeneralLayers.ConditionalLayerNorm import ConditionalLayerNorm
10
+ from Modules.GeneralLayers.LayerNorm import LayerNorm
11
  from Utility.utils import integrate_with_utt_embed
12
 
13
 
{Architectures → Modules}/GeneralLayers/EncoderLayer.py RENAMED
@@ -7,7 +7,7 @@
7
  import torch
8
  from torch import nn
9
 
10
- from Architectures.GeneralLayers.LayerNorm import LayerNorm
11
 
12
 
13
  class EncoderLayer(nn.Module):
 
7
  import torch
8
  from torch import nn
9
 
10
+ from Modules.GeneralLayers.LayerNorm import LayerNorm
11
 
12
 
13
  class EncoderLayer(nn.Module):
{Architectures → Modules}/GeneralLayers/LayerNorm.py RENAMED
File without changes
{Architectures → Modules}/GeneralLayers/LengthRegulator.py RENAMED
File without changes
{Architectures → Modules}/GeneralLayers/MultiLayeredConv1d.py RENAMED
File without changes
{Architectures → Modules}/GeneralLayers/MultiSequential.py RENAMED
File without changes
{Architectures → Modules}/GeneralLayers/PositionalEncoding.py RENAMED
File without changes
{Architectures → Modules}/GeneralLayers/PositionwiseFeedForward.py RENAMED
File without changes
{Architectures → Modules}/GeneralLayers/README.md RENAMED
File without changes
{Architectures → Modules}/GeneralLayers/ResidualBlock.py RENAMED
File without changes
{Architectures → Modules}/GeneralLayers/ResidualStack.py RENAMED
File without changes
{Architectures → Modules}/GeneralLayers/STFT.py RENAMED
File without changes
{Architectures → Modules}/GeneralLayers/Swish.py RENAMED
File without changes
{Architectures → Modules}/GeneralLayers/VariancePredictor.py RENAMED
@@ -6,9 +6,9 @@ from abc import ABC
6
 
7
  import torch
8
 
9
- from Architectures.GeneralLayers.ConditionalLayerNorm import AdaIN1d
10
- from Architectures.GeneralLayers.ConditionalLayerNorm import ConditionalLayerNorm
11
- from Architectures.GeneralLayers.LayerNorm import LayerNorm
12
  from Utility.utils import integrate_with_utt_embed
13
 
14
 
 
6
 
7
  import torch
8
 
9
+ from Modules.GeneralLayers.ConditionalLayerNorm import AdaIN1d
10
+ from Modules.GeneralLayers.ConditionalLayerNorm import ConditionalLayerNorm
11
+ from Modules.GeneralLayers.LayerNorm import LayerNorm
12
  from Utility.utils import integrate_with_utt_embed
13
 
14
 
{Architectures → Modules}/GeneralLayers/__init__.py RENAMED
File without changes
{Architectures → Modules}/README.md RENAMED
File without changes
{Architectures → Modules}/ToucanTTS/CodecDiscriminator.py RENAMED
File without changes
{Architectures → Modules}/ToucanTTS/CodecRefinementTransformer.py RENAMED
@@ -1,6 +1,6 @@
1
  import torch
2
 
3
- from Architectures.GeneralLayers.Conformer import Conformer
4
 
5
 
6
  class CodecRefinementTransformer(torch.nn.Module):
@@ -151,7 +151,7 @@ def one_hot_sequence_to_token_sequence(batch_of_indexes_one_hot_per_codebook):
151
 
152
 
153
  if __name__ == '__main__':
154
- from Architectures.ToucanTTS.ToucanTTS import ToucanTTS
155
  from Utility.utils import make_pad_mask
156
 
157
  # prepare dummy inputs
 
1
  import torch
2
 
3
+ from Modules.GeneralLayers.Conformer import Conformer
4
 
5
 
6
  class CodecRefinementTransformer(torch.nn.Module):
 
151
 
152
 
153
  if __name__ == '__main__':
154
+ from Modules.ToucanTTS.ToucanTTS import ToucanTTS
155
  from Utility.utils import make_pad_mask
156
 
157
  # prepare dummy inputs
{Architectures → Modules}/ToucanTTS/DurationCalculator.py RENAMED
File without changes
{Architectures → Modules}/ToucanTTS/EnergyCalculator.py RENAMED
@@ -5,7 +5,7 @@
5
  import torch
6
  import torch.nn.functional as F
7
 
8
- from Architectures.GeneralLayers.STFT import STFT
9
  from Utility.utils import pad_list
10
 
11
 
 
5
  import torch
6
  import torch.nn.functional as F
7
 
8
+ from Modules.GeneralLayers.STFT import STFT
9
  from Utility.utils import pad_list
10
 
11