more fixes

- tortoise/api.py (+10 -9)
- tortoise/get_conditioning_latents.py (+2 -2)
- tortoise/models/autoregressive.py (+2 -65)
- tortoise/utils/audio.py (+6 -3)
tortoise/api.py (CHANGED)

@@ -194,8 +194,7 @@ class TextToSpeech:
         self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
                                           model_dim=1024,
                                           heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
-                                          train_solo_embeddings=False,
-                                          average_conditioning_embeddings=True).cpu().eval()
+                                          train_solo_embeddings=False).cpu().eval()
         self.autoregressive.load_state_dict(torch.load(f'{models_dir}/autoregressive.pth'))

         self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200,
@@ -244,7 +243,7 @@ class TextToSpeech:
         kwargs.update(presets[preset])
         return self.tts(text, **kwargs)

-    def get_conditioning_latents(self, voice_samples):
+    def get_conditioning_latents(self, voice_samples, return_mels=False):
         """
         Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
         These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
@@ -268,7 +267,7 @@ class TextToSpeech:
             # The diffuser operates at a sample rate of 24000 (except for the latent inputs)
             sample = torchaudio.functional.resample(sample, 22050, 24000)
             sample = pad_or_truncate(sample, 102400)
-            cond_mel = wav_to_univnet_mel(sample.to(
+            cond_mel = wav_to_univnet_mel(sample.to('cuda'), do_normalization=False)
             diffusion_conds.append(cond_mel)
         diffusion_conds = torch.stack(diffusion_conds, dim=1)

@@ -276,7 +275,10 @@ class TextToSpeech:
         diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
         self.diffusion = self.diffusion.cpu()

-        return auto_latent, diffusion_latent, auto_conds
+        if return_mels:
+            return auto_latent, diffusion_latent, auto_conds, diffusion_conds
+        else:
+            return auto_latent, diffusion_latent

     def get_random_conditioning_latents(self):
         # Lazy-load the RLG models.
@@ -295,7 +297,6 @@ class TextToSpeech:
     def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True,
             # autoregressive generation parameters follow
             num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500,
-            typical_sampling=False, typical_mass=.9,
             # CLVP & CVVP parameters
             clvp_cvvp_slider=.5,
             # diffusion generation parameters follow
@@ -354,13 +355,13 @@

         auto_conds = None
         if voice_samples is not None:
-            auto_conditioning, diffusion_conditioning, auto_conds = self.get_conditioning_latents(voice_samples)
+            auto_conditioning, diffusion_conditioning, auto_conds, _ = self.get_conditioning_latents(voice_samples, return_mels=True)
         elif conditioning_latents is not None:
             auto_conditioning, diffusion_conditioning = conditioning_latents
         else:
             auto_conditioning, diffusion_conditioning = self.get_random_conditioning_latents()
-
-
+        auto_conditioning = auto_conditioning.cuda()
+        diffusion_conditioning = diffusion_conditioning.cuda()

         diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
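For orientation, a minimal usage sketch of the updated get_conditioning_latents signature. The plain TextToSpeech() construction and the voice clip path are illustrative assumptions, not part of this commit:

    from tortoise.api import TextToSpeech
    from tortoise.utils.audio import load_audio

    tts = TextToSpeech()  # assumes the default model paths are available

    # Hypothetical 22.05 kHz reference clip(s) for the target voice.
    samples = [load_audio('voices/myvoice/1.wav', 22050)]

    # Default behaviour is unchanged: the two conditioning latents come back.
    auto_latent, diffusion_latent = tts.get_conditioning_latents(samples)

    # With return_mels=True the raw conditioning mels are returned as well; this is
    # how the updated tts() obtains auto_conds (it discards the diffusion mels).
    auto_latent, diffusion_latent, auto_conds, diffusion_mels = tts.get_conditioning_latents(samples, return_mels=True)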
tortoise/get_conditioning_latents.py (CHANGED)

@@ -11,8 +11,8 @@ other ML models, or can be augmented manually and fed back into Tortoise to affect
 """
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--voice', type=str, help='Selects the voice to convert to conditioning latents', default='
-    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/conditioning_latents')
+    parser.add_argument('--voice', type=str, help='Selects the voice to convert to conditioning latents', default='pat2')
+    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='../results/conditioning_latents')
     args = parser.parse_args()
     os.makedirs(args.output_path, exist_ok=True)
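This change only touches the script's argument defaults (the new --output_path default of '../results/conditioning_latents' implies running from inside the tortoise/ directory). An equivalent invocation that sets both flags explicitly, assuming a voice folder named pat2 exists under voices/, would be roughly:

    python get_conditioning_latents.py --voice pat2 --output_path ../results/conditioning_latents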
tortoise/models/autoregressive.py (CHANGED)

@@ -280,8 +280,7 @@ class UnifiedVoice(nn.Module):
                  mel_length_compression=1024, number_text_tokens=256,
                  start_text_token=None, number_mel_codes=8194, start_mel_token=8192,
                  stop_mel_token=8193, train_solo_embeddings=False, use_mel_codes_as_input=True,
-                 checkpointing=True,
-                 types=1):
+                 checkpointing=True, types=1):
         """
         Args:
             layers: Number of layers in transformer stack.
@@ -300,7 +299,6 @@ class UnifiedVoice(nn.Module):
             train_solo_embeddings:
             use_mel_codes_as_input:
             checkpointing:
-            average_conditioning_embeddings: Whether or not conditioning embeddings should be averaged, instead of fed piecewise into the model.
         """
         super().__init__()

@@ -318,7 +316,6 @@ class UnifiedVoice(nn.Module):
         self.max_conditioning_inputs = max_conditioning_inputs
         self.mel_length_compression = mel_length_compression
         self.conditioning_encoder = ConditioningEncoder(80, model_dim, num_attn_heads=heads)
-        self.average_conditioning_embeddings = average_conditioning_embeddings
         self.text_embedding = nn.Embedding(self.number_text_tokens*types+1, model_dim)
         if use_mel_codes_as_input:
             self.mel_embedding = nn.Embedding(self.number_mel_codes, model_dim)
@@ -397,8 +394,7 @@ class UnifiedVoice(nn.Module):
         for j in range(speech_conditioning_input.shape[1]):
             conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
         conds = torch.stack(conds, dim=1)
-        if self.average_conditioning_embeddings:
-            conds = conds.mean(dim=1).unsqueeze(1)
+        conds = conds.mean(dim=1)
         return conds

     def forward(self, speech_conditioning_latent, text_inputs, text_lengths, mel_codes, wav_lengths, types=None, text_first=True, raw_mels=None, return_attentions=False,
@@ -461,65 +457,6 @@ class UnifiedVoice(nn.Module):
         loss_mel = F.cross_entropy(mel_logits, mel_targets.long())
         return loss_text.mean(), loss_mel.mean(), mel_logits

-    def text_forward(self, speech_conditioning_input, text_inputs, text_lengths):
-        """
-        Performs autoregressive modeling on only text. Still requires a speech_conditioning_input due to the way the
-        model inputs are formatted. Just provide any audio clip (arguably, zeros could be provided).
-        """
-        assert self.max_text_tokens >= text_inputs.shape[1], f'{text_inputs.shape[1]}'
-
-        # This model will receive micro-batches with a ton of padding for both the text and MELs. Ameliorate this by
-        # chopping the inputs by the maximum actual length.
-        max_text_len = text_lengths.max()
-        text_inputs = F.pad(text_inputs[:, :max_text_len], (0,1), value=self.stop_text_token)
-
-        speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(speech_conditioning_input.shape) == 3 else speech_conditioning_input
-        conds = []
-        for j in range(speech_conditioning_input.shape[1]):
-            conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
-        conds = torch.stack(conds, dim=1)
-        if self.average_conditioning_embeddings:
-            conds = conds.mean(dim=1).unsqueeze(1)
-
-        text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token)
-        text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs) + self.text_solo_embedding
-        text_logits = self.get_logits(conds, text_emb, self.text_head)
-        loss_text = F.cross_entropy(text_logits, text_targets.long())
-        return loss_text.mean()
-
-    def speech_forward(self, speech_conditioning_input, mel_codes, wav_lengths, raw_mels=None):
-        """
-        Performs autoregressive modeling on only speech data.
-        """
-        assert self.max_mel_tokens >= mel_codes.shape[1], f'{mel_codes.shape[1]}'
-
-        # This model will receive micro-batches with a ton of padding for both the text and MELs. Ameliorate this by
-        # chopping the inputs by the maximum actual length.
-        max_mel_len = wav_lengths.max() // self.mel_length_compression
-        mel_codes = F.pad(mel_codes[:, :max_mel_len], (0,1), value=self.stop_mel_token)
-        mel_codes = self.set_mel_padding(mel_codes, wav_lengths)
-        if raw_mels is not None:
-            raw_mels = raw_mels[:, :, :max_mel_len*4]
-
-        speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(speech_conditioning_input.shape) == 3 else speech_conditioning_input
-        conds = []
-        for j in range(speech_conditioning_input.shape[1]):
-            conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
-        conds = torch.stack(conds, dim=1)
-        if self.average_conditioning_embeddings:
-            conds = conds.mean(dim=1).unsqueeze(1)
-
-        mel_codes, mel_targets = self.build_aligned_inputs_and_targets(mel_codes, self.start_mel_token, self.stop_mel_token)
-        if raw_mels is not None:
-            mel_inp = F.pad(raw_mels, (0, 4))
-        else:
-            mel_inp = mel_codes
-        mel_emb = self.mel_embedding(mel_inp)
-        mel_emb = mel_emb + self.mel_pos_embedding(mel_codes) + self.mel_solo_embedding
-        mel_logits = self.get_logits(conds, mel_emb, self.mel_head)
-        loss_mel = F.cross_entropy(mel_logits, mel_targets.long())
-        return loss_mel.mean()
-
     def inference_speech(self, speech_conditioning_latent, text_inputs, input_tokens=None, num_return_sequences=1,
                          max_generate_length=None, typical_sampling=False, typical_mass=.9, **hf_generate_kwargs):
         seq_length = self.max_mel_tokens + self.max_text_tokens + 2
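To make the conditioning change above concrete: averaging of the per-clip embeddings in get_conditioning is now unconditional and the singleton clip dimension is no longer re-inserted. A small self-contained sketch of the shape difference (the batch and clip counts are illustrative; model_dim=1024 matches the api.py constructor above):

    import torch

    # Stacked per-clip conditioning embeddings: (batch, n_clips, model_dim).
    conds = torch.randn(2, 3, 1024)

    old_style = conds.mean(dim=1).unsqueeze(1)  # previous optional path: shape (2, 1, 1024)
    new_style = conds.mean(dim=1)               # now always applied: shape (2, 1024)

    print(old_style.shape, new_style.shape)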
tortoise/utils/audio.py (CHANGED)

@@ -87,7 +87,7 @@ def get_voices():
     for sub in subs:
         subj = os.path.join('voices', sub)
         if os.path.isdir(subj):
-            voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3'))
+            voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3')) + list(glob(f'{subj}/*.pth'))
     return voices


@@ -111,6 +111,9 @@ def load_voices(voices):
     latents = []
     clips = []
     for voice in voices:
+        if voice == 'random':
+            print("Cannot combine a random voice with a non-random voice. Just using a random voice.")
+            return None, None
         latent, clip = load_voice(voice)
         if latent is None:
             assert len(latents) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
@@ -119,10 +122,10 @@ def load_voices(voices):
             assert len(voices) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
             latents.append(latent)
     if len(latents) == 0:
-        return clips
+        return clips, None
     else:
         latents = torch.stack(latents, dim=0)
-        return latents.mean(dim=0)
+        return None, latents.mean(dim=0)


 class TacotronSTFT(torch.nn.Module):
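With these edits load_voices always returns a (clips, latent) pair, so callers unpack two values whether the voice folder holds raw audio or a saved latent. A sketch of the new contract; the voice names are hypothetical and assume matching folders under voices/:

    from tortoise.utils.audio import load_voices

    # Folder of raw .wav/.mp3 clips: clips are returned, the latent slot is None.
    clips, latent = load_voices(['myvoice'])        # -> (list_of_clips, None)

    # Folder holding a saved .pth latent (now picked up by get_voices()): clips slot is None.
    clips, latent = load_voices(['mylatentvoice'])  # -> (None, averaged_latent)

    # 'random' cannot be combined with other voices and short-circuits to (None, None).
    clips, latent = load_voices(['random'])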