Add support for extracting and feeding conditioning latents directly into the model
- Adds a new script and API endpoints for doing this
- Reworks autoregressive and diffusion models so that the conditioning is computed separately (which will actually provide a mild performance boost)
- Updates README
This is untested. Need to do the following manual tests (and someday write unit tests for this behemoth before it becomes a problem...):
1) Does get_conditioning_latents.py work?
2) Can I feed those latents back into the model by creating a new voice?
3) Can I still mix and match voices (both with conditioning latents and normal voices) with read.py?
- README.md +17 -5
- tortoise/api.py +52 -23
- tortoise/do_tts.py +4 -8
- tortoise/get_conditioning_latents.py +30 -0
- tortoise/models/autoregressive.py +14 -17
- tortoise/models/diffusion_decoder.py +13 -10
- tortoise/read.py +4 -15
- tortoise/utils/audio.py +31 -0
README.md
CHANGED
@@ -118,12 +118,24 @@ These settings are not available in the normal scripts packaged with Tortoise. T
 
 ### Playing with the voice latent
 
-Tortoise ingests reference clips by feeding them through individually through a small submodel that produces a point latent,
+Tortoise ingests reference clips by feeding them individually through a small submodel that produces a point latent,
+then taking the mean of all of the produced latents. The experimentation I have done has indicated that these point latents
+are quite expressive, affecting everything from tone to speaking rate to speech abnormalities.
 
-This lends itself to some neat tricks. For example, you can combine feed two different voices to tortoise and it will output
+This lends itself to some neat tricks. For example, you can feed two different voices to tortoise and it will output
+what it thinks the "average" of those two voices sounds like.
+
+#### Generating conditioning latents from voices
+
+Use the script `get_conditioning_latents.py` to extract conditioning latents for a voice you have installed. This script
+will dump the latents to a .pth pickle file. The file will contain a single tuple, (autoregressive_latent, diffusion_latent).
+
+Alternatively, use api.TextToSpeech.get_conditioning_latents() to fetch the latents.
+
+#### Using raw conditioning latents to generate speech
+
+After you've played with them, you can use them to generate speech by creating a subdirectory in voices/ with a single
+".pth" file containing the pickled conditioning latents as a tuple (autoregressive_latent, diffusion_latent).
 
 ### Send me feedback!
 
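For anyone who prefers to drive the new latent workflow from Python rather than the packaged scripts, here is a minimal sketch of the round trip the README section above describes. It assumes a CUDA machine, that it runs from the `tortoise/` directory (so `api` imports the way the scripts expect), and that `pat` is an installed voice; the commit message flags this whole path as untested.

```python
import torch
import torchaudio

from api import TextToSpeech
from tortoise.utils.audio import load_voice

tts = TextToSpeech()

# Reference clips for an installed voice ('pat' mirrors the script default).
voice_samples, _ = load_voice('pat')

# Extract the (autoregressive_latent, diffusion_latent) tuple described above.
conditioning_latents = tts.get_conditioning_latents(voice_samples)
torch.save(conditioning_latents, 'pat_latents.pth')

# Feed the latents straight back in; voice_samples stays None so tts() uses them.
gen = tts.tts_with_preset("Speaking from a raw conditioning latent.",
                          conditioning_latents=conditioning_latents, preset='fast')
torchaudio.save('pat_from_latents.wav', gen.squeeze(0).cpu(), 24000)
```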
tortoise/api.py
CHANGED
@@ -121,23 +121,14 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
     return codes
 
 
-def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_samples, temperature=1, verbose=True):
+def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True):
     """
     Uses the specified diffusion model to convert discrete codes into a spectrogram.
     """
     with torch.no_grad():
-        cond_mels = []
-        for sample in conditioning_samples:
-            # The diffuser operates at a sample rate of 24000 (except for the latent inputs)
-            sample = torchaudio.functional.resample(sample, 22050, 24000)
-            sample = pad_or_truncate(sample, 102400)
-            cond_mel = wav_to_univnet_mel(sample.to(latents.device), do_normalization=False)
-            cond_mels.append(cond_mel)
-        cond_mels = torch.stack(cond_mels, dim=1)
-
         output_seq_len = latents.shape[1] * 4 * 24000 // 22050  # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
         output_shape = (latents.shape[0], 100, output_seq_len)
-        precomputed_embeddings = diffusion_model.timestep_independent(latents, cond_mels, output_seq_len, False)
+        precomputed_embeddings = diffusion_model.timestep_independent(latents, conditioning_latents, output_seq_len, False)
 
         noise = torch.randn(output_shape, device=latents.device) * temperature
         mel = diffuser.p_sample_loop(diffusion_model, output_shape, noise=noise,
@@ -204,7 +195,7 @@ class TextToSpeech:
         self.vocoder.load_state_dict(torch.load(f'{models_dir}/vocoder.pth')['model_g'])
         self.vocoder.eval(inference=True)
 
-    def tts_with_preset(self, text, voice_samples, preset='fast', **kwargs):
+    def tts_with_preset(self, text, preset='fast', **kwargs):
         """
         Calls TTS with one of a set of preset generation parameters. Options:
             'ultra_fast': Produces speech at a speed which belies the name of this repo. (Not really, but it's definitely fastest).
@@ -225,9 +216,43 @@ class TextToSpeech:
             'high_quality': {'num_autoregressive_samples': 512, 'diffusion_iterations': 1024},
         }
         kwargs.update(presets[preset])
-        return self.tts(text, voice_samples, **kwargs)
+        return self.tts(text, **kwargs)
+
+    def get_conditioning_latents(self, voice_samples):
+        """
+        Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
+        These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
+        properties.
+        :param voice_samples: List of 2 or more ~10 second reference clips, which should be torch tensors containing 22.05kHz waveform data.
+        """
+        voice_samples = [v.to('cuda') for v in voice_samples]
+
+        auto_conds = []
+        if not isinstance(voice_samples, list):
+            voice_samples = [voice_samples]
+        for vs in voice_samples:
+            auto_conds.append(format_conditioning(vs))
+        auto_conds = torch.stack(auto_conds, dim=1)
+        self.autoregressive = self.autoregressive.cuda()
+        auto_latent = self.autoregressive.get_conditioning(auto_conds)
+        self.autoregressive = self.autoregressive.cpu()
+
+        diffusion_conds = []
+        for sample in voice_samples:
+            # The diffuser operates at a sample rate of 24000 (except for the latent inputs)
+            sample = torchaudio.functional.resample(sample, 22050, 24000)
+            sample = pad_or_truncate(sample, 102400)
+            cond_mel = wav_to_univnet_mel(sample.to(voice_samples.device), do_normalization=False)
+            diffusion_conds.append(cond_mel)
+        diffusion_conds = torch.stack(diffusion_conds, dim=1)
+
+        self.diffusion = self.diffusion.cuda()
+        diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
+        self.diffusion = self.diffusion.cpu()
 
+        return auto_latent, diffusion_latent
+
-    def tts(self, text, voice_samples, k=1, verbose=True,
+    def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=True,
             # autoregressive generation parameters follow
             num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500,
             typical_sampling=False, typical_mass=.9,
@@ -240,6 +265,9 @@ class TextToSpeech:
         Produces an audio clip of the given text being spoken with the given reference voice.
         :param text: Text to be spoken.
        :param voice_samples: List of 2 or more ~10 second reference clips which should be torch tensors containing 22.05kHz waveform data.
+        :param conditioning_latents: A tuple of (autoregressive_conditioning_latent, diffusion_conditioning_latent), which
+                                     can be provided in lieu of voice_samples. This is ignored unless voice_samples=None.
+                                     Conditioning latents can be retrieved via get_conditioning_latents().
        :param k: The number of returned clips. The most likely (as determined by Tortoises' CLVP and CVVP models) clips are returned.
        :param verbose: Whether or not to print log messages indicating the progress of creating a clip. Default=true.
        ~~AUTOREGRESSIVE KNOBS~~
@@ -283,12 +311,10 @@ class TextToSpeech:
         text = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).cuda()
         text = F.pad(text, (0, 1))  # This may not be necessary.
 
-        conds = []
-        if not isinstance(voice_samples, list):
-            voice_samples = [voice_samples]
-        for vs in voice_samples:
-            conds.append(format_conditioning(vs))
-        conds = torch.stack(conds, dim=1)
+        if voice_samples is not None:
+            auto_conditioning, diffusion_conditioning = self.get_conditioning_latents(voice_samples)
+        else:
+            auto_conditioning, diffusion_conditioning = conditioning_latents
 
         diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
 
@@ -301,7 +327,7 @@ class TextToSpeech:
             if verbose:
                 print("Generating autoregressive samples..")
             for b in tqdm(range(num_batches), disable=not verbose):
-                codes = self.autoregressive.inference_speech(conds, text,
+                codes = self.autoregressive.inference_speech(auto_conditioning, text,
                                                              do_sample=True,
                                                              top_p=top_p,
                                                              temperature=temperature,
@@ -340,16 +366,18 @@ class TextToSpeech:
             # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
             # results, but will increase memory usage.
             self.autoregressive = self.autoregressive.cuda()
-            best_latents = self.autoregressive(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
+            best_latents = self.autoregressive(auto_conditioning, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
                                                torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=conds.device),
                                                return_latent=True, clip_inputs=False)
             self.autoregressive = self.autoregressive.cpu()
+            del auto_conditioning
 
             if verbose:
                 print("Transforming autoregressive outputs into audio..")
             wav_candidates = []
             self.diffusion = self.diffusion.cuda()
             self.vocoder = self.vocoder.cuda()
+            diffusion_conds =
             for b in range(best_results.shape[0]):
                 codes = best_results[b].unsqueeze(0)
                 latents = best_latents[b].unsqueeze(0)
@@ -365,7 +393,8 @@ class TextToSpeech:
                     latents = latents[:, :k]
                     break
 
-                mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, voice_samples, temperature=diffusion_temperature, verbose=verbose)
+                mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning,
+                                               temperature=diffusion_temperature, verbose=verbose)
                 wav = self.vocoder.inference(mel)
                 wav_candidates.append(wav.cpu())
             self.diffusion = self.diffusion.cpu()
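Worth spelling out: the conditioning encoders now run once inside `get_conditioning_latents()` rather than on every `tts()` call, which is where the mild performance boost mentioned in the commit description comes from. A rough sketch of reusing one latent pair across several generations (clip paths and texts are placeholders; the commit flags this code as untested):

```python
from api import TextToSpeech
from tortoise.utils.audio import load_audio

tts = TextToSpeech()
clips = [load_audio(p, 22050) for p in ('ref1.wav', 'ref2.wav')]  # placeholder clip paths

# Both conditioning encoders run once here.
latents = tts.get_conditioning_latents(clips)

for line in ["First line of a long script.", "Second line of a long script."]:
    # Passing conditioning_latents (and no voice_samples) skips re-encoding the clips.
    wav = tts.tts_with_preset(line, conditioning_latents=latents, preset='fast')
```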
tortoise/do_tts.py
CHANGED
@@ -4,7 +4,7 @@ import os
 import torchaudio
 
 from api import TextToSpeech
-from tortoise.utils.audio import load_audio, get_voices
+from tortoise.utils.audio import load_audio, get_voices, load_voice
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
@@ -21,14 +21,10 @@ if __name__ == '__main__':
 
     tts = TextToSpeech()
 
-    voices = get_voices()
     selected_voices = args.voice.split(',')
     for voice in selected_voices:
-        cond_paths = voices[voice]
-        conds = []
-        for cond_path in cond_paths:
-            c = load_audio(cond_path, 22050)
-            conds.append(c)
-        gen = tts.tts_with_preset(args.text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
+        voice_samples, conditioning_latents = load_voice(voice)
+        gen = tts.tts_with_preset(args.text, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
+                                  preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
         torchaudio.save(os.path.join(args.output_path, f'{voice}.wav'), gen.squeeze(0).cpu(), 24000)
 
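The behavioural change in this script is that `load_voice()` returns a pair where exactly one member is populated, and both are forwarded to `tts_with_preset()`. A small sketch of the two cases (voice names are placeholders, and the latent case assumes `get_voices()` indexes the .pth file):

```python
from tortoise.utils.audio import load_voice

# A normal voice directory containing reference clips:
voice_samples, conditioning_latents = load_voice('pat')
# -> (list of 22.05kHz waveform tensors, None)

# A "latent" voice directory containing a single .pth file:
voice_samples, conditioning_latents = load_voice('pat_latent')
# -> (None, (autoregressive_latent, diffusion_latent))
```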
tortoise/get_conditioning_latents.py
ADDED
@@ -0,0 +1,30 @@
+import argparse
+import os
+import torch
+
+from api import TextToSpeech
+from tortoise.utils.audio import load_audio, get_voices
+
+"""
+Dumps the conditioning latents for the specified voice to disk. These are expressive latents which can be used for
+other ML models, or can be augmented manually and fed back into Tortoise to affect vocal qualities.
+"""
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--voice', type=str, help='Selects the voice to convert to conditioning latents', default='pat')
+    parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/conditioning_latents')
+    args = parser.parse_args()
+    os.makedirs(args.output_path, exist_ok=True)
+
+    tts = TextToSpeech()
+    voices = get_voices()
+    selected_voices = args.voice.split(',')
+    for voice in selected_voices:
+        cond_paths = voices[voice]
+        conds = []
+        for cond_path in cond_paths:
+            c = load_audio(cond_path, 22050)
+            conds.append(c)
+        conditioning_latents = tts.get_conditioning_latents(conds)
+        torch.save(conditioning_latents, os.path.join(args.output_path, f'{voice}.pth'))
+
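Because the script just pickles a tuple with `torch.save()`, the dump can be reloaded, inspected, or nudged before being fed back in, as the docstring suggests. A sketch using the script's default output path; the perturbation is purely illustrative:

```python
import torch

auto_latent, diffusion_latent = torch.load('results/conditioning_latents/pat.pth')
print(auto_latent.shape, diffusion_latent.shape)

# Manually augment the autoregressive latent, then save a new tuple alongside the original.
tweaked = (auto_latent + 0.05 * torch.randn_like(auto_latent), diffusion_latent)
torch.save(tweaked, 'results/conditioning_latents/pat_tweaked.pth')
```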
tortoise/models/autoregressive.py
CHANGED
@@ -390,6 +390,17 @@ class UnifiedVoice(nn.Module):
         else:
             return first_logits
 
+    def get_conditioning(self, speech_conditioning_input):
+        speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(
+            speech_conditioning_input.shape) == 3 else speech_conditioning_input
+        conds = []
+        for j in range(speech_conditioning_input.shape[1]):
+            conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
+        conds = torch.stack(conds, dim=1)
+        if self.average_conditioning_embeddings:
+            conds = conds.mean(dim=1).unsqueeze(1)
+        return conds
+
     def forward(self, speech_conditioning_input, text_inputs, text_lengths, mel_codes, wav_lengths, types=None, text_first=True, raw_mels=None, return_attentions=False,
                 return_latent=False, clip_inputs=True):
         """
@@ -424,14 +435,7 @@ class UnifiedVoice(nn.Module):
         text_inputs = F.pad(text_inputs, (0,1), value=self.stop_text_token)
         mel_codes = F.pad(mel_codes, (0,1), value=self.stop_mel_token)
 
-        speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(speech_conditioning_input.shape) == 3 else speech_conditioning_input
-        conds = []
-        for j in range(speech_conditioning_input.shape[1]):
-            conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
-        conds = torch.stack(conds, dim=1)
-        if self.average_conditioning_embeddings:
-            conds = conds.mean(dim=1).unsqueeze(1)
-
+        conds = self.get_conditioning(speech_conditioning_input)
         text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token)
         text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs)
         mel_codes, mel_targets = self.build_aligned_inputs_and_targets(mel_codes, self.start_mel_token, self.stop_mel_token)
@@ -516,7 +520,7 @@ class UnifiedVoice(nn.Module):
         loss_mel = F.cross_entropy(mel_logits, mel_targets.long())
         return loss_mel.mean()
 
-    def inference_speech(self, speech_conditioning_input, text_inputs, input_tokens=None, num_return_sequences=1,
+    def inference_speech(self, speech_conditioning_latent, text_inputs, input_tokens=None, num_return_sequences=1,
                          max_generate_length=None, typical_sampling=False, typical_mass=.9, **hf_generate_kwargs):
         seq_length = self.max_mel_tokens + self.max_text_tokens + 2
         if not hasattr(self, 'inference_model'):
@@ -536,14 +540,7 @@ class UnifiedVoice(nn.Module):
         text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token)
         text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs)
 
-        speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(speech_conditioning_input.shape) == 3 else speech_conditioning_input
-        conds = []
-        for j in range(speech_conditioning_input.shape[1]):
-            conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
-        conds = torch.stack(conds, dim=1)
-        if self.average_conditioning_embeddings:
-            conds = conds.mean(dim=1).unsqueeze(1)
-
+        conds = speech_conditioning_latent
         emb = torch.cat([conds, text_emb], dim=1)
         self.inference_model.store_mel_emb(emb)
 
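The intent of this split, as used by api.py above, is that the conditioning encoder runs once per voice and the resulting latent is handed to `inference_speech()` for every sampling batch. A fragment showing the expected calling pattern; `model`, `cond_mels`, `text_tokens` and `num_batches` are assumed to exist already (shaped as `format_conditioning()` and the tokenizer produce), so treat this as a sketch rather than runnable code:

```python
import torch

with torch.no_grad():
    # Encoder pass happens once, outside the sampling loop.
    speech_conditioning_latent = model.get_conditioning(cond_mels)

    for _ in range(num_batches):  # mirrors the batching loop in api.tts()
        codes = model.inference_speech(speech_conditioning_latent, text_tokens,
                                       do_sample=True, top_p=0.8, temperature=0.8,
                                       num_return_sequences=16, max_generate_length=500)
```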
tortoise/models/diffusion_decoder.py
CHANGED
@@ -219,18 +219,21 @@ class DiffusionTts(nn.Module):
         }
         return groups
 
-    def timestep_independent(self, aligned_conditioning, conditioning_input, expected_seq_len, return_code_pred):
-        # Shuffle aligned_latent to BxCxS format
-        if is_latent(aligned_conditioning):
-            aligned_conditioning = aligned_conditioning.permute(0, 2, 1)
-
-        # Note: this block does not need to repeated on inference, since it is not timestep-dependent or x-dependent.
+    def get_conditioning(self, conditioning_input):
         speech_conditioning_input = conditioning_input.unsqueeze(1) if len(
             conditioning_input.shape) == 3 else conditioning_input
         conds = []
         for j in range(speech_conditioning_input.shape[1]):
             conds.append(self.contextual_embedder(speech_conditioning_input[:, j]))
         conds = torch.cat(conds, dim=-1)
+        return conds
+
+    def timestep_independent(self, aligned_conditioning, conditioning_latent, expected_seq_len, return_code_pred):
+        # Shuffle aligned_latent to BxCxS format
+        if is_latent(aligned_conditioning):
+            aligned_conditioning = aligned_conditioning.permute(0, 2, 1)
+
+        conds = conditioning_latent
         cond_emb = conds.mean(dim=-1)
         cond_scale, cond_shift = torch.chunk(cond_emb, 2, dim=1)
         if is_latent(aligned_conditioning):
@@ -257,19 +260,19 @@ class DiffusionTts(nn.Module):
             mel_pred = mel_pred * unconditioned_batches.logical_not()
             return expanded_code_emb, mel_pred
 
-    def forward(self, x, timesteps, aligned_conditioning=None, conditioning_input=None, precomputed_aligned_embeddings=None, conditioning_free=False, return_code_pred=False):
+    def forward(self, x, timesteps, aligned_conditioning=None, conditioning_latent=None, precomputed_aligned_embeddings=None, conditioning_free=False, return_code_pred=False):
         """
         Apply the model to an input batch.
 
         :param x: an [N x C x ...] Tensor of inputs.
         :param timesteps: a 1-D batch of timesteps.
         :param aligned_conditioning: an aligned latent or sequence of tokens providing useful data about the sample to be produced.
-        :param conditioning_input: a full-resolution audio clip that is used as a reference to the style you want decoded.
+        :param conditioning_latent: a pre-computed conditioning latent; see get_conditioning().
         :param precomputed_aligned_embeddings: Embeddings returned from self.timestep_independent()
         :param conditioning_free: When set, all conditioning inputs (including tokens and conditioning_input) will not be considered.
         :return: an [N x C x ...] Tensor of outputs.
         """
-        assert precomputed_aligned_embeddings is not None or (aligned_conditioning is not None and conditioning_input is not None)
+        assert precomputed_aligned_embeddings is not None or (aligned_conditioning is not None and conditioning_latent is not None)
         assert not (return_code_pred and precomputed_aligned_embeddings is not None)  # These two are mutually exclusive.
 
         unused_params = []
@@ -281,7 +284,7 @@ class DiffusionTts(nn.Module):
         if precomputed_aligned_embeddings is not None:
             code_emb = precomputed_aligned_embeddings
         else:
-            code_emb, mel_pred = self.timestep_independent(aligned_conditioning, conditioning_input, x.shape[-1], True)
+            code_emb, mel_pred = self.timestep_independent(aligned_conditioning, conditioning_latent, x.shape[-1], True)
             if is_latent(aligned_conditioning):
                 unused_params.extend(list(self.code_converter.parameters()) + list(self.code_embedding.parameters()))
             else:
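Mirroring the autoregressive change, `do_spectrogram_diffusion()` in api.py now passes a pre-computed conditioning latent into `timestep_independent()`, whose output is fed to `forward()` as `precomputed_aligned_embeddings` at every diffusion step. A fragment of that flow; `diffusion`, `cond_mels`, `ar_latents` and `output_seq_len` are assumed to exist, so this is a sketch only:

```python
import torch

with torch.no_grad():
    # Once per voice: encode the conditioning MELs into a latent.
    conditioning_latent = diffusion.get_conditioning(cond_mels)

    # Once per clip: everything that depends on neither x nor the timestep.
    precomputed = diffusion.timestep_independent(ar_latents, conditioning_latent,
                                                 output_seq_len, False)

    # Each sampling step then only needs the cached embeddings, e.g.:
    # diffusion(x_t, t, precomputed_aligned_embeddings=precomputed)
```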
tortoise/read.py
CHANGED
@@ -5,7 +5,7 @@ import torch
 import torchaudio
 
 from api import TextToSpeech
-from tortoise.utils.audio import load_audio, get_voices
+from tortoise.utils.audio import load_audio, get_voices, load_voices
 
 
 def split_and_recombine_text(texts, desired_length=200, max_len=300):
@@ -40,7 +40,6 @@ if __name__ == '__main__':
     args = parser.parse_args()
 
     outpath = args.output_path
-    voices = get_voices()
     selected_voices = args.voice.split(',')
     regenerate = args.regenerate
     if regenerate is not None:
@@ -58,25 +57,15 @@ if __name__ == '__main__':
             voice_sel = selected_voice.split('&')
         else:
             voice_sel = [selected_voice]
-        cond_paths = []
-        for vsel in voice_sel:
-            if vsel not in voices.keys():
-                print(f'Error: voice {vsel} not available. Skipping.')
-                continue
-            cond_paths.extend(voices[vsel])
-        if not cond_paths:
-            print('Error: no valid voices specified. Try again.')
 
-        conds = []
-        for cond_path in cond_paths:
-            c = load_audio(cond_path, 22050)
-            conds.append(c)
+        voice_samples, conditioning_latents = load_voices(voice_sel)
         all_parts = []
         for j, text in enumerate(texts):
             if regenerate is not None and j not in regenerate:
                 all_parts.append(load_audio(os.path.join(voice_outpath, f'{j}.wav'), 24000))
                 continue
-            gen = tts.tts_with_preset(text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
+            gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
+                                      preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider)
             gen = gen.squeeze(0).cpu()
             torchaudio.save(os.path.join(voice_outpath, f'{j}.wav'), gen, 24000)
             all_parts.append(gen)
tortoise/utils/audio.py
CHANGED
@@ -91,6 +91,37 @@ def get_voices():
     return voices
 
 
+def load_voice(voice):
+    voices = get_voices()
+    paths = voices[voice]
+    if len(paths) == 1 and paths[0].endswith('.pth'):
+        return None, torch.load(paths[0])
+    else:
+        conds = []
+        for cond_path in paths:
+            c = load_audio(cond_path, 22050)
+            conds.append(c)
+        return conds, None
+
+
+def load_voices(voices):
+    latents = []
+    clips = []
+    for voice in voices:
+        latent, clip = load_voice(voice)
+        if latent is None:
+            assert len(latents) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
+            clips.extend(clip)
+        elif voice is None:
+            assert len(voices) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
+            latents.append(latent)
+    if len(latents) == 0:
+        return clips
+    else:
+        latents = torch.stack(latents, dim=0)
+        return latents.mean(dim=0)
+
+
 class TacotronSTFT(torch.nn.Module):
     def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                  n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
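
Tying this back to the README: a "latent voice" is just a subdirectory of voices/ whose only file is a .pth pickle of the latent tuple, which `load_voice()` then returns in the conditioning_latents slot. A sketch of creating one from the script's output (directory and file names are placeholders, paths are assumed relative to `tortoise/`, and it assumes `get_voices()` picks the .pth file up):

```python
import os
import torch

from tortoise.utils.audio import load_voice

# Reuse the tuple dumped by get_conditioning_latents.py.
latents = torch.load('results/conditioning_latents/pat.pth')

os.makedirs('voices/pat_latent', exist_ok=True)
torch.save(latents, 'voices/pat_latent/pat_latent.pth')

voice_samples, conditioning_latents = load_voice('pat_latent')
# voice_samples is None; conditioning_latents is the (autoregressive_latent, diffusion_latent) tuple.
```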
|