jbetker committed on
Commit
fc8d52a
·
1 Parent(s): 76c30fe

update do_tts

Browse files
Files changed (2) hide show
  1. api.py +21 -7
  2. do_tts.py +9 -25
api.py CHANGED
@@ -157,10 +157,23 @@ class TextToSpeech:
157
 
158
  self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
159
  model_dim=1024,
160
- heads=16, number_text_tokens=256, start_text_token=255, checkpointing=False,
161
  train_solo_embeddings=False,
162
  average_conditioning_embeddings=True).cpu().eval()
163
  self.autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
  self.clip = VoiceCLIP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12,
166
  text_seq_len=350, text_heads=8,
@@ -202,7 +215,7 @@ class TextToSpeech:
202
 
203
  def tts(self, text, voice_samples, k=1,
204
  # autoregressive generation parameters follow
205
- num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8,
206
  # diffusion generation parameters follow
207
  diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0,
208
  **hf_generate_kwargs):
@@ -232,8 +245,9 @@ class TextToSpeech:
232
  num_return_sequences=self.autoregressive_batch_size,
233
  length_penalty=length_penalty,
234
  repetition_penalty=repetition_penalty,
 
235
  **hf_generate_kwargs)
236
- padding_needed = self.autoregressive.max_mel_tokens - codes.shape[1]
237
  codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
238
  samples.append(codes)
239
  self.autoregressive = self.autoregressive.cpu()
@@ -253,11 +267,11 @@ class TextToSpeech:
253
  # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
254
  # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
255
  # results, but will increase memory usage.
256
- self.autoregressive = self.autoregressive.cuda()
257
- best_latents = self.autoregressive(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
258
- torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=conds.device),
259
  return_latent=True, clip_inputs=False)
260
- self.autoregressive = self.autoregressive.cpu()
261
 
262
  print("Performing vocoding..")
263
  wav_candidates = []
 
157
 
158
  self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
159
  model_dim=1024,
160
+ heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
161
  train_solo_embeddings=False,
162
  average_conditioning_embeddings=True).cpu().eval()
163
  self.autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))
164
+ '''
165
+ self.autoregressive = UnifiedVoice(max_mel_tokens=2048, max_text_tokens=1024, max_conditioning_inputs=1, layers=42,
166
+ model_dim=1152, heads=18, number_text_tokens=256, train_solo_embeddings=False,
167
+ average_conditioning_embeddings=True, types=2).cpu().eval()
168
+ self.autoregressive.load_state_dict(torch.load('X:\\dlas\\experiments\\train_gpt_tts_xl\\models\\15250_gpt_ema.pth'))
169
+ '''
170
+
171
+ self.autoregressive_for_diffusion = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
172
+ model_dim=1024,
173
+ heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
174
+ train_solo_embeddings=False,
175
+ average_conditioning_embeddings=True).cpu().eval()
176
+ self.autoregressive_for_diffusion.load_state_dict(torch.load('.models/autoregressive.pth'))
177
 
178
  self.clip = VoiceCLIP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12,
179
  text_seq_len=350, text_heads=8,
 
215
 
216
  def tts(self, text, voice_samples, k=1,
217
  # autoregressive generation parameters follow
218
+ num_autoregressive_samples=512, temperature=.8, length_penalty=1, repetition_penalty=2.0, top_p=.8, max_mel_tokens=500,
219
  # diffusion generation parameters follow
220
  diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0,
221
  **hf_generate_kwargs):
 
245
  num_return_sequences=self.autoregressive_batch_size,
246
  length_penalty=length_penalty,
247
  repetition_penalty=repetition_penalty,
248
+ max_generate_length=max_mel_tokens,
249
  **hf_generate_kwargs)
250
+ padding_needed = max_mel_tokens - codes.shape[1]
251
  codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
252
  samples.append(codes)
253
  self.autoregressive = self.autoregressive.cpu()
 
267
  # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
268
  # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
269
  # results, but will increase memory usage.
270
+ self.autoregressive_for_diffusion = self.autoregressive_for_diffusion.cuda()
271
+ best_latents = self.autoregressive_for_diffusion(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
272
+ torch.tensor([best_results.shape[-1]*self.autoregressive_for_diffusion.mel_length_compression], device=conds.device),
273
  return_latent=True, clip_inputs=False)
274
+ self.autoregressive_for_diffusion = self.autoregressive_for_diffusion.cpu()
275
 
276
  print("Performing vocoding..")
277
  wav_candidates = []
do_tts.py CHANGED
@@ -1,35 +1,17 @@
1
  import argparse
2
  import os
3
 
4
- import torch
5
- import torch.nn.functional as F
6
  import torchaudio
7
 
8
- from api import TextToSpeech, load_conditioning
9
- from utils.audio import load_audio
10
- from utils.tokenizer import VoiceBpeTokenizer
11
 
12
  if __name__ == '__main__':
13
- # These are voices drawn randomly from the training set. You are free to substitute your own voices in, but testing
14
- # has shown that the model does not generalize to new voices very well.
15
- preselected_cond_voices = {
16
- # Male voices
17
- 'dotrice': ['voices/dotrice/1.wav', 'voices/dotrice/2.wav'],
18
- 'harris': ['voices/harris/1.wav', 'voices/harris/2.wav'],
19
- 'lescault': ['voices/lescault/1.wav', 'voices/lescault/2.wav'],
20
- 'otto': ['voices/otto/1.wav', 'voices/otto/2.wav'],
21
- 'obama': ['voices/obama/1.wav', 'voices/obama/2.wav'],
22
- # Female voices
23
- 'atkins': ['voices/atkins/1.wav', 'voices/atkins/2.wav'],
24
- 'grace': ['voices/grace/1.wav', 'voices/grace/2.wav'],
25
- 'kennard': ['voices/kennard/1.wav', 'voices/kennard/2.wav'],
26
- 'mol': ['voices/mol/1.wav', 'voices/mol/2.wav'],
27
- }
28
-
29
  parser = argparse.ArgumentParser()
30
  parser.add_argument('--text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.")
31
- parser.add_argument('--voice', type=str, help='Use a preset conditioning voice (defined above). Overrides cond_path.', default='obama,dotrice,harris,lescault,otto,atkins,grace,kennard,mol')
32
- parser.add_argument('--num_samples', type=int, help='How many total outputs the autoregressive transformer should produce.', default=128)
 
33
  parser.add_argument('--batch_size', type=int, help='How many samples to process at once in the autoregressive model.', default=16)
34
  parser.add_argument('--num_diffusion_samples', type=int, help='Number of outputs that progress to the diffusion stage.', default=16)
35
  parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
@@ -38,8 +20,10 @@ if __name__ == '__main__':
38
 
39
  tts = TextToSpeech(autoregressive_batch_size=args.batch_size)
40
 
41
- for voice in args.voice.split(','):
42
- cond_paths = preselected_cond_voices[voice]
 
 
43
  conds = []
44
  for cond_path in cond_paths:
45
  c = load_audio(cond_path, 22050)
 
1
  import argparse
2
  import os
3
 
 
 
4
  import torchaudio
5
 
6
+ from api import TextToSpeech
7
+ from utils.audio import load_audio, get_voices
 
8
 
9
  if __name__ == '__main__':
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  parser = argparse.ArgumentParser()
11
  parser.add_argument('--text', type=str, help='Text to speak.', default="I am a language model that has learned to speak.")
12
+ parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
13
+ 'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='patrick_stewart')
14
+ parser.add_argument('--num_samples', type=int, help='How many total outputs the autoregressive transformer should produce.', default=256)
15
  parser.add_argument('--batch_size', type=int, help='How many samples to process at once in the autoregressive model.', default=16)
16
  parser.add_argument('--num_diffusion_samples', type=int, help='Number of outputs that progress to the diffusion stage.', default=16)
17
  parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
 
20
 
21
  tts = TextToSpeech(autoregressive_batch_size=args.batch_size)
22
 
23
+ voices = get_voices()
24
+ selected_voices = args.voice.split(',')
25
+ for voice in selected_voices:
26
+ cond_paths = voices[voice]
27
  conds = []
28
  for cond_path in cond_paths:
29
  c = load_audio(cond_path, 22050)