Plachta committed on
Commit
9d434bb
·
1 Parent(s): 95019e6

Upload 67 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +281 -85
  2. descriptions.py +3 -4
  3. macros.py +39 -0
  4. presets/acou_1.npz +3 -0
  5. presets/acou_2.npz +3 -0
  6. presets/acou_3.npz +3 -0
  7. presets/acou_4.npz +3 -0
  8. presets/amused.npz +3 -0
  9. presets/anger.npz +3 -0
  10. presets/babara.npz +3 -0
  11. presets/bronya_1.npz +3 -0
  12. presets/dingzhen.npz +3 -0
  13. presets/disgust.npz +3 -0
  14. presets/emo_amused.npz +3 -0
  15. presets/emo_anger.npz +3 -0
  16. presets/emo_neutral.npz +3 -0
  17. presets/emo_sleepy.npz +3 -0
  18. presets/en2zh_tts_1.npz +3 -0
  19. presets/en2zh_tts_2.npz +3 -0
  20. presets/en2zh_tts_3.npz +3 -0
  21. presets/en2zh_tts_4.npz +3 -0
  22. presets/fuxuan_2.npz +3 -0
  23. presets/librispeech_1.npz +3 -0
  24. presets/librispeech_2.npz +3 -0
  25. presets/librispeech_3.npz +3 -0
  26. presets/librispeech_4.npz +3 -0
  27. presets/neutral.npz +3 -0
  28. presets/paimon_1.npz +3 -0
  29. presets/rosalia.npz +3 -0
  30. presets/seel.npz +3 -0
  31. presets/sleepiness.npz +3 -0
  32. presets/vctk_1.npz +3 -0
  33. presets/vctk_2.npz +3 -0
  34. presets/vctk_3.npz +3 -0
  35. presets/vctk_4.npz +3 -0
  36. presets/yaesakura.npz +3 -0
  37. presets/zh2en_tts_1.npz +3 -0
  38. presets/zh2en_tts_2.npz +3 -0
  39. presets/zh2en_tts_3.npz +3 -0
  40. presets/zh2en_tts_4.npz +3 -0
  41. requirements.txt +9 -3
  42. utils/__pycache__/__init__.cpython-38.pyc +0 -0
  43. utils/g2p/__pycache__/__init__.cpython-38.pyc +0 -0
  44. utils/g2p/__pycache__/cleaners.cpython-38.pyc +0 -0
  45. utils/g2p/__pycache__/english.cpython-38.pyc +0 -0
  46. utils/g2p/__pycache__/japanese.cpython-38.pyc +0 -0
  47. utils/g2p/__pycache__/mandarin.cpython-38.pyc +0 -0
  48. utils/g2p/__pycache__/symbols.cpython-38.pyc +0 -0
  49. utils/generation.py +256 -0
  50. utils/prompt_making.py +115 -0
app.py CHANGED
@@ -4,10 +4,18 @@ import os
4
  import pathlib
5
  import time
6
  import tempfile
7
- from pathlib import Path
8
- temp = pathlib.WindowsPath
9
- pathlib.WindowsPath = pathlib.PosixPath
 
 
 
 
10
  os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
 
 
 
 
11
  import torch
12
  import torchaudio
13
  import random
@@ -22,48 +30,21 @@ from data.collation import get_text_token_collater
22
  from models.vallex import VALLE
23
  from utils.g2p import PhonemeBpeTokenizer
24
  from descriptions import *
 
25
 
26
  import gradio as gr
27
  import whisper
28
- torch.set_num_threads(1)
29
- torch.set_num_interop_threads(1)
30
- torch._C._jit_set_profiling_executor(False)
31
- torch._C._jit_set_profiling_mode(False)
32
- torch._C._set_graph_executor_optimize(False)
33
- # torch.manual_seed(42)
34
-
35
- lang2token = {
36
- 'zh': "[ZH]",
37
- 'ja': "[JA]",
38
- "en": "[EN]",
39
- }
40
-
41
- lang2code = {
42
- 'zh': 0,
43
- 'ja': 1,
44
- "en": 2,
45
- }
46
-
47
- token2lang = {
48
- '[ZH]': "zh",
49
- '[JA]': "ja",
50
- "[EN]": "en",
51
- }
52
-
53
- code2lang = {
54
- 0: 'zh',
55
- 1: 'ja',
56
- 2: "en",
57
- }
58
 
 
59
 
 
60
 
61
- langdropdown2token = {
62
- 'English': "[EN]",
63
- '中文': "[ZH]",
64
- '日本語': "[JA]",
65
- 'mix': "",
66
- }
67
 
68
  text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
69
  text_collater = get_text_token_collater()
@@ -74,30 +55,33 @@ if torch.cuda.is_available():
74
 
75
  # VALL-E-X model
76
  model = VALLE(
77
- 1024,
78
- 16,
79
- 12,
80
- norm_first=True,
81
- add_prenet=False,
82
- prefix_mode=1,
83
- share_embedding=True,
84
- nar_scale_factor=1.0,
85
- prepend_bos=True,
86
- num_quantizers=8,
87
- )
88
  checkpoint = torch.load("./epoch-10.pt", map_location='cpu')
89
  missing_keys, unexpected_keys = model.load_state_dict(
90
  checkpoint["model"], strict=True
91
  )
92
  assert not missing_keys
93
- model.to('cpu')
94
  model.eval()
95
 
96
  # Encodec model
97
  audio_tokenizer = AudioTokenizer(device)
98
 
99
  # ASR
100
- whisper_model = whisper.load_model("medium")
 
 
 
 
101
 
102
  def clear_prompts():
103
  try:
@@ -136,24 +120,38 @@ def transcribe_one(model, audio_path):
136
  text_pr += "."
137
  return lang, text_pr
138
 
139
- def make_npz_prompt(name, uploaded_audio, recorded_audio):
140
  global model, text_collater, text_tokenizer, audio_tokenizer
141
  clear_prompts()
142
  audio_prompt = uploaded_audio if uploaded_audio is not None else recorded_audio
143
  sr, wav_pr = audio_prompt
144
- wav_pr = torch.FloatTensor(wav_pr) / 32768
 
 
 
 
 
145
  if wav_pr.size(-1) == 2:
146
  wav_pr = wav_pr.mean(-1, keepdim=False)
147
- text_pr, lang_pr = make_prompt(name, wav_pr, sr, save=False)
148
-
 
 
 
 
 
 
 
 
149
  # tokenize audio
150
- encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr.unsqueeze(0), sr))
151
  audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()
152
 
153
  # tokenize text
 
154
  text_tokens, enroll_x_lens = text_collater(
155
  [
156
- text_tokenizer.tokenize(text=f"{text_pr}".strip())
157
  ]
158
  )
159
 
@@ -166,8 +164,8 @@ def make_npz_prompt(name, uploaded_audio, recorded_audio):
166
 
167
 
168
  def make_prompt(name, wav, sr, save=True):
169
-
170
  global whisper_model
 
171
  if not isinstance(wav, torch.FloatTensor):
172
  wav = torch.tensor(wav)
173
  if wav.abs().max() > 1:
@@ -187,19 +185,41 @@ def make_prompt(name, wav, sr, save=True):
187
  os.remove(f"./prompts/{name}.wav")
188
  os.remove(f"./prompts/{name}.txt")
189
 
 
190
  torch.cuda.empty_cache()
191
  return text, lang
192
 
193
  @torch.no_grad()
194
- def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt):
 
 
195
  global model, text_collater, text_tokenizer, audio_tokenizer
 
196
  audio_prompt = audio_prompt if audio_prompt is not None else record_audio_prompt
197
  sr, wav_pr = audio_prompt
198
- wav_pr = torch.FloatTensor(wav_pr)/32768
 
 
 
 
 
199
  if wav_pr.size(-1) == 2:
200
  wav_pr = wav_pr.mean(-1, keepdim=False)
201
- text_pr, lang_pr = make_prompt(str(random.randint(0, 10000000)), wav_pr, sr, save=False)
202
- lang_token = langdropdown2token[language]
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  lang = token2lang[lang_token]
204
  text = lang_token + text + lang_token
205
 
@@ -207,24 +227,28 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt):
207
  model.to(device)
208
 
209
  # tokenize audio
210
- encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr.unsqueeze(0), sr))
211
  audio_prompts = encoded_frames[0][0].transpose(2, 1).to(device)
212
 
213
  # tokenize text
214
  logging.info(f"synthesize text: {text}")
 
215
  text_tokens, text_tokens_lens = text_collater(
216
  [
217
- text_tokenizer.tokenize(text=f"{text_pr}{text}".strip())
218
  ]
219
  )
220
 
221
  enroll_x_lens = None
222
  if text_pr:
223
- _, enroll_x_lens = text_collater(
 
224
  [
225
- text_tokenizer.tokenize(text=f"{text_pr}".strip())
226
  ]
227
  )
 
 
228
  lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
229
  encoded_frames = model.inference(
230
  text_tokens.to(device),
@@ -234,7 +258,7 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt):
234
  top_k=-100,
235
  temperature=1,
236
  prompt_language=lang_pr,
237
- text_language=lang,
238
  )
239
  samples = audio_tokenizer.decode(
240
  [(encoded_frames.transpose(2, 1), None)]
@@ -248,17 +272,24 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt):
248
  return message, (24000, samples[0][0].cpu().numpy())
249
 
250
  @torch.no_grad()
251
- def infer_from_prompt(text, language, accent, prompt_file):
252
- # onload model
253
- model.to(device)
254
  clear_prompts()
 
255
  # text to synthesize
256
- lang_token = langdropdown2token[language]
 
 
 
257
  lang = token2lang[lang_token]
258
  text = lang_token + text + lang_token
259
 
260
  # load prompt
261
- prompt_data = np.load(prompt_file.name)
 
 
 
262
  audio_prompts = prompt_data['audio_tokens']
263
  text_prompts = prompt_data['text_tokens']
264
  lang_pr = prompt_data['lang_code']
@@ -270,9 +301,10 @@ def infer_from_prompt(text, language, accent, prompt_file):
270
 
271
  enroll_x_lens = text_prompts.shape[-1]
272
  logging.info(f"synthesize text: {text}")
 
273
  text_tokens, text_tokens_lens = text_collater(
274
  [
275
- text_tokenizer.tokenize(text=f"_{text}".strip())
276
  ]
277
  )
278
  text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
@@ -287,13 +319,11 @@ def infer_from_prompt(text, language, accent, prompt_file):
287
  top_k=-100,
288
  temperature=1,
289
  prompt_language=lang_pr,
290
- text_language=lang,
291
  )
292
  samples = audio_tokenizer.decode(
293
  [(encoded_frames.transpose(2, 1), None)]
294
  )
295
-
296
- # offload model
297
  model.to('cpu')
298
  torch.cuda.empty_cache()
299
 
@@ -301,6 +331,144 @@ def infer_from_prompt(text, language, accent, prompt_file):
301
  return message, (24000, samples[0][0].cpu().numpy())
302
 
303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  def main():
305
  app = gr.Blocks()
306
  with app:
@@ -312,9 +480,12 @@ def main():
312
 
313
  textbox = gr.TextArea(label="Text",
314
  placeholder="Type your sentence here",
315
- value="VALLEX can synthesize personalized speech in another language for a monolingual speaker.", elem_id=f"tts-input")
316
- language_dropdown = gr.Dropdown(choices=['English', '中文', '日本語'], value='English', label='language')
317
  accent_dropdown = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent', label='accent')
 
 
 
318
  upload_audio_prompt = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
319
  record_audio_prompt = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
320
  with gr.Column():
@@ -322,7 +493,7 @@ def main():
322
  audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
323
  btn = gr.Button("Generate!")
324
  btn.click(infer_from_audio,
325
- inputs=[textbox, language_dropdown, accent_dropdown, upload_audio_prompt, record_audio_prompt],
326
  outputs=[text_output, audio_output])
327
  textbox_mp = gr.TextArea(label="Prompt name",
328
  placeholder="Name your prompt here",
@@ -330,7 +501,7 @@ def main():
330
  btn_mp = gr.Button("Make prompt!")
331
  prompt_output = gr.File(interactive=False)
332
  btn_mp.click(make_npz_prompt,
333
- inputs=[textbox_mp, upload_audio_prompt, record_audio_prompt],
334
  outputs=[text_output, prompt_output])
335
  with gr.Tab("Make prompt"):
336
  gr.Markdown(make_prompt_md)
@@ -339,6 +510,10 @@ def main():
339
  textbox2 = gr.TextArea(label="Prompt name",
340
  placeholder="Name your prompt here",
341
  value="prompt_1", elem_id=f"prompt-name")
 
 
 
 
342
  upload_audio_prompt_2 = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
343
  record_audio_prompt_2 = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
344
  with gr.Column():
@@ -346,7 +521,7 @@ def main():
346
  prompt_output_2 = gr.File(interactive=False)
347
  btn_2 = gr.Button("Make!")
348
  btn_2.click(make_npz_prompt,
349
- inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2],
350
  outputs=[text_output_2, prompt_output_2])
351
  with gr.Tab("Infer from prompt"):
352
  gr.Markdown(infer_from_prompt_md)
@@ -354,19 +529,40 @@ def main():
354
  with gr.Column():
355
  textbox_3 = gr.TextArea(label="Text",
356
  placeholder="Type your sentence here",
357
- value="VALLEX can synthesize personalized speech in another language for a monolingual speaker.", elem_id=f"tts-input")
358
- language_dropdown_3 = gr.Dropdown(choices=['English', '中文', '日本語'], value='English',
359
  label='language')
360
  accent_dropdown_3 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
361
  label='accent')
 
362
  prompt_file = gr.File(file_count='single', file_types=['.npz'], interactive=True)
363
  with gr.Column():
364
  text_output_3 = gr.Textbox(label="Message")
365
  audio_output_3 = gr.Audio(label="Output Audio", elem_id="tts-audio")
366
  btn_3 = gr.Button("Generate!")
367
  btn_3.click(infer_from_prompt,
368
- inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, prompt_file],
369
  outputs=[text_output_3, audio_output_3])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
 
371
  app.launch()
372
 
 
4
  import pathlib
5
  import time
6
  import tempfile
7
+ import platform
8
+ if platform.system().lower() == 'windows':
9
+ temp = pathlib.PosixPath
10
+ pathlib.PosixPath = pathlib.WindowsPath
11
+ elif platform.system().lower() == 'linux':
12
+ temp = pathlib.WindowsPath
13
+ pathlib.WindowsPath = pathlib.PosixPath
14
  os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
15
+
16
+ import langid
17
+ langid.set_languages(['en', 'zh', 'ja'])
18
+
19
  import torch
20
  import torchaudio
21
  import random
 
30
  from models.vallex import VALLE
31
  from utils.g2p import PhonemeBpeTokenizer
32
  from descriptions import *
33
+ from macros import *
34
 
35
  import gradio as gr
36
  import whisper
37
+ import multiprocessing
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
+ thread_count = multiprocessing.cpu_count()
40
 
41
+ print("Use",thread_count,"cpu cores for computing")
42
 
43
+ torch.set_num_threads(thread_count)
44
+ torch.set_num_interop_threads(thread_count)
45
+ torch._C._jit_set_profiling_executor(False)
46
+ torch._C._jit_set_profiling_mode(False)
47
+ torch._C._set_graph_executor_optimize(False)
 
48
 
49
  text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
50
  text_collater = get_text_token_collater()
 
55
 
56
  # VALL-E-X model
57
  model = VALLE(
58
+ N_DIM,
59
+ NUM_HEAD,
60
+ NUM_LAYERS,
61
+ norm_first=True,
62
+ add_prenet=False,
63
+ prefix_mode=PREFIX_MODE,
64
+ share_embedding=True,
65
+ nar_scale_factor=1.0,
66
+ prepend_bos=True,
67
+ num_quantizers=NUM_QUANTIZERS,
68
+ )
69
  checkpoint = torch.load("./epoch-10.pt", map_location='cpu')
70
  missing_keys, unexpected_keys = model.load_state_dict(
71
  checkpoint["model"], strict=True
72
  )
73
  assert not missing_keys
 
74
  model.eval()
75
 
76
  # Encodec model
77
  audio_tokenizer = AudioTokenizer(device)
78
 
79
  # ASR
80
+ whisper_model = whisper.load_model("medium").cpu()
81
+
82
+ # Voice Presets
83
+ preset_list = os.walk("./presets/").__next__()[2]
84
+ preset_list = [preset[:-4] for preset in preset_list if preset.endswith(".npz")]
85
 
86
  def clear_prompts():
87
  try:
 
120
  text_pr += "."
121
  return lang, text_pr
122
 
123
+ def make_npz_prompt(name, uploaded_audio, recorded_audio, transcript_content):
124
  global model, text_collater, text_tokenizer, audio_tokenizer
125
  clear_prompts()
126
  audio_prompt = uploaded_audio if uploaded_audio is not None else recorded_audio
127
  sr, wav_pr = audio_prompt
128
+ if len(wav_pr) / sr > 15:
129
+ return "Rejected, Audio too long (should be less than 15 seconds)", None
130
+ if not isinstance(wav_pr, torch.FloatTensor):
131
+ wav_pr = torch.FloatTensor(wav_pr)
132
+ if wav_pr.abs().max() > 1:
133
+ wav_pr /= wav_pr.abs().max()
134
  if wav_pr.size(-1) == 2:
135
  wav_pr = wav_pr.mean(-1, keepdim=False)
136
+ if wav_pr.ndim == 1:
137
+ wav_pr = wav_pr.unsqueeze(0)
138
+ assert wav_pr.ndim and wav_pr.size(0) == 1
139
+
140
+ if transcript_content == "":
141
+ text_pr, lang_pr = make_prompt(name, wav_pr, sr, save=False)
142
+ else:
143
+ lang_pr = langid.classify(str(transcript_content))[0]
144
+ lang_token = lang2token[lang_pr]
145
+ text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"
146
  # tokenize audio
147
+ encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
148
  audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()
149
 
150
  # tokenize text
151
+ phonemes, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
152
  text_tokens, enroll_x_lens = text_collater(
153
  [
154
+ phonemes
155
  ]
156
  )
157
 
 
164
 
165
 
166
  def make_prompt(name, wav, sr, save=True):
 
167
  global whisper_model
168
+ whisper_model.to(device)
169
  if not isinstance(wav, torch.FloatTensor):
170
  wav = torch.tensor(wav)
171
  if wav.abs().max() > 1:
 
185
  os.remove(f"./prompts/{name}.wav")
186
  os.remove(f"./prompts/{name}.txt")
187
 
188
+ whisper_model.cpu()
189
  torch.cuda.empty_cache()
190
  return text, lang
191
 
192
  @torch.no_grad()
193
+ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt, transcript_content):
194
+ if len(text) > 150:
195
+ return "Rejected, Text too long (should be less than 150 characters)", None
196
  global model, text_collater, text_tokenizer, audio_tokenizer
197
+ model.to(device)
198
  audio_prompt = audio_prompt if audio_prompt is not None else record_audio_prompt
199
  sr, wav_pr = audio_prompt
200
+ if len(wav_pr) / sr > 15:
201
+ return "Rejected, Audio too long (should be less than 15 seconds)", None
202
+ if not isinstance(wav_pr, torch.FloatTensor):
203
+ wav_pr = torch.FloatTensor(wav_pr)
204
+ if wav_pr.abs().max() > 1:
205
+ wav_pr /= wav_pr.abs().max()
206
  if wav_pr.size(-1) == 2:
207
  wav_pr = wav_pr.mean(-1, keepdim=False)
208
+ if wav_pr.ndim == 1:
209
+ wav_pr = wav_pr.unsqueeze(0)
210
+ assert wav_pr.ndim and wav_pr.size(0) == 1
211
+
212
+ if transcript_content == "":
213
+ text_pr, lang_pr = make_prompt('dummy', wav_pr, sr, save=False)
214
+ else:
215
+ lang_pr = langid.classify(str(transcript_content))[0]
216
+ lang_token = lang2token[lang_pr]
217
+ text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"
218
+
219
+ if language == 'auto-detect':
220
+ lang_token = lang2token[langid.classify(text)[0]]
221
+ else:
222
+ lang_token = langdropdown2token[language]
223
  lang = token2lang[lang_token]
224
  text = lang_token + text + lang_token
225
 
 
227
  model.to(device)
228
 
229
  # tokenize audio
230
+ encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
231
  audio_prompts = encoded_frames[0][0].transpose(2, 1).to(device)
232
 
233
  # tokenize text
234
  logging.info(f"synthesize text: {text}")
235
+ phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
236
  text_tokens, text_tokens_lens = text_collater(
237
  [
238
+ phone_tokens
239
  ]
240
  )
241
 
242
  enroll_x_lens = None
243
  if text_pr:
244
+ text_prompts, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
245
+ text_prompts, enroll_x_lens = text_collater(
246
  [
247
+ text_prompts
248
  ]
249
  )
250
+ text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
251
+ text_tokens_lens += enroll_x_lens
252
  lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
253
  encoded_frames = model.inference(
254
  text_tokens.to(device),
 
258
  top_k=-100,
259
  temperature=1,
260
  prompt_language=lang_pr,
261
+ text_language=langs if accent == "no-accent" else lang,
262
  )
263
  samples = audio_tokenizer.decode(
264
  [(encoded_frames.transpose(2, 1), None)]
 
272
  return message, (24000, samples[0][0].cpu().numpy())
273
 
274
  @torch.no_grad()
275
+ def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
276
+ if len(text) > 150:
277
+ return "Rejected, Text too long (should be less than 150 characters)", None
278
  clear_prompts()
279
+ model.to(device)
280
  # text to synthesize
281
+ if language == 'auto-detect':
282
+ lang_token = lang2token[langid.classify(text)[0]]
283
+ else:
284
+ lang_token = langdropdown2token[language]
285
  lang = token2lang[lang_token]
286
  text = lang_token + text + lang_token
287
 
288
  # load prompt
289
+ if prompt_file is not None:
290
+ prompt_data = np.load(prompt_file.name)
291
+ else:
292
+ prompt_data = np.load(os.path.join("./presets/", f"{preset_prompt}.npz"))
293
  audio_prompts = prompt_data['audio_tokens']
294
  text_prompts = prompt_data['text_tokens']
295
  lang_pr = prompt_data['lang_code']
 
301
 
302
  enroll_x_lens = text_prompts.shape[-1]
303
  logging.info(f"synthesize text: {text}")
304
+ phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
305
  text_tokens, text_tokens_lens = text_collater(
306
  [
307
+ phone_tokens
308
  ]
309
  )
310
  text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
 
319
  top_k=-100,
320
  temperature=1,
321
  prompt_language=lang_pr,
322
+ text_language=langs if accent == "no-accent" else lang,
323
  )
324
  samples = audio_tokenizer.decode(
325
  [(encoded_frames.transpose(2, 1), None)]
326
  )
 
 
327
  model.to('cpu')
328
  torch.cuda.empty_cache()
329
 
 
331
  return message, (24000, samples[0][0].cpu().numpy())
332
 
333
 
334
+ from utils.sentence_cutter import split_text_into_sentences
335
def _load_npz_prompt(path):
    """Load an encoded voice prompt from an ``.npz`` file.

    Reads the ``audio_tokens``, ``text_tokens`` and ``lang_code`` entries.

    Returns:
        A tuple ``(audio_prompts, text_prompts, lang_pr)``: the audio tokens as
        an int32 tensor moved to ``device``, the text tokens as an int32 tensor,
        and the prompt language looked up through ``code2lang``.
    """
    # The context manager closes the archive handle once the arrays are read.
    with np.load(path) as prompt_data:
        audio_tokens = prompt_data['audio_tokens']
        text_tokens = prompt_data['text_tokens']
        lang_code = prompt_data['lang_code']
    audio_prompts = torch.tensor(audio_tokens).type(torch.int32).to(device)
    text_prompts = torch.tensor(text_tokens).type(torch.int32)
    return audio_prompts, text_prompts, code2lang[int(lang_code)]


@torch.no_grad()
def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='no-accent'):
    """Synthesize a long text sentence by sentence and return one audio clip.

    Two modes are available and are chosen automatically:

    * fixed-prompt: used when a prompt file or a preset is given; the same
      prompt is reused for every sentence.
    * sliding-window: used when neither is given; the previous sentence's
      output becomes the prompt for the next one (speaker identity may drift).

    Args:
        text: Text to synthesize; more than 1000 characters is rejected.
        preset_prompt: Name of a preset in ``./presets/`` (without ``.npz``),
            or ``""``/``None`` for no preset.
        prompt: Uploaded prompt file object exposing ``.name``, or
            ``None``/``""``. Takes precedence over ``preset_prompt``.
        language: ``'auto'`` / ``'auto-detect'`` to classify the text with
            ``langid``, otherwise a key of ``langdropdown2token``.
        accent: ``'no-accent'`` or a key of ``langdropdown2token``.

    Returns:
        ``(message, (24000, samples))`` on success, or ``(message, None)``
        when the request is rejected.
    """
    if len(text) > 1000:
        return "Rejected, Text too long (should be less than 1000 characters)", None

    # Same tests the prompt-loading branches use, so mode and loader agree.
    # A missing preset may arrive as None as well as "" (the UI dropdown is
    # created with value=None), and both must mean "no preset".
    has_prompt_file = prompt is not None and prompt != ""
    has_preset = preset_prompt is not None and preset_prompt != ""
    mode = 'fixed-prompt' if (has_prompt_file or has_preset) else 'sliding-window'

    sentences = split_text_into_sentences(text)
    cleaned = [s.replace("\n", "").strip(" ") for s in sentences]
    cleaned = [s for s in cleaned if s != ""]
    if not cleaned:
        # Nothing would be generated, so there are no frames to decode.
        return "Rejected, no text to synthesize", None

    # Detect language. The signature default 'auto' is accepted alongside the
    # UI value 'auto-detect'; previously the default raised KeyError.
    if language in ("auto", "auto-detect"):
        language = langid.classify(text)[0]
    else:
        language = token2lang[langdropdown2token[language]]
    lang_token = lang2token[language]

    # Accent control: None means "use the languages reported by the tokenizer".
    accent_lang = None if accent == "no-accent" else token2lang[langdropdown2token[accent]]

    model.to(device)
    try:
        if has_prompt_file:
            audio_prompts, text_prompts, lang_pr = _load_npz_prompt(prompt.name)
        elif has_preset:
            audio_prompts, text_prompts, lang_pr = _load_npz_prompt(
                os.path.join("./presets/", f"{preset_prompt}.npz")
            )
        else:
            # No prompt at all: start from empty prompts.
            audio_prompts = torch.zeros([1, 0, NUM_QUANTIZERS]).type(torch.int32).to(device)
            text_prompts = torch.zeros([1, 0]).type(torch.int32)
            lang_pr = language if language != 'mix' else 'en'

        complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
        for sentence in cleaned:
            tagged = lang_token + sentence + lang_token

            enroll_x_lens = text_prompts.shape[-1]
            logging.info(f"synthesize text: {tagged}")
            phone_tokens, langs = text_tokenizer.tokenize(text=f"_{tagged}".strip())
            text_tokens, text_tokens_lens = text_collater([phone_tokens])
            text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
            text_tokens_lens += enroll_x_lens

            encoded_frames = model.inference(
                text_tokens.to(device),
                text_tokens_lens.to(device),
                audio_prompts,
                enroll_x_lens=enroll_x_lens,
                top_k=-100,
                temperature=1,
                prompt_language=lang_pr,
                text_language=langs if accent_lang is None else accent_lang,
            )
            complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)

            if mode == 'sliding-window':
                # The sentence just generated becomes the prompt for the next.
                # (The original gated this on `torch.rand(1) < 1.0`, which is
                # always true, so the slide is unconditional.)
                audio_prompts = encoded_frames[:, :, -NUM_QUANTIZERS:]
                text_prompts = text_tokens[:, enroll_x_lens:]

        samples = audio_tokenizer.decode(
            [(complete_tokens, None)]
        )
    finally:
        # Always offload the model, even when loading or inference fails.
        model.to('cpu')
        torch.cuda.empty_cache()

    message = f"Cut into {len(sentences)} sentences"
    return message, (24000, samples[0][0].cpu().numpy())
470
+
471
+
472
  def main():
473
  app = gr.Blocks()
474
  with app:
 
480
 
481
  textbox = gr.TextArea(label="Text",
482
  placeholder="Type your sentence here",
483
+ value="Welcome back, Master. What can I do for you today?", elem_id=f"tts-input")
484
+ language_dropdown = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='English', label='auto-detect')
485
  accent_dropdown = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent', label='accent')
486
+ textbox_transcript = gr.TextArea(label="Transcript",
487
+ placeholder="Write transcript here. (leave empty to use whisper)",
488
+ value="", elem_id=f"prompt-name")
489
  upload_audio_prompt = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
490
  record_audio_prompt = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
491
  with gr.Column():
 
493
  audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
494
  btn = gr.Button("Generate!")
495
  btn.click(infer_from_audio,
496
+ inputs=[textbox, language_dropdown, accent_dropdown, upload_audio_prompt, record_audio_prompt, textbox_transcript],
497
  outputs=[text_output, audio_output])
498
  textbox_mp = gr.TextArea(label="Prompt name",
499
  placeholder="Name your prompt here",
 
501
  btn_mp = gr.Button("Make prompt!")
502
  prompt_output = gr.File(interactive=False)
503
  btn_mp.click(make_npz_prompt,
504
+ inputs=[textbox_mp, upload_audio_prompt, record_audio_prompt, textbox_transcript],
505
  outputs=[text_output, prompt_output])
506
  with gr.Tab("Make prompt"):
507
  gr.Markdown(make_prompt_md)
 
510
  textbox2 = gr.TextArea(label="Prompt name",
511
  placeholder="Name your prompt here",
512
  value="prompt_1", elem_id=f"prompt-name")
513
+ # Add a place to choose the language and enter the transcript
514
+ textbox_transcript2 = gr.TextArea(label="Transcript",
515
+ placeholder="Write transcript here. (leave empty to use whisper)",
516
+ value="", elem_id=f"prompt-name")
517
  upload_audio_prompt_2 = gr.Audio(label='uploaded audio prompt', source='upload', interactive=True)
518
  record_audio_prompt_2 = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
519
  with gr.Column():
 
521
  prompt_output_2 = gr.File(interactive=False)
522
  btn_2 = gr.Button("Make!")
523
  btn_2.click(make_npz_prompt,
524
+ inputs=[textbox2, upload_audio_prompt_2, record_audio_prompt_2, textbox_transcript2],
525
  outputs=[text_output_2, prompt_output_2])
526
  with gr.Tab("Infer from prompt"):
527
  gr.Markdown(infer_from_prompt_md)
 
529
  with gr.Column():
530
  textbox_3 = gr.TextArea(label="Text",
531
  placeholder="Type your sentence here",
532
+ value="Welcome back, Master. What can I do for you today?", elem_id=f"tts-input")
533
+ language_dropdown_3 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語', 'Mix'], value='auto-detect',
534
  label='language')
535
  accent_dropdown_3 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
536
  label='accent')
537
+ preset_dropdown_3 = gr.Dropdown(choices=preset_list, value=None, label='Voice preset')
538
  prompt_file = gr.File(file_count='single', file_types=['.npz'], interactive=True)
539
  with gr.Column():
540
  text_output_3 = gr.Textbox(label="Message")
541
  audio_output_3 = gr.Audio(label="Output Audio", elem_id="tts-audio")
542
  btn_3 = gr.Button("Generate!")
543
  btn_3.click(infer_from_prompt,
544
+ inputs=[textbox_3, language_dropdown_3, accent_dropdown_3, preset_dropdown_3, prompt_file],
545
  outputs=[text_output_3, audio_output_3])
546
+ with gr.Tab("Infer long text"):
547
+ gr.Markdown("This is a long text generation demo. You can use this to generate long audio. ")
548
+ with gr.Row():
549
+ with gr.Column():
550
+ textbox_4 = gr.TextArea(label="Text",
551
+ placeholder="Type your sentence here",
552
+ value=long_text_example, elem_id=f"tts-input")
553
+ language_dropdown_4 = gr.Dropdown(choices=['auto-detect', 'English', '中文', '日本語'], value='auto-detect',
554
+ label='language')
555
+ accent_dropdown_4 = gr.Dropdown(choices=['no-accent', 'English', '中文', '日本語'], value='no-accent',
556
+ label='accent')
557
+ preset_dropdown_4 = gr.Dropdown(choices=preset_list, value=None, label='Voice preset')
558
+ prompt_file_4 = gr.File(file_count='single', file_types=['.npz'], interactive=True)
559
+ with gr.Column():
560
+ text_output_4 = gr.TextArea(label="Message")
561
+ audio_output_4 = gr.Audio(label="Output Audio", elem_id="tts-audio")
562
+ btn_4 = gr.Button("Generate!")
563
+ btn_4.click(infer_long_text,
564
+ inputs=[textbox_4, preset_dropdown_4, prompt_file_4, language_dropdown_4, accent_dropdown_4],
565
+ outputs=[text_output_4, audio_output_4])
566
 
567
  app.launch()
568
 
descriptions.py CHANGED
@@ -1,8 +1,5 @@
1
  top_md = """
2
  # VALL-E X
3
- [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1yyD_sz531QntLKowMHo-XxorsFBCfKul?usp=sharing)
4
- [![GitHub](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)](https://github.com/Plachtaa/vallex-webui)
5
- Unofficial implementation of Microsoft's [VALL-E X](https://arxiv.org/pdf/2303.03926).<br>
6
  VALL-E X can synthesize high-quality personalized speech with only a 3-second enrolled recording of
7
  an unseen speaker as an acoustic prompt, even in another language for a monolingual speaker.<br>
8
  This implementation supports zero-shot, mono-lingual/cross-lingual text-to-speech functionality of three languages (English, Chinese, Japanese)<br>
@@ -24,4 +21,6 @@ Get a `.npz` file as the encoded audio prompt. Use it by **"Infer with prompt"**
24
  infer_from_prompt_md = """
25
  Faster than **"Infer from audio"**.<br>
26
  You need to **"Make prompt"** first, and upload the encoded prompt (a `.npz` file)
27
- """
 
 
 
1
  top_md = """
2
  # VALL-E X
 
 
 
3
  VALL-E X can synthesize high-quality personalized speech with only a 3-second enrolled recording of
4
  an unseen speaker as an acoustic prompt, even in another language for a monolingual speaker.<br>
5
  This implementation supports zero-shot, mono-lingual/cross-lingual text-to-speech functionality of three languages (English, Chinese, Japanese)<br>
 
21
  infer_from_prompt_md = """
22
  Faster than **"Infer from audio"**.<br>
23
  You need to **"Make prompt"** first, and upload the encoded prompt (a `.npz` file)
24
+ """
25
+
26
+ long_text_example = "Just a few years ago, there were no legions of deep learning scientists developing intelligent products and services at major companies and startups. When we entered the field, machine learning did not command headlines in daily newspapers. Our parents had no idea what machine learning was, let alone why we might prefer it to a career in medicine or law. Machine learning was a blue skies academic discipline whose industrial significance was limited to a narrow set of real-world applications, including speech recognition and computer vision. Moreover, many of these applications required so much domain knowledge that they were often regarded as entirely separate areas for which machine learning was one small component. At that time, neural networks—the predecessors of the deep learning methods that we focus on in this book—were generally regarded as outmoded."
macros.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Model hyper-parameters shared by the generation and prompt-making utilities.
NUM_LAYERS, NUM_HEAD, N_DIM = 12, 16, 1024
PREFIX_MODE = 1
NUM_QUANTIZERS = 8
SAMPLE_RATE = 24000

# Language id -> tag that is wrapped around the text before tokenization.
lang2token = dict(zh="[ZH]", ja="[JA]", en="[EN]", mix="")

# Language id -> integer code stored in prompt .npz files.
lang2code = dict(zh=0, ja=1, en=2)

# Reverse lookups, derived from the forward tables so they cannot drift apart.
token2lang = {token: lang for lang, token in lang2token.items()}
code2lang = {code: lang for lang, code in lang2code.items()}

# UI dropdown label -> language tag.
langdropdown2token = {
    'English': lang2token['en'],
    '中文': lang2token['zh'],
    '日本語': lang2token['ja'],
    'Mix': lang2token['mix'],
}
presets/acou_1.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:470ce66fc24a2d14e162343381f7d93ef0a3af51edf5fd37240c21f492b4e769
3
+ size 15650
presets/acou_2.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec1c5328751cadeed5356d4264759799ad96d33ea8dd4f8a3d0a80dd8ddb0e74
3
+ size 15426
presets/acou_3.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03f241b094a32b3f542e74374183c6d15e8b70ae73ceeafb11bfd4ee6b8b4a3a
3
+ size 15410
presets/acou_4.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52b96f32863f13f84cf7ac4a27d2bc95cea70c350a037f4d1890b20b8da9501e
3
+ size 15506
presets/amused.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df3e882f3a62805b9aaf300d81822cd4eddeafee480503b7b78e32be2085fb11
3
+ size 20882
presets/anger.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:959cec6dc0b30219db0d70cdd165fe00bbdc098165cf9d67ccdd1ecf7a5da5be
3
+ size 22090
presets/babara.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8106b2a98c3f70587f23ab46ed5bf73b1c9a770481c3620ab140bd3256010376
3
+ size 11526
presets/bronya_1.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02eaada2c3d58866c813887ed9f871587ef5a7e976abc23382ce46a17b208001
3
+ size 18106
presets/dingzhen.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d19167c65eefef5e42dfaa1919ff5149ca0a93cb052396a47d1f42f9865f5f8
3
+ size 18154
presets/disgust.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4443f0a395072700f2ec6101dbf2ad9d28968aa3e5809e384ea131832f894d7f
3
+ size 39386
presets/emo_amused.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38be2ea16dc79beae68b6c885d99d4dad516acbd88ed5ed6991dd97301f2f30b
3
+ size 15378
presets/emo_anger.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3261c3bdd5b7b4be9783d9293ee3d871be9d9d791f2b3a8bf62a1a0ee0ed93e6
3
+ size 15434
presets/emo_neutral.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2188c4154692316ed7c0edee3aa3dd8678be36f355ee2b8c8a3a6412c3673ba9
3
+ size 15578
presets/emo_sleepy.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a53255890beaf4ed339e1967f0837fdb87c34c9f7e18bf77cd4b08eba176963
3
+ size 15370
presets/en2zh_tts_1.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d4de4ed055448ea54f7b40091afae565197f960d954279035ac537ea5a01bc4
3
+ size 44354
presets/en2zh_tts_2.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcc066ea104daa27d1552fe76574d09359d56fa892241581cc19e931a696eca9
3
+ size 24178
presets/en2zh_tts_3.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7468944e6d0ed7f2da033e8037be07dbafc76bd1ed7c0f5996d85ff45aacda11
3
+ size 21410
presets/en2zh_tts_4.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fd8d0914e74769114310e9504d68d6b7b0c6aacd46763478cbfd4f9631ad54a
3
+ size 43826
presets/fuxuan_2.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17b90388d179ae309e1f577c28c3f10d9bed73c6ccbffdd829c00568eb3941e6
3
+ size 50330
presets/librispeech_1.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:415b244e43b45291fd651d71f15bb7a31c244e2054988c436f6bbc04465c6099
3
+ size 15650
presets/librispeech_2.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd74e77370248b025321b9dbae25b1572f13f98da63255e384d382d2b0c78227
3
+ size 15418
presets/librispeech_3.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1eceb3f4cc0f3a8856b5e3b5f1ca28c428d75305b1452da1ecf4013bc358ccaa
3
+ size 15634
presets/librispeech_4.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3939dde39f5e65bc01f5eba9acb7b8329465aaca3c38edf1b240aa714e687960
3
+ size 15594
presets/neutral.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8a63993526ffdc788a711b512d07a8b1c816151a1edb63913d0bfb48c2ea380
3
+ size 21050
presets/paimon_1.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:452d5e0cd3a060db521bd65a16af818a6177f357801402aa5581eceb2c24039a
3
+ size 13762
presets/rosalia.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af87ebe283bbb7b527c6c0ff0a02a315416485677fe23330040c2766fa9af919
3
+ size 11414
presets/seel.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44ad2e900df3625f9753e949dc5a7d8479c4091e24cb18cbf46e34e29498d952
3
+ size 13554
presets/sleepiness.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0f866a278a10c7b6b494fb62589a9d8fef778ccf272df3b0d5510f45b243b5c
3
+ size 33218
presets/vctk_1.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c9df2ea8c2bc919c0ac50f8e05950bb4e831de69b33a7fb12d584da5b2512f2
3
+ size 15530
presets/vctk_2.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc84744435a304b3e700b8b1ab94c3b891db3056bd55a0f9dd99eff284016efa
3
+ size 15458
presets/vctk_3.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec0d528c6ae9c8f32b02ca6b57aa565b9fe63f401fd04f2632ed7e536699b9ac
3
+ size 15450
presets/vctk_4.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ff2b71254ae00be6e42ad206c7616d168bd41582837e9eeb4d6cd669bd0b140
3
+ size 15330
presets/yaesakura.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b388a18d286b4ba13d45bae373a716c0010dc40ae9c940d53b5a04cbc64e95ff
3
+ size 12442
presets/zh2en_tts_1.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07bff150ad145f9b06f0e7cbf9b0ee4d9e926600efa0d129bd831c8b2993c2b0
3
+ size 23546
presets/zh2en_tts_2.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0257d0782578c7813c3f43b5e93c0e681f9ea42fe76775d5a4f4fea64609b03e
3
+ size 20170
presets/zh2en_tts_3.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5da48e060d15f391767bffe1d528bfbc782a562413feed2e9bd2cafa82bf644a
3
+ size 17906
presets/zh2en_tts_4.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bda7a70ed9b03d8f1ff99d2444ea1df476a8deaf75633aa3b3f6cf3f45ae7e5e
3
+ size 33682
requirements.txt CHANGED
@@ -1,8 +1,10 @@
 
1
  numpy
2
- torch
3
- torchaudio
4
  tokenizers
5
  encodec
 
6
  unidecode
7
  pyopenjtalk
8
  pypinyin
@@ -11,4 +13,8 @@ cn2an
11
  jieba
12
  eng_to_ipa
13
  jieba
14
- openai-whisper
 
 
 
 
 
1
+ soundfile
2
  numpy
3
+ torch==2.0.1
4
+ torchvision==0.15.2
5
  tokenizers
6
  encodec
7
+ langid
8
  unidecode
9
  pyopenjtalk
10
  pypinyin
 
13
  jieba
14
  eng_to_ipa
15
  jieba
16
+ SudachiPy
17
+ openai-whisper
18
+ phonemizer
19
+ matplotlib
20
+ gradio
utils/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (564 Bytes). View file
 
utils/g2p/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (3.02 kB). View file
 
utils/g2p/__pycache__/cleaners.cpython-38.pyc ADDED
Binary file (1.95 kB). View file
 
utils/g2p/__pycache__/english.cpython-38.pyc ADDED
Binary file (4.85 kB). View file
 
utils/g2p/__pycache__/japanese.cpython-38.pyc ADDED
Binary file (4.44 kB). View file
 
utils/g2p/__pycache__/mandarin.cpython-38.pyc ADDED
Binary file (6.37 kB). View file
 
utils/g2p/__pycache__/symbols.cpython-38.pyc ADDED
Binary file (434 Bytes). View file
 
utils/generation.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import gdown
4
+ import logging
5
+ import langid
6
+ langid.set_languages(['en', 'zh', 'ja'])
7
+
8
+ import pathlib
9
+ import platform
10
# Alias the pathlib class of the *other* operating system to the native one,
# keeping the replaced class in `temp`.
# NOTE(review): presumably this lets objects pickled on the other OS (e.g. a
# checkpoint containing Path instances) be loaded here -- confirm.
_system_name = platform.system().lower()
if _system_name == 'linux':
    temp = pathlib.WindowsPath
    pathlib.WindowsPath = pathlib.PosixPath
elif _system_name == 'windows':
    temp = pathlib.PosixPath
    pathlib.PosixPath = pathlib.WindowsPath
16
+
17
+ import numpy as np
18
+ from data.tokenizer import (
19
+ AudioTokenizer,
20
+ tokenize_audio,
21
+ )
22
+ from data.collation import get_text_token_collater
23
+ from models.vallex import VALLE
24
+ from utils.g2p import PhonemeBpeTokenizer
25
+ from utils.sentence_cutter import split_text_into_sentences
26
+
27
+ from macros import *
28
+
29
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda", 0) if torch.cuda.is_available() else torch.device("cpu")

# Google Drive share link of the pretrained checkpoint.
url = 'https://drive.google.com/file/d/10gdQWvP-K_e1undkvv0p2b7SU6I4Egyl/view?usp=sharing'

# Where the checkpoint is stored locally.
checkpoints_dir = "./checkpoints/"
model_checkpoint_name = "vallex-checkpoint.pt"

# Both are assigned by preload_models().
model = None
codec = None

# Text front-end objects shared by the generation functions.
text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
text_collater = get_text_token_collater()
45
+
46
def preload_models():
    """Download the checkpoint if necessary and initialise the global model and codec.

    Creates ``checkpoints_dir`` when missing, downloads the VALL-E X checkpoint
    with ``gdown`` if it is not already on disk, builds the ``VALLE`` model on
    ``device``, loads the weights, switches the model to eval mode and creates
    the ``AudioTokenizer``. The results are stored in the module globals
    ``model`` and ``codec``.

    Raises:
        RuntimeError: If the checkpoint's state dict does not match the model.
    """
    global model, codec
    # exist_ok avoids the check-then-create race and also handles nested paths.
    os.makedirs(checkpoints_dir, exist_ok=True)
    checkpoint_path = os.path.join(checkpoints_dir, model_checkpoint_name)
    if not os.path.exists(checkpoint_path):
        gdown.download(id="10gdQWvP-K_e1undkvv0p2b7SU6I4Egyl", output=checkpoint_path, quiet=False)

    # VALL-E
    model = VALLE(
        N_DIM,
        NUM_HEAD,
        NUM_LAYERS,
        norm_first=True,
        add_prenet=False,
        prefix_mode=PREFIX_MODE,
        share_embedding=True,
        nar_scale_factor=1.0,
        prepend_bos=True,
        num_quantizers=NUM_QUANTIZERS,
    ).to(device)
    # Load onto CPU first; load_state_dict copies into the model's own device.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    missing_keys, unexpected_keys = model.load_state_dict(
        checkpoint["model"], strict=True
    )
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if missing_keys:
        raise RuntimeError(f"Checkpoint is missing keys: {missing_keys}")
    model.eval()

    # Encodec
    codec = AudioTokenizer(device)
73
+
74
@torch.no_grad()
def generate_audio(text, prompt=None, language='auto', accent='no-accent'):
    """Synthesize speech for a single piece of text.

    Args:
        text: Text to speak. Newlines are removed and surrounding spaces stripped.
        prompt: Optional voice prompt. Either a path to an existing ``.npz``
            file, or a name looked up as ``./presets/<name>.npz`` and then
            ``./customs/<name>.npz``. ``None`` generates without a prompt.
        language: A key of ``lang2token``, or ``"auto"`` to classify ``text``
            with ``langid``.
        accent: ``"no-accent"`` or a key of ``langdropdown2token``.

    Returns:
        ``samples[0][0]`` of the codec's decoded output, as a NumPy array.

    Raises:
        ValueError: If ``prompt`` is given but no matching file is found.
        KeyError: If ``language`` or ``accent`` is not a known key.
    """
    global model, codec, text_tokenizer, text_collater
    text = text.replace("\n", "").strip(" ")
    # detect language
    if language == "auto":
        language = langid.classify(text)[0]
    lang_token = lang2token[language]
    lang = token2lang[lang_token]
    text = lang_token + text + lang_token

    # load prompt
    if prompt is not None:
        prompt_path = prompt
        if not os.path.exists(prompt_path):
            prompt_path = "./presets/" + prompt + ".npz"
        if not os.path.exists(prompt_path):
            prompt_path = "./customs/" + prompt + ".npz"
        if not os.path.exists(prompt_path):
            raise ValueError(f"Cannot find prompt {prompt}")
        # NpzFile keeps the archive open; the context manager closes it once
        # the arrays have been read into memory.
        with np.load(prompt_path) as prompt_data:
            audio_prompts = prompt_data['audio_tokens']
            text_prompts = prompt_data['text_tokens']
            lang_pr = code2lang[int(prompt_data['lang_code'])]

        # numpy to tensor
        audio_prompts = torch.tensor(audio_prompts).type(torch.int32).to(device)
        text_prompts = torch.tensor(text_prompts).type(torch.int32)
    else:
        # Empty prompts: zero-length along the token/frame axis.
        audio_prompts = torch.zeros([1, 0, NUM_QUANTIZERS]).type(torch.int32).to(device)
        text_prompts = torch.zeros([1, 0]).type(torch.int32)
        lang_pr = lang if lang != 'mix' else 'en'

    enroll_x_lens = text_prompts.shape[-1]
    logging.info(f"synthesize text: {text}")
    phone_tokens, langs = text_tokenizer.tokenize(text=f"_{text}".strip())
    text_tokens, text_tokens_lens = text_collater(
        [
            phone_tokens
        ]
    )
    # Prepend the prompt's text tokens and account for them in the length.
    text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
    text_tokens_lens += enroll_x_lens
    # accent control
    lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
    encoded_frames = model.inference(
        text_tokens.to(device),
        text_tokens_lens.to(device),
        audio_prompts,
        enroll_x_lens=enroll_x_lens,
        top_k=-100,
        temperature=1,
        prompt_language=lang_pr,
        text_language=langs if accent == "no-accent" else lang,
    )
    samples = codec.decode(
        [(encoded_frames.transpose(2, 1), None)]
    )

    return samples[0][0].cpu().numpy()
135
+
136
@torch.no_grad()
def generate_audio_from_long_text(text, prompt=None, language='auto', accent='no-accent', mode='sliding-window'):
    """Synthesize speech for long text, one sentence at a time.

    Two modes are available:

    fixed-prompt: keep using the prompt the user provided for every sentence.
    sliding-window: after each sentence, randomly (probability 0.5) either use
        that sentence's output as the prompt for the next sentence or fall back
        to the original prompt. This has some concern on speaker maintenance.

    If no prompt is given, ``sliding-window`` is always used.

    Args:
        text: Text to speak; it is split with ``split_text_into_sentences``.
        prompt: Optional voice prompt. Either a path to an existing ``.npz``
            file, or a name looked up as ``./presets/<name>.npz`` and then
            ``./customs/<name>.npz``. ``None`` or ``""`` means no prompt.
        language: A key of ``lang2token``, or ``"auto"`` to classify the whole
            ``text`` with ``langid``.
        accent: ``"no-accent"`` or a key of ``langdropdown2token``.
        mode: ``"fixed-prompt"`` or ``"sliding-window"``.

    Returns:
        ``samples[0][0]`` of the codec's decoded output for the concatenated
        frames of all sentences, as a NumPy array.

    Raises:
        ValueError: If the prompt cannot be found or ``mode`` is unknown.
        KeyError: If ``language`` or ``accent`` is not a known key.
    """
    global model, codec, text_tokenizer, text_collater
    has_prompt = prompt is not None and prompt != ""
    if not has_prompt:
        mode = 'sliding-window'  # If no prompt is given, use sliding-window mode
    sentences = split_text_into_sentences(text)
    # detect language
    if language == "auto":
        language = langid.classify(text)[0]

    # if initial prompt is given, encode it
    if has_prompt:
        prompt_path = prompt
        if not os.path.exists(prompt_path):
            prompt_path = "./presets/" + prompt + ".npz"
        if not os.path.exists(prompt_path):
            prompt_path = "./customs/" + prompt + ".npz"
        if not os.path.exists(prompt_path):
            raise ValueError(f"Cannot find prompt {prompt}")
        # NpzFile keeps the archive open; the context manager closes it once
        # the arrays have been read into memory.
        with np.load(prompt_path) as prompt_data:
            audio_prompts = prompt_data['audio_tokens']
            text_prompts = prompt_data['text_tokens']
            lang_pr = code2lang[int(prompt_data['lang_code'])]

        # numpy to tensor
        audio_prompts = torch.tensor(audio_prompts).type(torch.int32).to(device)
        text_prompts = torch.tensor(text_prompts).type(torch.int32)
    else:
        # Empty prompts: zero-length along the token/frame axis.
        audio_prompts = torch.zeros([1, 0, NUM_QUANTIZERS]).type(torch.int32).to(device)
        text_prompts = torch.zeros([1, 0]).type(torch.int32)
        lang_pr = language if language != 'mix' else 'en'

    if mode not in ('fixed-prompt', 'sliding-window'):
        raise ValueError(f"No such mode {mode}")
    sliding_window = mode == 'sliding-window'

    # Accumulates the generated frames of every sentence along the last axis.
    complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
    original_audio_prompts = audio_prompts
    original_text_prompts = text_prompts
    for sentence in sentences:
        sentence = sentence.replace("\n", "").strip(" ")
        if sentence == "":
            continue
        lang_token = lang2token[language]
        lang = token2lang[lang_token]
        sentence = lang_token + sentence + lang_token

        enroll_x_lens = text_prompts.shape[-1]
        logging.info(f"synthesize text: {sentence}")
        phone_tokens, langs = text_tokenizer.tokenize(text=f"_{sentence}".strip())
        text_tokens, text_tokens_lens = text_collater(
            [
                phone_tokens
            ]
        )
        # Prepend the current prompt's text tokens and count them in the length.
        text_tokens = torch.cat([text_prompts, text_tokens], dim=-1)
        text_tokens_lens += enroll_x_lens
        # accent control
        lang = lang if accent == "no-accent" else token2lang[langdropdown2token[accent]]
        encoded_frames = model.inference(
            text_tokens.to(device),
            text_tokens_lens.to(device),
            audio_prompts,
            enroll_x_lens=enroll_x_lens,
            top_k=-100,
            temperature=1,
            prompt_language=lang_pr,
            text_language=langs if accent == "no-accent" else lang,
        )
        complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)

        if sliding_window:
            # Coin flip: continue from the sentence just generated, or reset
            # to the original prompt.
            if torch.rand(1) < 0.5:
                # NOTE(review): this slices the last axis of encoded_frames,
                # which looks like the quantizer axis -- confirm intent.
                audio_prompts = encoded_frames[:, :, -NUM_QUANTIZERS:]
                text_prompts = text_tokens[:, enroll_x_lens:]
            else:
                audio_prompts = original_audio_prompts
                text_prompts = original_text_prompts

    samples = codec.decode(
        [(complete_tokens, None)]
    )
    return samples[0][0].cpu().numpy()
utils/prompt_making.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torchaudio
4
+ import logging
5
+ import langid
6
+ import whisper
7
+ langid.set_languages(['en', 'zh', 'ja'])
8
+
9
+ import numpy as np
10
+ from data.tokenizer import (
11
+ AudioTokenizer,
12
+ tokenize_audio,
13
+ )
14
+ from data.collation import get_text_token_collater
15
+ from utils.g2p import PhonemeBpeTokenizer
16
+
17
+ from macros import *
18
+
19
# Text front-end objects used when tokenizing a prompt's transcript.
text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
text_collater = get_text_token_collater()

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda", 0) if torch.cuda.is_available() else torch.device("cpu")

# Audio codec, created at import time.
codec = AudioTokenizer(device)

# Whisper is loaded lazily by make_transcript() the first time it is needed.
whisper_model = None
29
+
30
@torch.no_grad()
def transcribe_one(model, audio_path):
    """Detect the language of an audio file and transcribe it with Whisper.

    Args:
        model: A loaded Whisper model.
        audio_path: Path of the audio file to transcribe.

    Returns:
        A ``(lang, text)`` tuple: the most probable language key reported by
        ``model.detect_language`` and the decoded text. A ``"."`` is appended
        when the text does not already end with one of the punctuation marks
        checked below (this includes the case of an empty transcription).
    """
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # detect the spoken language
    _, probs = model.detect_language(mel)
    lang = max(probs, key=probs.get)
    print(f"Detected language: {lang}")

    # decode the audio; half precision is only used off-CPU
    options = whisper.DecodingOptions(
        temperature=1.0,
        best_of=5,
        fp16=device != torch.device("cpu"),
        sample_len=150,
    )
    result = whisper.decode(model, mel, options)

    # print the recognized text
    print(result.text)

    text_pr = result.text
    # endswith() is safe on an empty string, unlike indexing with [-1].
    if not text_pr.strip(" ").endswith(tuple("?!.,。,?!。、")):
        text_pr += "."
    return lang, text_pr
54
+
55
def make_prompt(name, audio_prompt_path, transcript=None):
    """Encode an audio clip (and its transcript) into a reusable prompt file.

    Loads the audio, obtains a language-tagged transcript via
    ``make_transcript``, tokenizes both audio and text, and saves them to
    ``./customs/<name>.npz`` under the keys ``audio_tokens``, ``text_tokens``
    and ``lang_code``.

    Args:
        name: Base name of the prompt file to create.
        audio_prompt_path: Path of the audio file to encode.
        transcript: Optional transcript of the audio; when ``None`` or empty,
            ``make_transcript`` falls back to Whisper.

    Raises:
        ValueError: If the audio is longer than 15 seconds.
    """
    global text_collater, text_tokenizer, codec
    wav_pr, sr = torchaudio.load(audio_prompt_path)
    # check length
    duration = wav_pr.size(-1) / sr
    if duration > 15:
        raise ValueError(f"Prompt too long, expect length below 15 seconds, got {duration} seconds.")
    # Downmix any multi-channel audio to mono (first axis is the channel axis).
    if wav_pr.size(0) > 1:
        wav_pr = wav_pr.mean(0, keepdim=True)
    text_pr, lang_pr = make_transcript(name, wav_pr, sr, transcript)

    # tokenize audio
    encoded_frames = tokenize_audio(codec, (wav_pr, sr))
    audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()

    # tokenize text
    phonemes, langs = text_tokenizer.tokenize(text=f"{text_pr}".strip())
    text_tokens, enroll_x_lens = text_collater(
        [
            phonemes
        ]
    )

    message = f"Detected language: {lang_pr}\n Detected text {text_pr}\n"
    logging.info(message)

    # save as npz file
    os.makedirs("./customs/", exist_ok=True)
    save_path = os.path.join("./customs/", f"{name}.npz")
    np.savez(save_path, audio_tokens=audio_tokens, text_tokens=text_tokens, lang_code=lang2code[lang_pr])
    logging.info(f"Successful. Prompt saved to {save_path}")
83
+
84
+
85
def make_transcript(name, wav, sr, transcript=None):
    """Return a language-tagged transcript and the language of an audio clip.

    When ``transcript`` is ``None`` or empty, the audio is written to a
    temporary ``./prompts/<name>.wav`` file and transcribed with Whisper
    (loaded lazily into the global ``whisper_model``). Otherwise the given
    transcript is used and its language is classified with ``langid``.

    Args:
        name: Base name used for the temporary wav file.
        wav: Audio samples. Float tensors are normalised in place when their
            peak exceeds 1; other inputs are converted with ``torch.tensor``.
        sr: Sample rate passed to ``torchaudio.save``.
        transcript: Optional transcript text.

    Returns:
        A ``(text, lang)`` tuple, where ``text`` is the transcript wrapped in
        the language token on both sides and ``lang`` is the language key.

    Raises:
        KeyError: If the detected language is not a key of ``lang2token``.
    """
    if not isinstance(wav, torch.FloatTensor):
        wav = torch.tensor(wav)
    if wav.abs().max() > 1:
        # In-place on purpose: the caller's tensor is normalised as well.
        wav /= wav.abs().max()
    if wav.size(-1) == 2:
        wav = wav.mean(-1, keepdim=False)
    if wav.ndim == 1:
        wav = wav.unsqueeze(0)
    assert wav.ndim and wav.size(0) == 1
    if transcript is None or transcript == "":
        logging.info("Transcript not given, using Whisper...")
        global whisper_model
        if whisper_model is None:
            whisper_model = whisper.load_model("medium")
        whisper_model.to(device)
        os.makedirs("./prompts/", exist_ok=True)
        temp_wav_path = f"./prompts/{name}.wav"
        try:
            torchaudio.save(temp_wav_path, wav, sr)
            lang, text = transcribe_one(whisper_model, temp_wav_path)
        finally:
            # Always clean up the temp file and move Whisper off the device,
            # even when saving or transcription fails.
            if os.path.exists(temp_wav_path):
                os.remove(temp_wav_path)
            whisper_model.cpu()
        lang_token = lang2token[lang]
        text = lang_token + text + lang_token
    else:
        text = transcript
        lang, _ = langid.classify(text)
        lang_token = lang2token[lang]
        text = lang_token + text + lang_token

    torch.cuda.empty_cache()
    return text, lang