Plachta committed on
Commit
b97852e
·
1 Parent(s): 982679d
Files changed (1) hide show
  1. app.py +3 -26
app.py CHANGED
@@ -35,14 +35,7 @@ from examples import *
35
 
36
  import gradio as gr
37
  import whisper
38
- import multiprocessing
39
 
40
- thread_count = multiprocessing.cpu_count()
41
-
42
- print("Use",thread_count,"cpu cores for computing")
43
-
44
- torch.set_num_threads(thread_count)
45
- torch.set_num_interop_threads(thread_count)
46
  torch._C._jit_set_profiling_executor(False)
47
  torch._C._jit_set_profiling_mode(False)
48
  torch._C._set_graph_executor_optimize(False)
@@ -66,11 +59,12 @@ model = VALLE(
66
  nar_scale_factor=1.0,
67
  prepend_bos=True,
68
  num_quantizers=NUM_QUANTIZERS,
69
- )
70
  checkpoint = torch.load("./epoch-10.pt", map_location='cpu')
71
  missing_keys, unexpected_keys = model.load_state_dict(
72
  checkpoint["model"], strict=True
73
  )
 
74
  assert not missing_keys
75
  model.eval()
76
 
@@ -78,7 +72,7 @@ model.eval()
78
  audio_tokenizer = AudioTokenizer(device)
79
 
80
  # ASR
81
- whisper_model = whisper.load_model("medium").cpu()
82
 
83
  # Voice Presets
84
  preset_list = os.walk("./presets/").__next__()[2]
@@ -166,7 +160,6 @@ def make_npz_prompt(name, uploaded_audio, recorded_audio, transcript_content):
166
 
167
  def make_prompt(name, wav, sr, save=True):
168
  global whisper_model
169
- whisper_model.to(device)
170
  if not isinstance(wav, torch.FloatTensor):
171
  wav = torch.tensor(wav)
172
  if wav.abs().max() > 1:
@@ -186,8 +179,6 @@ def make_prompt(name, wav, sr, save=True):
186
  os.remove(f"./prompts/{name}.wav")
187
  os.remove(f"./prompts/{name}.txt")
188
 
189
- whisper_model.cpu()
190
- torch.cuda.empty_cache()
191
  return text, lang
192
 
193
  @torch.no_grad()
@@ -195,7 +186,6 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt,
195
  if len(text) > 150:
196
  return "Rejected, Text too long (should be less than 150 characters)", None
197
  global model, text_collater, text_tokenizer, audio_tokenizer
198
- model.to(device)
199
  audio_prompt = audio_prompt if audio_prompt is not None else record_audio_prompt
200
  sr, wav_pr = audio_prompt
201
  if len(wav_pr) / sr > 15:
@@ -224,9 +214,6 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt,
224
  lang = token2lang[lang_token]
225
  text = lang_token + text + lang_token
226
 
227
- # onload model
228
- model.to(device)
229
-
230
  # tokenize audio
231
  encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
232
  audio_prompts = encoded_frames[0][0].transpose(2, 1).to(device)
@@ -265,10 +252,6 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt,
265
  [(encoded_frames.transpose(2, 1), None)]
266
  )
267
 
268
- # offload model
269
- model.to('cpu')
270
- torch.cuda.empty_cache()
271
-
272
  message = f"text prompt: {text_pr}\nsythesized text: {text}"
273
  return message, (24000, samples[0][0].cpu().numpy())
274
 
@@ -277,7 +260,6 @@ def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
277
  if len(text) > 150:
278
  return "Rejected, Text too long (should be less than 150 characters)", None
279
  clear_prompts()
280
- model.to(device)
281
  # text to synthesize
282
  if language == 'auto-detect':
283
  lang_token = lang2token[langid.classify(text)[0]]
@@ -325,8 +307,6 @@ def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
325
  samples = audio_tokenizer.decode(
326
  [(encoded_frames.transpose(2, 1), None)]
327
  )
328
- model.to('cpu')
329
- torch.cuda.empty_cache()
330
 
331
  message = f"sythesized text: {text}"
332
  return message, (24000, samples[0][0].cpu().numpy())
@@ -344,7 +324,6 @@ def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='n
344
  return "Rejected, Text too long (should be less than 1000 characters)", None
345
  mode = 'fixed-prompt'
346
  global model, audio_tokenizer, text_tokenizer, text_collater
347
- model.to(device)
348
  if (prompt is None or prompt == "") and preset_prompt == "":
349
  mode = 'sliding-window' # If no prompt is given, use sliding-window mode
350
  sentences = split_text_into_sentences(text)
@@ -416,7 +395,6 @@ def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='n
416
  samples = audio_tokenizer.decode(
417
  [(complete_tokens, None)]
418
  )
419
- model.to('cpu')
420
  message = f"Cut into {len(sentences)} sentences"
421
  return message, (24000, samples[0][0].cpu().numpy())
422
  elif mode == "sliding-window":
@@ -463,7 +441,6 @@ def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='n
463
  samples = audio_tokenizer.decode(
464
  [(complete_tokens, None)]
465
  )
466
- model.to('cpu')
467
  message = f"Cut into {len(sentences)} sentences"
468
  return message, (24000, samples[0][0].cpu().numpy())
469
  else:
 
35
 
36
  import gradio as gr
37
  import whisper
 
38
 
 
 
 
 
 
 
39
  torch._C._jit_set_profiling_executor(False)
40
  torch._C._jit_set_profiling_mode(False)
41
  torch._C._set_graph_executor_optimize(False)
 
59
  nar_scale_factor=1.0,
60
  prepend_bos=True,
61
  num_quantizers=NUM_QUANTIZERS,
62
+ ).to(device)
63
  checkpoint = torch.load("./epoch-10.pt", map_location='cpu')
64
  missing_keys, unexpected_keys = model.load_state_dict(
65
  checkpoint["model"], strict=True
66
  )
67
+ del checkpoint
68
  assert not missing_keys
69
  model.eval()
70
 
 
72
  audio_tokenizer = AudioTokenizer(device)
73
 
74
  # ASR
75
+ whisper_model = whisper.load_model("medium").to(device)
76
 
77
  # Voice Presets
78
  preset_list = os.walk("./presets/").__next__()[2]
 
160
 
161
  def make_prompt(name, wav, sr, save=True):
162
  global whisper_model
 
163
  if not isinstance(wav, torch.FloatTensor):
164
  wav = torch.tensor(wav)
165
  if wav.abs().max() > 1:
 
179
  os.remove(f"./prompts/{name}.wav")
180
  os.remove(f"./prompts/{name}.txt")
181
 
 
 
182
  return text, lang
183
 
184
  @torch.no_grad()
 
186
  if len(text) > 150:
187
  return "Rejected, Text too long (should be less than 150 characters)", None
188
  global model, text_collater, text_tokenizer, audio_tokenizer
 
189
  audio_prompt = audio_prompt if audio_prompt is not None else record_audio_prompt
190
  sr, wav_pr = audio_prompt
191
  if len(wav_pr) / sr > 15:
 
214
  lang = token2lang[lang_token]
215
  text = lang_token + text + lang_token
216
 
 
 
 
217
  # tokenize audio
218
  encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
219
  audio_prompts = encoded_frames[0][0].transpose(2, 1).to(device)
 
252
  [(encoded_frames.transpose(2, 1), None)]
253
  )
254
 
 
 
 
 
255
  message = f"text prompt: {text_pr}\nsythesized text: {text}"
256
  return message, (24000, samples[0][0].cpu().numpy())
257
 
 
260
  if len(text) > 150:
261
  return "Rejected, Text too long (should be less than 150 characters)", None
262
  clear_prompts()
 
263
  # text to synthesize
264
  if language == 'auto-detect':
265
  lang_token = lang2token[langid.classify(text)[0]]
 
307
  samples = audio_tokenizer.decode(
308
  [(encoded_frames.transpose(2, 1), None)]
309
  )
 
 
310
 
311
  message = f"sythesized text: {text}"
312
  return message, (24000, samples[0][0].cpu().numpy())
 
324
  return "Rejected, Text too long (should be less than 1000 characters)", None
325
  mode = 'fixed-prompt'
326
  global model, audio_tokenizer, text_tokenizer, text_collater
 
327
  if (prompt is None or prompt == "") and preset_prompt == "":
328
  mode = 'sliding-window' # If no prompt is given, use sliding-window mode
329
  sentences = split_text_into_sentences(text)
 
395
  samples = audio_tokenizer.decode(
396
  [(complete_tokens, None)]
397
  )
 
398
  message = f"Cut into {len(sentences)} sentences"
399
  return message, (24000, samples[0][0].cpu().numpy())
400
  elif mode == "sliding-window":
 
441
  samples = audio_tokenizer.decode(
442
  [(complete_tokens, None)]
443
  )
 
444
  message = f"Cut into {len(sentences)} sentences"
445
  return message, (24000, samples[0][0].cpu().numpy())
446
  else: