ggoknar committed on
Commit d3d83c1 · 1 Parent(s): f34dc34

Fixed STT to TTS and uses streaming TTS

Files changed (1)
  1. app.py +146 -58
app.py CHANGED
@@ -19,6 +19,7 @@ from scipy.io.wavfile import write
 from pydub import AudioSegment
 import ffmpeg
 
+import io, wave
 import librosa
 import torchaudio
 from TTS.api import TTS
@@ -31,7 +32,6 @@ from TTS.utils.generic_utils import get_user_data_dir
 # Could not make play-audio-next work seamlessly on current Gradio with autoplay, so this is a workaround
 AUDIO_WAIT_MODIFIER = float(os.environ.get("AUDIO_WAIT_MODIFIER", 1))
 
-
 # This will trigger downloading model
 print("Downloading if not downloaded Coqui XTTS V1")
 tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")
@@ -68,7 +68,7 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 # will use api to restart space on an unrecoverable error
 api = HfApi(token=HF_TOKEN)
 
-repo_id = "ylacombe/voice-chat-with-lama"
+repo_id = "coqui/voice-chat-with-lama"
 
 default_system_message = """
 You are Mistral, a large language model trained and provided by Mistral, architecture of you is decoder-based LM. Your voice backend or text to speech TTS backend is provided via Coqui technology. You are right now served on Huggingface spaces.
@@ -97,12 +97,15 @@ import numpy as np
 from gradio_client import Client
 from huggingface_hub import InferenceClient
 
-
+WHISPER_TIMEOUT = int(os.environ.get("WHISPER_TIMEOUT", 30))
 # This client is down
 # whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
 # Replacement whisper client, it may be time limited
 whisper_client = Client("https://sanchit-gandhi-whisper-jax.hf.space")
-text_client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")
+text_client = InferenceClient(
+    "mistralai/Mistral-7B-Instruct-v0.1",
+    timeout=WHISPER_TIMEOUT,
+)
 
 
 ###### COQUI TTS FUNCTIONS ######
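For orientation, a minimal sketch of how text_client is typically consumed downstream; the app's real prompt assembly and generation kwargs live in generate(), which this hunk does not touch, so the template and parameters below are illustrative only:

def ask_mistral(user_text, system_message=default_system_message):
    # Illustrative [INST] template; the actual formatting is done in generate()
    prompt = f"<s>[INST] {system_message}\n{user_text} [/INST]"
    output = ""
    # stream=True makes text_generation yield tokens as they are produced
    for token in text_client.text_generation(prompt, max_new_tokens=256, stream=True):
        output += token
    return output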
@@ -180,13 +183,17 @@ def generate(
 
 
 def transcribe(wav_path):
-    # get first element from whisper_jax and strip it to delete begin and end space
-    return whisper_client.predict(
-        wav_path,  # str (filepath or URL to file) in 'inputs' Audio component
-        "transcribe",  # str in 'Task' Radio component
-        False,  # return_timestamps=False for whisper-jax https://gist.github.com/sanchit-gandhi/781dd7003c5b201bfe16d28634c8d4cf#file-whisper_jax_endpoint-py
-        api_name="/predict",
-    )[0].strip()
+    try:
+        # get first element from whisper_jax and strip it to delete begin and end space
+        return whisper_client.predict(
+            wav_path,  # str (filepath or URL to file) in 'inputs' Audio component
+            "transcribe",  # str in 'Task' Radio component
+            False,  # return_timestamps=False for whisper-jax https://gist.github.com/sanchit-gandhi/781dd7003c5b201bfe16d28634c8d4cf#file-whisper_jax_endpoint-py
+            api_name="/predict",
+        )[0].strip()
+    except:
+        gr.Warning("There was a problem with Whisper endpoint, telling a joke for you.")
+        return "There was a problem with my voice, tell me joke"
 
 
 # Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.
@@ -257,6 +264,73 @@ def get_voice(prompt, language, latent_tuple, suffix="0"):
     return wav_filename
 
 
+def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
+    # This will create a wave header then append the frame input
+    # It should be first on a streaming wav file
+    # Other frames should not have it (else you will hear some artifacts at each chunk start)
+    wav_buf = io.BytesIO()
+    with wave.open(wav_buf, "wb") as vfout:
+        vfout.setnchannels(channels)
+        vfout.setsampwidth(sample_width)
+        vfout.setframerate(sample_rate)
+        vfout.writeframes(frame_input)
+
+    wav_buf.seek(0)
+    return wav_buf.read()
+
+
+def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
+    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
+    try:
+        t0 = time.time()
+        chunks = model.inference_stream(
+            prompt,
+            language,
+            gpt_cond_latent,
+            speaker_embedding,
+        )
+
+        first_chunk = True
+        for i, chunk in enumerate(chunks):
+            if first_chunk:
+                first_chunk_time = time.time() - t0
+                metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
+                first_chunk = False
+            print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+
+            # In case output is required to be multiple voice files
+            # out_file = f'{char}_{i}.wav'
+            # write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
+            # audio = AudioSegment.from_file(out_file)
+            # audio.export(out_file, format='wav')
+            # return out_file
+            # directly return chunk as bytes for streaming
+            chunk = chunk.detach().cpu().numpy().squeeze()
+            chunk = (chunk * 32767).astype(np.int16)
+
+            yield chunk.tobytes()
+
+    except RuntimeError as e:
+        if "device-side assert" in str(e):
+            # cannot do anything on cuda device-side error, need to restart
+            print(
+                f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
+                flush=True,
+            )
+            gr.Warning("Unhandled Exception encounter, please retry in a minute")
+            print("Cuda device-assert Runtime encountered need restart")
+
+            # HF Space specific.. This error is unrecoverable, need to restart space
+            api.restart_space(repo_id=repo_id)
+        else:
+            print("RuntimeError: non device-side assert error:", str(e))
+            gr.Warning("Unhandled Exception encounter, please retry in a minute")
+            return None
+        return None
+    except:
+        return None
+
+
 def get_sentence(history, system_prompt=""):
     history = [] if history is None else history
 
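The two helpers above implement the streaming trick: emit one RIFF/WAV header, then keep appending raw int16 PCM frames. A minimal standalone sketch of the same idea, with a synthetic sine-wave generator standing in for model.inference_stream and using wave_header_chunk() from above:

import numpy as np


def fake_pcm_chunks(sample_rate=24000, seconds=1.0, n_chunks=4):
    # Stand-in for get_voice_streaming(): yields int16 PCM bytes at 24 kHz mono
    t = np.linspace(0, seconds, int(sample_rate * seconds), endpoint=False)
    tone = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
    for part in np.array_split(tone, n_chunks):
        yield part.tobytes()


wav_bytes = wave_header_chunk()  # header first, exactly once
for pcm in fake_pcm_chunks():
    wav_bytes += pcm  # raw frames appended, no extra headers

# The declared data size in the header is stale, but most players
# (including the browser audio element Gradio uses) still play it.
with open("stream_demo.wav", "wb") as f:
    f.write(wav_bytes)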
 
@@ -322,55 +396,33 @@ def generate_speech(history):
         try:
             # generate speech using precomputed latents
             # This is not streaming but it will be fast
-            wav = get_voice(
+            # wav = get_voice(sentence, language, latent_map["Female_Voice"], suffix=len(wav_list))
+            audio_stream = get_voice_streaming(
                 sentence, language, latent_map["Female_Voice"], suffix=len(wav_list)
             )
-            wav_list.append(wav)
-            yield (gr.Audio.update(value=wav, autoplay=True), history)
-            wait_time = librosa.get_duration(path=wav)
+            wav_chunks = wave_header_chunk()
+            frame_length = 0
+            for chunk in audio_stream:
+                try:
+                    wav_chunks += chunk
+                    frame_length += len(chunk)
+                except:
+                    # hack to continue playing; sometimes the last chunk is empty, will be fixed in the next TTS
+                    continue
+
+            wav_list.append(wav_chunks)
+            yield (gr.Audio.update(value=wav_chunks, autoplay=True), history)
+
+            # Streaming wait time calculation
+            # audio_length = frame_length / sample_width / frame_rate
+            wait_time = frame_length / 2 / 24000 + 0.5  # plus 500ms
+
+            # for non-streaming
+            # wait_time = librosa.get_duration(path=wav)
+
             wait_time = AUDIO_WAIT_MODIFIER * wait_time
             print("Sleeping till audio end")
             time.sleep(wait_time)
-
-            # Replace inside try with below to use streaming, though not perfectly working as each it will multiprocess with mistral generation
-            # And would produce artifacts
-            # giving sentence suffix so we can merge all to single audio at end
-            # On mobile there is no autoplay support due to mobile security!
-            """
-            t_inference = time.time()
-            chunks = model.inference_stream(
-                sentence,
-                language,
-                latent_map["Female_Voice"][0],
-                latent_map["Female_Voice"][2],)
-
-            first_chunk=True
-            wav_chunks=[]
-            for i, chunk in enumerate(chunks):
-                if first_chunk:
-                    first_chunk_time = time.time() - t_inference
-                    print(f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n")
-                    first_chunk=False
-
-                wav_chunks.append(chunk)
-                print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
-
-                out_file = f'{i}.wav'
-                write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
-                audio = AudioSegment.from_file(out_file)
-                audio.export(out_file, format='wav')
-
-                yield (gr.Audio.update(value=out_file, autoplay=True), history)
-                # chunk sleep else next sentence may come in fast
-                wait_time = librosa.get_duration(path=out_file)
-                time.sleep(wait_time)
-
-            wav = torch.cat(wav_chunks, dim=0)
-            filename = f"output_{len(wav_list)}.wav"
-            torchaudio.save(filename, wav.squeeze().unsqueeze(0).cpu(), 24000)
-            wav_list.append(filename)
-            """
-
         except RuntimeError as e:
             if "device-side assert" in str(e):
                 # cannot do anything on cuda device-side error, need to restart
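The wait_time line above only converts the accumulated byte count into seconds of audio; the same arithmetic spelled out for 16-bit mono PCM at 24 kHz (the XTTS output format):

SAMPLE_WIDTH = 2  # bytes per int16 sample
SAMPLE_RATE = 24000  # frames per second


def estimated_play_time(frame_length_bytes, margin_s=0.5):
    # audio_length = bytes / bytes-per-frame / frames-per-second
    return frame_length_bytes / SAMPLE_WIDTH / SAMPLE_RATE + margin_s


# 96,000 bytes of PCM -> 48,000 frames -> 2.0 s of audio, so sleep about 2.5 s
print(estimated_play_time(96000))  # 2.5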
@@ -387,6 +439,40 @@
             print("RuntimeError: non device-side assert error:", str(e))
             raise e
 
+    # Spoken on autoplay for every sentence; now produce a concatenated file at the end
+    # requires pip install ffmpeg-python
+
+    # files_to_concat = [ffmpeg.input(w) for w in wav_list]
+    # combined_file_name = "combined.wav"
+    # ffmpeg.concat(*files_to_concat, v=0, a=1).output(combined_file_name).run(overwrite_output=True)
+    # final_audio.update(value=combined_file_name, visible=True)
+    # yield (combined_file_name, history)
+
+
+css = """
+.bot .chatbot p {
+    overflow: hidden; /* Ensures the content is not revealed until the animation */
+    //border-right: .15em solid orange; /* The typewriter cursor */
+    white-space: nowrap; /* Keeps the content on a single line */
+    margin: 0 auto; /* Gives that scrolling effect as the typing happens */
+    letter-spacing: .15em; /* Adjust as needed */
+    animation:
+        typing 3.5s steps(40, end),
+        blink-caret .75s step-end infinite;
+}
+
+/* The typing effect */
+@keyframes typing {
+    from { width: 0 }
+    to { width: 100% }
+}
+
+/* The typewriter cursor effect */
+@keyframes blink-caret {
+    from, to { border-color: transparent }
+    50% { border-color: orange; }
+}
+"""
 
 with gr.Blocks(title=title) as demo:
     gr.Markdown(DESCRIPTION)
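The commented-out block above sketches merging all per-sentence clips into one file with ffmpeg-python. A minimal standalone version of that idea; note that in the streaming path wav_list holds complete WAV byte strings rather than paths, so they are written to temporary files first (file names here are illustrative):

import ffmpeg  # pip install ffmpeg-python


def concat_wav_bytes(wav_byte_list, combined_file_name="combined.wav"):
    # Dump each in-memory WAV to disk, then concatenate audio-only streams (v=0, a=1)
    paths = []
    for i, wav_bytes in enumerate(wav_byte_list):
        path = f"sentence_{i}.wav"
        with open(path, "wb") as f:
            f.write(wav_bytes)
        paths.append(path)

    files_to_concat = [ffmpeg.input(p) for p in paths]
    ffmpeg.concat(*files_to_concat, v=0, a=1).output(combined_file_name).run(
        overwrite_output=True
    )
    return combined_file_name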
@@ -410,12 +496,14 @@ with gr.Blocks(title=title) as demo:
 
     with gr.Row():
         audio = gr.Audio(
-            type="numpy",
+            label="Generated audio response",
             streaming=False,
             autoplay=False,
-            label="Generated audio response",
+            interactive=True,
             show_label=True,
         )
+        # TODO add a second audio that plays whole sentences (for mobile especially)
+        # final_audio = gr.Audio(label="Final audio response", streaming=False, autoplay=False, interactive=False, show_label=True, visible=False)
 
     clear_btn = gr.ClearButton([chatbot, audio])
 
@@ -432,7 +520,7 @@ with gr.Blocks(title=title) as demo:
     txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
 
     file_msg = btn.stop_recording(
-        add_file, [chatbot, btn], [chatbot], queue=False
+        add_file, [chatbot, btn], [chatbot, txt], queue=False
     ).then(generate_speech, chatbot, [audio, chatbot])
 
     gr.Markdown(
 