jbilcke-hf HF staff committed on
Commit
9d33eb1
1 Parent(s): 9d0a4ee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -53
app.py CHANGED
@@ -508,26 +508,11 @@ from pydub import AudioSegment
508
  second_of_silence = AudioSegment.silent() # use default
509
  second_of_silence.export("sil.wav", format='wav')
510
 
511
-
512
- def generate_speech(history,chatbot_role):
513
- # Must set autoplay to True first
514
- yield (history, chatbot_role, "", wave_header_chunk() )
515
- for sentence, history in get_sentence(history,chatbot_role):
516
- if sentence != "":
517
- print("BG: inserting sentence to queue")
518
-
519
- generated_speech = generate_speech_for_sentence(history, chatbot_role, sentence,return_as_byte=True)
520
- if generated_speech is not None:
521
- _, audio_dict = generated_speech
522
- # We are using byte streaming
523
- yield (history, chatbot_role, sentence, audio_dict["value"] )
524
-
525
-
526
- # will generate speech audio file per sentence
527
- def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte=False):
528
  language = "autodetect"
529
 
530
- total_wav_bytestream = b""
531
 
532
  if len(sentence)==0:
533
  print("EMPTY SENTENCE")
@@ -558,12 +543,14 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
558
  # sentence = sentence[:-1] + " " + sentence[-1]
559
 
560
  # regex does the job well
561
- sentence= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?|\!)",r"\1 \2\2",sentence)
562
 
563
  print("Sentence for speech:", sentence)
564
 
 
 
565
  try:
566
- if len(sentence)<SENTENCE_SPLIT_LENGTH:
567
  # no problem continue on
568
  sentence_list = [sentence]
569
  else:
@@ -572,10 +559,13 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
572
  # Do whatever necessary, first break at hyphens then spaces and then even split very long words
573
  # sentence_list=textwrap.wrap(sentence,SENTENCE_SPLIT_LENGTH)
574
  sentence_list = split_sentences(sentence, SENTENCE_SPLIT_LENGTH)
575
- print("SPLITTED LONG SENTENCE:",sentence_list)
 
576
 
577
  for sentence in sentence_list:
578
 
 
 
579
  if any(c.isalnum() for c in sentence):
580
  if language=="autodetect":
581
  #on first call autodetect, nexts sentence calls will use same language
@@ -589,11 +579,11 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
589
  # likely got a ' or " or some other text without alphanumeric in it
590
  audio_stream = None
591
 
592
- sentence_wav_bytestream = b""
593
-
594
  # XTTS is actually using streaming response but we are playing audio by sentence
595
  # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
596
  if audio_stream is not None:
 
 
597
  # frame_length = 0
598
  for chunk in audio_stream:
599
  try:
@@ -604,27 +594,23 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
604
  # hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
605
  continue
606
 
607
- # Filter output for better voice
608
- filter_output=True
609
- if filter_output:
610
- data_s16 = np.frombuffer(sentence_wav_bytestream, dtype=np.int16, count=len(sentence_wav_bytestream)//2, offset=0)
611
- float_data = data_s16 * 0.5**15
612
- reduced_noise = nr.reduce_noise(y=float_data, sr=24000,prop_decrease =0.8,n_fft=1024)
613
- sentence_wav_bytestream = (reduced_noise * 32767).astype(np.int16)
614
- sentence_wav_bytestream = sentence_wav_bytestream.tobytes()
615
-
616
- total_wav_bytestream += sentence_wav_bytestream
617
-
618
- # Directly encode the WAV bytestream to base64
619
- base64_audio = base64.b64encode(pcm_to_wav(total_wav_bytestream)).decode('utf8')
 
 
 
620
 
621
- if audio_stream is not None:
622
- return (history, base64_audio)
623
- else:
624
- # Handle the case where the audio stream is None (e.g., silent response)
625
- return (history, None)
626
-
627
-
628
  except RuntimeError as e:
629
  if "device-side assert" in str(e):
630
  # cannot do anything on cuda device side error, need to restart
@@ -641,8 +627,7 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
641
  print("RuntimeError: non device-side assert error:", str(e))
642
  raise e
643
 
644
- print("All speech ended")
645
- return
646
 
647
  latent_map = {}
648
  latent_map["Cloée"] = get_latents("voices/cloee-1.wav")
@@ -673,15 +658,8 @@ def generate_story_and_speech(secret_token, input_text, chatbot_role):
673
  # Convert the list of lists back into a list of tuples for the history
674
  history_tuples = [tuple(entry) for entry in last_history]
675
 
676
- synthesized_speech = generate_speech_for_sentence(history_tuples, chatbot_role, story_text, return_as_byte=True)
677
 
678
- if synthesized_speech:
679
- # Retrieve the base64 audio string from the tuple
680
- base64_audio = synthesized_speech[1]
681
- return {"text": story_text.strip(), "audio": base64_audio}
682
- else:
683
- return {"text": "Failed to generate story (no synthesized speech)", "audio": None}
684
-
685
  else:
686
  return {"text": "Failed to generate story (last_history is empty)", "audio": None}
687
 
 
508
  second_of_silence = AudioSegment.silent() # use default
509
  second_of_silence.export("sil.wav", format='wav')
510
 
511
+
512
+ def generate_speech_from_history(history, chatbot_role, sentence):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
513
  language = "autodetect"
514
 
515
+ # total_wav_bytestream = b""
516
 
517
  if len(sentence)==0:
518
  print("EMPTY SENTENCE")
 
543
  # sentence = sentence[:-1] + " " + sentence[-1]
544
 
545
  # regex does the job well
546
+ sentence = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?|\!)",r"\1 \2\2",sentence)
547
 
548
  print("Sentence for speech:", sentence)
549
 
550
+ results = []
551
+
552
  try:
553
+ if len(sentence) < SENTENCE_SPLIT_LENGTH:
554
  # no problem continue on
555
  sentence_list = [sentence]
556
  else:
 
559
  # Do whatever necessary, first break at hyphens then spaces and then even split very long words
560
  # sentence_list=textwrap.wrap(sentence,SENTENCE_SPLIT_LENGTH)
561
  sentence_list = split_sentences(sentence, SENTENCE_SPLIT_LENGTH)
562
+
563
+ print("detected sentences:", sentence_list)
564
 
565
  for sentence in sentence_list:
566
 
567
+ print("- sentence = ", sentence)
568
+
569
  if any(c.isalnum() for c in sentence):
570
  if language=="autodetect":
571
  #on first call autodetect, nexts sentence calls will use same language
 
579
  # likely got a ' or " or some other text without alphanumeric in it
580
  audio_stream = None
581
 
 
 
582
  # XTTS is actually using streaming response but we are playing audio by sentence
583
  # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
584
  if audio_stream is not None:
585
+ sentence_wav_bytestream = b""
586
+
587
  # frame_length = 0
588
  for chunk in audio_stream:
589
  try:
 
594
  # hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
595
  continue
596
 
597
+ # Filter output for better voice
598
+ filter_output=False
599
+ if filter_output:
600
+ data_s16 = np.frombuffer(sentence_wav_bytestream, dtype=np.int16, count=len(sentence_wav_bytestream)//2, offset=0)
601
+ float_data = data_s16 * 0.5**15
602
+ reduced_noise = nr.reduce_noise(y=float_data, sr=24000,prop_decrease =0.8,n_fft=1024)
603
+ sentence_wav_bytestream = (reduced_noise * 32767).astype(np.int16)
604
+ sentence_wav_bytestream = sentence_wav_bytestream.tobytes()
605
+
606
+ # Directly encode the WAV bytestream to base64
607
+ base64_audio = base64.b64encode(pcm_to_wav(sentence_wav_bytestream)).decode('utf8')
608
+
609
+ results.append({ "text": sentence, "audio": base64_audio })
610
+ else:
611
+ # Handle the case where the audio stream is None (e.g., silent response)
612
+ results.append({ "text": sentence, "audio": "" })
613
 
 
 
 
 
 
 
 
614
  except RuntimeError as e:
615
  if "device-side assert" in str(e):
616
  # cannot do anything on cuda device side error, need to restart
 
627
  print("RuntimeError: non device-side assert error:", str(e))
628
  raise e
629
 
630
+ return results
 
631
 
632
  latent_map = {}
633
  latent_map["Cloée"] = get_latents("voices/cloee-1.wav")
 
658
  # Convert the list of lists back into a list of tuples for the history
659
  history_tuples = [tuple(entry) for entry in last_history]
660
 
661
+ return generate_speech_from_history(history_tuples, chatbot_role, story_text)
662
 
 
 
 
 
 
 
 
663
  else:
664
  return {"text": "Failed to generate story (last_history is empty)", "audio": None}
665