Spaces: Paused

Commit 9d33eb1 • Parent: 9d0a4ee
Update app.py

app.py CHANGED
@@ -508,26 +508,11 @@ from pydub import AudioSegment
 second_of_silence = AudioSegment.silent() # use default
 second_of_silence.export("sil.wav", format='wav')
 
-
-def
-    # Must set autoplay to True first
-    yield (history, chatbot_role, "", wave_header_chunk() )
-    for sentence, history in get_sentence(history,chatbot_role):
-        if sentence != "":
-            print("BG: inserting sentence to queue")
-
-            generated_speech = generate_speech_for_sentence(history, chatbot_role, sentence,return_as_byte=True)
-            if generated_speech is not None:
-                _, audio_dict = generated_speech
-                # We are using byte streaming
-                yield (history, chatbot_role, sentence, audio_dict["value"] )
-
-
-# will generate speech audio file per sentence
-def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte=False):
+
+def generate_speech_from_history(history, chatbot_role, sentence):
     language = "autodetect"
 
-    total_wav_bytestream = b""
+    # total_wav_bytestream = b""
 
     if len(sentence)==0:
         print("EMPTY SENTENCE")
@@ -558,12 +543,14 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
     # sentence = sentence[:-1] + " " + sentence[-1]
 
     # regex does the job well
-    sentence= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?|\!)",r"\1 \2\2",sentence)
+    sentence = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?|\!)",r"\1 \2\2",sentence)
 
     print("Sentence for speech:", sentence)
 
+    results = []
+
     try:
-        if len(sentence)<SENTENCE_SPLIT_LENGTH:
+        if len(sentence) < SENTENCE_SPLIT_LENGTH:
             # no problem continue on
             sentence_list = [sentence]
         else:
@@ -572,10 +559,13 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
             # Do whatever necessary, first break at hypens then spaces and then even split very long words
             # sentence_list=textwrap.wrap(sentence,SENTENCE_SPLIT_LENGTH)
             sentence_list = split_sentences(sentence, SENTENCE_SPLIT_LENGTH)
-
+
+            print("detected sentences:", sentence_list)
 
         for sentence in sentence_list:
 
+            print("- sentence = ", sentence)
+
             if any(c.isalnum() for c in sentence):
                 if language=="autodetect":
                     #on first call autodetect, nexts sentence calls will use same language
@@ -589,11 +579,11 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
                 # likely got a ' or " or some other text without alphanumeric in it
                 audio_stream = None
 
-            sentence_wav_bytestream = b""
-
             # XTTS is actually using streaming response but we are playing audio by sentence
             # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
             if audio_stream is not None:
+                sentence_wav_bytestream = b""
+
                 # frame_length = 0
                 for chunk in audio_stream:
                     try:
@@ -604,27 +594,23 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
                         # hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
                         continue
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Filter output for better voice
+                filter_output=False
+                if filter_output:
+                    data_s16 = np.frombuffer(sentence_wav_bytestream, dtype=np.int16, count=len(sentence_wav_bytestream)//2, offset=0)
+                    float_data = data_s16 * 0.5**15
+                    reduced_noise = nr.reduce_noise(y=float_data, sr=24000,prop_decrease =0.8,n_fft=1024)
+                    sentence_wav_bytestream = (reduced_noise * 32767).astype(np.int16)
+                    sentence_wav_bytestream = sentence_wav_bytestream.tobytes()
+
+                # Directly encode the WAV bytestream to base64
+                base64_audio = base64.b64encode(pcm_to_wav(sentence_wav_bytestream)).decode('utf8')
+
+                results.append({ "text": sentence, "audio": base64_audio })
+            else:
+                # Handle the case where the audio stream is None (e.g., silent response)
+                results.append({ "text": sentence, "audio": "" })
 
-            if audio_stream is not None:
-                return (history, base64_audio)
-            else:
-                # Handle the case where the audio stream is None (e.g., silent response)
-                return (history, None)
-
-
     except RuntimeError as e:
         if "device-side assert" in str(e):
             # cannot do anything on cuda device side error, need tor estart
@@ -641,8 +627,7 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
         print("RuntimeError: non device-side assert error:", str(e))
         raise e
 
-
-    return
+    return results
 
 latent_map = {}
 latent_map["Cloée"] = get_latents("voices/cloee-1.wav")
@@ -673,15 +658,8 @@ def generate_story_and_speech(secret_token, input_text, chatbot_role):
         # Convert the list of lists back into a list of tuples for the history
         history_tuples = [tuple(entry) for entry in last_history]
 
-
+        return generate_speech_from_history(history_tuples, chatbot_role, story_text)
 
-        if synthesized_speech:
-            # Retrieve the base64 audio string from the tuple
-            base64_audio = synthesized_speech[1]
-            return {"text": story_text.strip(), "audio": base64_audio}
-        else:
-            return {"text": "Failed to generate story (no synthesized speech)", "audio": None}
-
     else:
         return {"text": "Failed to generate story (last_history is empty)", "audio": None}
 
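
The re.sub line above rewrites sentence-final punctuation: whenever a word character or non-ASCII character is followed by a terminator (., 。, ?, !), the terminator is doubled and set off by a space, presumably to make XTTS produce a fuller stop. A quick illustration of what it does (the sample string is made up):

import re

s = "It works. Really?"
s = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?|\!)", r"\1 \2\2", s)
print(s)  # -> "It works .. Really ??"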
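
split_sentences() itself is defined elsewhere in app.py; the diff only shows the call, the comment "first break at hypens then spaces and then even split very long words", and a commented-out textwrap.wrap alternative. A minimal stand-in with that behavior, built on textwrap (the name split_sentences_sketch is hypothetical):

import textwrap

def split_sentences_sketch(text, max_len):
    # textwrap prefers breaking at whitespace and hyphens, and
    # hard-splits any single word still longer than max_len
    return textwrap.wrap(text, max_len,
                         break_on_hyphens=True,
                         break_long_words=True)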
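
The new filter_output block (shipped disabled, filter_output=False) runs each sentence's PCM through the noisereduce package: 16-bit samples are scaled to floats in [-1, 1), denoised at XTTS's 24 kHz rate, then rescaled back to int16. Pulled out as a helper, the same round trip looks like this (a sketch; the helper name is mine):

import numpy as np
import noisereduce as nr

def denoise_pcm16(pcm_bytes, sample_rate=24000):
    # int16 PCM -> float in [-1, 1); 0.5**15 == 1/32768
    data_s16 = np.frombuffer(pcm_bytes, dtype=np.int16)
    float_data = data_s16 * 0.5**15
    # prop_decrease=0.8 removes 80% of the estimated noise, leaving
    # some of it in so the voice does not sound over-filtered
    reduced = nr.reduce_noise(y=float_data, sr=sample_rate,
                              prop_decrease=0.8, n_fft=1024)
    # back to int16 bytes, ready for WAV packaging
    return (reduced * 32767).astype(np.int16).tobytes()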
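
pcm_to_wav() is also defined elsewhere in app.py; here it wraps the raw bytestream before base64 encoding. Assuming 24 kHz mono 16-bit output (matching the sr=24000 above), an equivalent helper using the standard-library wave module could be:

import io
import wave

def pcm_to_wav_sketch(pcm_bytes, sample_rate=24000):
    # Prepend a RIFF/WAV header to raw 16-bit mono PCM so browsers
    # and audio widgets can play the base64 payload directly.
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wav:
        wav.setnchannels(1)   # mono
        wav.setsampwidth(2)   # 16-bit samples
        wav.setframerate(sample_rate)
        wav.writeframes(pcm_bytes)
    return buf.getvalue()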
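
The net effect of the commit is a new contract: generate_speech_from_history() returns a list of {"text", "audio"} dicts, one per detected sentence, where "audio" is a base64-encoded WAV string ("" when the sentence produced no audio stream), replacing the old (history, base64_audio) tuple and the yield-based streaming path. A caller might unpack it like so (a sketch; the file naming is illustrative):

import base64

def save_speech_results(results, prefix="sentence"):
    # Write each non-empty base64 WAV payload to its own file and
    # return (sentence, path) pairs for playback elsewhere.
    saved = []
    for i, item in enumerate(results):
        if not item["audio"]:
            continue  # silent response for this sentence
        path = f"{prefix}_{i}.wav"
        with open(path, "wb") as f:
            f.write(base64.b64decode(item["audio"]))
        saved.append((item["text"], path))
    return saved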
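
One caveat the diff leaves in place: generate_story_and_speech() now returns the per-sentence list on success but still returns a single {"text", "audio"} dict on the empty-history failure path, so clients have to branch on the response shape:

resp = generate_story_and_speech(secret_token, input_text, chatbot_role)
if isinstance(resp, dict):
    # failure path: {"text": <error message>, "audio": None}
    print("generation failed:", resp["text"])
else:
    # success path: [{"text": sentence, "audio": base64_wav}, ...]
    for item in resp:
        print(item["text"], "->", len(item["audio"]), "base64 chars")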