ai-story-server

Paused

App Files Files Community

jbilcke-hf HF staff committed on Nov 21, 2023

Commit

9d33eb1

•

1 Parent(s): 9d0a4ee

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -53

app.py CHANGED Viewed

@@ -508,26 +508,11 @@ from pydub import AudioSegment
 second_of_silence = AudioSegment.silent() # use default
 second_of_silence.export("sil.wav", format='wav')
-def generate_speech(history,chatbot_role):
-    # Must set autoplay to True first
-    yield (history, chatbot_role, "", wave_header_chunk() )
-    for sentence, history in get_sentence(history,chatbot_role):
-        if sentence != "":
-            print("BG: inserting sentence to queue")
-            generated_speech = generate_speech_for_sentence(history, chatbot_role, sentence,return_as_byte=True)
-            if generated_speech is not None:
-                _, audio_dict = generated_speech
-                # We are using byte streaming
-                yield (history, chatbot_role, sentence, audio_dict["value"] )
-# will generate speech audio file per sentence
-def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte=False):
     language = "autodetect"
-    total_wav_bytestream = b""
     if len(sentence)==0:
         print("EMPTY SENTENCE")
@@ -558,12 +543,14 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
     #    sentence = sentence[:-1] + " " + sentence[-1]
     # regex does the job well
-    sentence= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?|\!)",r"\1 \2\2",sentence)
     print("Sentence for speech:", sentence)
     try:
-        if len(sentence)<SENTENCE_SPLIT_LENGTH:
             # no problem continue on
             sentence_list = [sentence]
         else:
@@ -572,10 +559,13 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
             # Do whatever is necessary: first break at hyphens, then spaces, and then even split very long words
             # sentence_list=textwrap.wrap(sentence,SENTENCE_SPLIT_LENGTH)
             sentence_list = split_sentences(sentence, SENTENCE_SPLIT_LENGTH)
-            print("SPLITTED LONG SENTENCE:",sentence_list)
         for sentence in sentence_list:
             if any(c.isalnum() for c in sentence):
                 if language=="autodetect":
                     #on first call autodetect, nexts sentence calls will use same language
@@ -589,11 +579,11 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
                 # likely got a ' or " or some other text without alphanumeric in it
                 audio_stream = None
-            sentence_wav_bytestream = b""
             # XTTS is actually using streaming response but we are playing audio by sentence
             # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
             if audio_stream is not None:
                 # frame_length = 0
                 for chunk in audio_stream:
                     try:
@@ -604,27 +594,23 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
                         # hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
                         continue
-            # Filter output for better voice
-            filter_output=True
-            if filter_output:
-                data_s16 = np.frombuffer(sentence_wav_bytestream, dtype=np.int16, count=len(sentence_wav_bytestream)//2, offset=0)
-                float_data = data_s16 * 0.5**15
-                reduced_noise = nr.reduce_noise(y=float_data, sr=24000,prop_decrease =0.8,n_fft=1024)
-                sentence_wav_bytestream = (reduced_noise * 32767).astype(np.int16)
-                sentence_wav_bytestream = sentence_wav_bytestream.tobytes()
-            total_wav_bytestream += sentence_wav_bytestream
-        # Directly encode the WAV bytestream to base64
-        base64_audio = base64.b64encode(pcm_to_wav(total_wav_bytestream)).decode('utf8')
-        if audio_stream is not None:
-            return (history, base64_audio)
-        else:
-            # Handle the case where the audio stream is None (e.g., silent response)
-            return (history, None)
     except RuntimeError as e:
         if "device-side assert" in str(e):
             # cannot do anything on cuda device side error, need to restart
@@ -641,8 +627,7 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
             print("RuntimeError: non device-side assert error:", str(e))
             raise e
-    print("All speech ended")
-    return
 latent_map = {}
 latent_map["Cloée"] = get_latents("voices/cloee-1.wav")
@@ -673,15 +658,8 @@ def generate_story_and_speech(secret_token, input_text, chatbot_role):
         # Convert the list of lists back into a list of tuples for the history
         history_tuples = [tuple(entry) for entry in last_history]
-        synthesized_speech = generate_speech_for_sentence(history_tuples, chatbot_role, story_text, return_as_byte=True)
-        if synthesized_speech:
-            # Retrieve the base64 audio string from the tuple
-            base64_audio = synthesized_speech[1]
-            return {"text": story_text.strip(), "audio": base64_audio}
-        else:
-            return {"text": "Failed to generate story (no synthesized speech)", "audio": None}
     else:
         return {"text": "Failed to generate story (last_history is empty)", "audio": None}

 second_of_silence = AudioSegment.silent() # use default
 second_of_silence.export("sil.wav", format='wav')
+def generate_speech_from_history(history, chatbot_role, sentence):
     language = "autodetect"
+    # total_wav_bytestream = b""
     if len(sentence)==0:
         print("EMPTY SENTENCE")
     #    sentence = sentence[:-1] + " " + sentence[-1]
     # regex does the job well
+    sentence = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?|\!)",r"\1 \2\2",sentence)
     print("Sentence for speech:", sentence)
+    results = []
     try:
+        if len(sentence) < SENTENCE_SPLIT_LENGTH:
             # no problem continue on
             sentence_list = [sentence]
         else:
             # Do whatever is necessary: first break at hyphens, then spaces, and then even split very long words
             # sentence_list=textwrap.wrap(sentence,SENTENCE_SPLIT_LENGTH)
             sentence_list = split_sentences(sentence, SENTENCE_SPLIT_LENGTH)
+        print("detected sentences:", sentence_list)
         for sentence in sentence_list:
+            print("- sentence = ", sentence)
             if any(c.isalnum() for c in sentence):
                 if language=="autodetect":
                     #on first call autodetect, nexts sentence calls will use same language
                 # likely got a ' or " or some other text without alphanumeric in it
                 audio_stream = None
             # XTTS is actually using streaming response but we are playing audio by sentence
             # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
             if audio_stream is not None:
+                sentence_wav_bytestream = b""
                 # frame_length = 0
                 for chunk in audio_stream:
                     try:
                         # hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
                         continue
+                # Filter output for better voice
+                filter_output=False
+                if filter_output:
+                    data_s16 = np.frombuffer(sentence_wav_bytestream, dtype=np.int16, count=len(sentence_wav_bytestream)//2, offset=0)
+                    float_data = data_s16 * 0.5**15
+                    reduced_noise = nr.reduce_noise(y=float_data, sr=24000,prop_decrease =0.8,n_fft=1024)
+                    sentence_wav_bytestream = (reduced_noise * 32767).astype(np.int16)
+                    sentence_wav_bytestream = sentence_wav_bytestream.tobytes()
+                # Directly encode the WAV bytestream to base64
+                base64_audio = base64.b64encode(pcm_to_wav(sentence_wav_bytestream)).decode('utf8')
+                results.append({ "text": sentence, "audio": base64_audio })
+            else:
+                # Handle the case where the audio stream is None (e.g., silent response)
+                results.append({ "text": sentence, "audio": "" })
     except RuntimeError as e:
         if "device-side assert" in str(e):
             # cannot do anything on cuda device side error, need to restart
             print("RuntimeError: non device-side assert error:", str(e))
             raise e
+    return results
 latent_map = {}
 latent_map["Cloée"] = get_latents("voices/cloee-1.wav")
         # Convert the list of lists back into a list of tuples for the history
         history_tuples = [tuple(entry) for entry in last_history]
+        return generate_speech_from_history(history_tuples, chatbot_role, story_text)
     else:
         return {"text": "Failed to generate story (last_history is empty)", "audio": None}