Spaces:

akhaliq
/

llama-3.2-3b-voice

Running

App Files Files Community

akhaliq HF staff committed on Sep 27

Commit

649a30c

•

1 Parent(s): dec22aa

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -31

app.py CHANGED Viewed

@@ -9,7 +9,6 @@ from dataclasses import dataclass, field
 from threading import Lock
 import base64
 @dataclass
 class AppState:
     stream: np.ndarray | None = None
@@ -83,30 +82,22 @@ def generate_response_and_audio(audio_bytes: bytes, state: AppState):
             stream=True,
         )
-        full_response = ""
-        audios = []
         for chunk in stream:
             if not chunk.choices:
                 continue
             content = chunk.choices[0].delta.content
             audio = getattr(chunk.choices[0], 'audio', [])
-            if content:
-                full_response += content
-                yield full_response, None, state
-            if audio:
-                audios.extend(audio)
-        final_audio = b''.join([base64.b64decode(a) for a in audios])
-        yield full_response, final_audio, state
     except Exception as e:
         raise gr.Error(f"Error during audio streaming: {e}")
 def response(state: AppState):
     if state.stream is None or len(state.stream) == 0:
-        return None, None, state
     audio_buffer = io.BytesIO()
     segment = AudioSegment(
@@ -119,26 +110,24 @@ def response(state: AppState):
     generator = generate_response_and_audio(audio_buffer.getvalue(), state)
-    # Process the generator to get the final results
-    final_text = ""
-    final_audio = None
     for text, audio, updated_state in generator:
-        final_text = text if text else final_text
-        final_audio = audio if audio else final_audio
         state = updated_state
-    # Update the chatbot with the final conversation
-    state.conversation.append({"role": "user", "content": "Audio input"})
-    state.conversation.append({"role": "assistant", "content": final_text})
     # Reset the audio stream for the next interaction
     state.stream = None
     state.pause_detected = False
-    chatbot_output = state.conversation[-2:]  # Get the last two messages
-    return chatbot_output, final_audio, state
 def start_recording_user(state: AppState):
     if not state.stopped:
         return gr.Audio(recording=True)
@@ -167,7 +156,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
-            input_audio = gr.Audio(label="Input Audio", sources="microphone", type="numpy")
         with gr.Column():
             chatbot = gr.Chatbot(label="Conversation", type="messages")
             output_audio = gr.Audio(label="Output Audio", autoplay=True)
@@ -188,10 +177,9 @@ with gr.Blocks() as demo:
     respond = input_audio.stop_recording(
         response,
         [state],
-        [chatbot, output_audio, state]
     )
-    # Update the chatbot with the final conversation
-    respond.then(lambda s: s.conversation, [state], [chatbot])
     # Automatically restart recording after the assistant's response
     restart = output_audio.stop(

 from threading import Lock
 import base64
 @dataclass
 class AppState:
     stream: np.ndarray | None = None
             stream=True,
         )
         for chunk in stream:
             if not chunk.choices:
                 continue
             content = chunk.choices[0].delta.content
             audio = getattr(chunk.choices[0], 'audio', [])
+            if content or audio:
+                audio_bytes = b''.join([base64.b64decode(a) for a in audio]) if audio else None
+                yield content, audio_bytes, state
     except Exception as e:
         raise gr.Error(f"Error during audio streaming: {e}")
 def response(state: AppState):
     if state.stream is None or len(state.stream) == 0:
+        yield None, None, state
+        return
     audio_buffer = io.BytesIO()
     segment = AudioSegment(
     generator = generate_response_and_audio(audio_buffer.getvalue(), state)
+    # Add the user's audio input to the conversation
+    state.conversation.append({"role": "user", "content": "Audio input"})
+    # Prepare assistant's message
+    assistant_message = {"role": "assistant", "content": ""}
+    state.conversation.append(assistant_message)
     for text, audio, updated_state in generator:
+        if text:
+            assistant_message["content"] += text
         state = updated_state
+        chatbot_output = state.conversation[-2:]  # Get the last two messages
+        yield chatbot_output, audio, state
     # Reset the audio stream for the next interaction
     state.stream = None
     state.pause_detected = False
 def start_recording_user(state: AppState):
     if not state.stopped:
         return gr.Audio(recording=True)
     with gr.Row():
         with gr.Column():
+            input_audio = gr.Audio(label="Input Audio", source="microphone", type="numpy")
         with gr.Column():
             chatbot = gr.Chatbot(label="Conversation", type="messages")
             output_audio = gr.Audio(label="Output Audio", autoplay=True)
     respond = input_audio.stop_recording(
         response,
         [state],
+        [chatbot, output_audio, state],
+        every=1  # Ensures outputs are updated as they are yielded
     )
     # Automatically restart recording after the assistant's response
     restart = output_audio.stop(