akhaliq HF staff committed on
Commit
7fa63ee
1 Parent(s): ba47ac1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -26
app.py CHANGED
@@ -55,9 +55,19 @@ def process_audio(audio: tuple, state: AppState):
55
  state.pause_detected = pause_detected
56
 
57
  if state.pause_detected:
58
- return gr.update(recording=False), state
 
59
  else:
60
- return None, state
 
 
 
 
 
 
 
 
 
61
 
62
  def generate_response_and_audio(audio_bytes: bytes, state: AppState):
63
  if state.client is None:
@@ -76,20 +86,39 @@ def generate_response_and_audio(audio_bytes: bytes, state: AppState):
76
  "tts_audio_bitrate": bitrate
77
  },
78
  model="llama3.1-8b",
79
- messages=[{"role": "user", "content": [{"type": "audio", "data": audio_data}]}],
80
  temperature=0.7,
81
  max_tokens=256,
82
  stream=True,
83
  )
84
 
 
 
 
 
 
85
  for chunk in stream:
86
  if not chunk.choices:
87
  continue
88
- content = chunk.choices[0].delta.content
89
- audio = getattr(chunk.choices[0], 'audio', [])
90
- if content or audio:
91
- audio_bytes = b''.join([base64.b64decode(a) for a in audio]) if audio else None
92
- yield content, audio_bytes, state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  except Exception as e:
95
  raise gr.Error(f"Error during audio streaming: {e}")
@@ -110,18 +139,13 @@ def response(state: AppState):
110
 
111
  generator = generate_response_and_audio(audio_buffer.getvalue(), state)
112
 
113
- # Add the user's audio input to the conversation
114
- state.conversation.append({"role": "user", "content": "Audio input"})
115
-
116
- # Prepare assistant's message
117
- assistant_message = {"role": "assistant", "content": ""}
118
- state.conversation.append(assistant_message)
119
-
120
- for text, audio, updated_state in generator:
121
- if text:
122
- assistant_message["content"] += text
123
  state = updated_state
124
- chatbot_output = state.conversation[-2:] # Get the last two messages
 
 
 
 
125
  yield chatbot_output, audio, state
126
 
127
  # Reset the audio stream for the next interaction
@@ -156,7 +180,7 @@ with gr.Blocks() as demo:
156
 
157
  with gr.Row():
158
  with gr.Column():
159
- input_audio = gr.Audio(label="Input Audio", sources="microphone", type="numpy")
160
  with gr.Column():
161
  chatbot = gr.Chatbot(label="Conversation", type="messages")
162
  output_audio = gr.Audio(label="Output Audio", autoplay=True)
@@ -166,18 +190,22 @@ with gr.Blocks() as demo:
166
  set_key_button.click(set_api_key, inputs=[api_key_input, state], outputs=[api_key_status, state])
167
  format_dropdown.change(update_format, inputs=[format_dropdown, state], outputs=[state])
168
 
 
 
 
169
  stream = input_audio.stream(
170
  process_audio,
171
  [input_audio, state],
172
- [input_audio, state],
173
  stream_every=0.25, # Reduced to make it more responsive
174
  time_limit=60, # Increased to allow for longer messages
175
  )
176
 
177
- respond = input_audio.stop_recording(
 
178
  response,
179
- [state],
180
- [chatbot, output_audio, state],
181
  )
182
 
183
  # Automatically restart recording after the assistant's response
@@ -190,6 +218,6 @@ with gr.Blocks() as demo:
190
  # Add a "Stop Conversation" button
191
  cancel = gr.Button("Stop Conversation", variant="stop")
192
  cancel.click(lambda: (AppState(stopped=True), gr.update(recording=False)), None,
193
- [state, input_audio], cancels=[respond, restart])
194
 
195
- demo.launch()
 
55
  state.pause_detected = pause_detected
56
 
57
  if state.pause_detected:
58
+ # Stop recording and trigger response
59
+ return gr.update(recording=False), state, True
60
  else:
61
+ return None, state, False
62
+
63
+ def update_or_append_conversation(conversation, id, role, content):
64
+ # Find if there's an existing message with the given id
65
+ for message in conversation:
66
+ if message.get("id") == id and message.get("role") == role:
67
+ message["content"] = content
68
+ return
69
+ # If not found, append a new message
70
+ conversation.append({"id": id, "role": role, "content": content})
71
 
72
  def generate_response_and_audio(audio_bytes: bytes, state: AppState):
73
  if state.client is None:
 
86
  "tts_audio_bitrate": bitrate
87
  },
88
  model="llama3.1-8b",
89
+ messages=state.conversation + [{"role": "user", "content": [{"type": "audio", "data": audio_data}]}],
90
  temperature=0.7,
91
  max_tokens=256,
92
  stream=True,
93
  )
94
 
95
+ id = str(time.time())
96
+ full_response = ""
97
+ asr_result = ""
98
+ audio_bytes_accumulated = b''
99
+
100
  for chunk in stream:
101
  if not chunk.choices:
102
  continue
103
+ delta = chunk.choices[0].delta
104
+ content = delta.get("content", "")
105
+ audio = getattr(chunk.choices[0], "audio", [])
106
+ asr_results = getattr(chunk.choices[0], "asr_results", [])
107
+
108
+ if asr_results:
109
+ asr_result += "".join(asr_results)
110
+ yield id, None, asr_result, None, state
111
+
112
+ if content:
113
+ full_response += content
114
+ yield id, full_response, None, None, state
115
+
116
+ if audio:
117
+ # Accumulate audio bytes and yield them
118
+ audio_bytes_accumulated += b''.join([base64.b64decode(a) for a in audio])
119
+ yield id, None, None, audio_bytes_accumulated, state
120
+
121
+ yield id, full_response, asr_result, audio_bytes_accumulated, state
122
 
123
  except Exception as e:
124
  raise gr.Error(f"Error during audio streaming: {e}")
 
139
 
140
  generator = generate_response_and_audio(audio_buffer.getvalue(), state)
141
 
142
+ for id, text, asr, audio, updated_state in generator:
 
 
 
 
 
 
 
 
 
143
  state = updated_state
144
+ if asr:
145
+ update_or_append_conversation(state.conversation, id, "user", asr)
146
+ if text:
147
+ update_or_append_conversation(state.conversation, id, "assistant", text)
148
+ chatbot_output = state.conversation
149
  yield chatbot_output, audio, state
150
 
151
  # Reset the audio stream for the next interaction
 
180
 
181
  with gr.Row():
182
  with gr.Column():
183
+ input_audio = gr.Audio(label="Input Audio", source="microphone", type="numpy")
184
  with gr.Column():
185
  chatbot = gr.Chatbot(label="Conversation", type="messages")
186
  output_audio = gr.Audio(label="Output Audio", autoplay=True)
 
190
  set_key_button.click(set_api_key, inputs=[api_key_input, state], outputs=[api_key_status, state])
191
  format_dropdown.change(update_format, inputs=[format_dropdown, state], outputs=[state])
192
 
193
+ # Add a dummy output to trigger the response function
194
+ should_process_response = gr.Variable(False)
195
+
196
  stream = input_audio.stream(
197
  process_audio,
198
  [input_audio, state],
199
+ [input_audio, state, should_process_response],
200
  stream_every=0.25, # Reduced to make it more responsive
201
  time_limit=60, # Increased to allow for longer messages
202
  )
203
 
204
+ # When should_process_response is True, call response
205
+ stream.then(
206
  response,
207
+ inputs=[state],
208
+ outputs=[chatbot, output_audio, state]
209
  )
210
 
211
  # Automatically restart recording after the assistant's response
 
218
  # Add a "Stop Conversation" button
219
  cancel = gr.Button("Stop Conversation", variant="stop")
220
  cancel.click(lambda: (AppState(stopped=True), gr.update(recording=False)), None,
221
+ [state, input_audio], cancels=[stream, restart])
222
 
223
+ demo.launch(queue=True)