Update app.py
Browse files
app.py
CHANGED
@@ -55,9 +55,19 @@ def process_audio(audio: tuple, state: AppState):
|
|
55 |
state.pause_detected = pause_detected
|
56 |
|
57 |
if state.pause_detected:
|
58 |
-
|
|
|
59 |
else:
|
60 |
-
return None, state
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
def generate_response_and_audio(audio_bytes: bytes, state: AppState):
|
63 |
if state.client is None:
|
@@ -76,20 +86,39 @@ def generate_response_and_audio(audio_bytes: bytes, state: AppState):
|
|
76 |
"tts_audio_bitrate": bitrate
|
77 |
},
|
78 |
model="llama3.1-8b",
|
79 |
-
messages=[{"role": "user", "content": [{"type": "audio", "data": audio_data}]}],
|
80 |
temperature=0.7,
|
81 |
max_tokens=256,
|
82 |
stream=True,
|
83 |
)
|
84 |
|
|
|
|
|
|
|
|
|
|
|
85 |
for chunk in stream:
|
86 |
if not chunk.choices:
|
87 |
continue
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
except Exception as e:
|
95 |
raise gr.Error(f"Error during audio streaming: {e}")
|
@@ -110,18 +139,13 @@ def response(state: AppState):
|
|
110 |
|
111 |
generator = generate_response_and_audio(audio_buffer.getvalue(), state)
|
112 |
|
113 |
-
|
114 |
-
state.conversation.append({"role": "user", "content": "Audio input"})
|
115 |
-
|
116 |
-
# Prepare assistant's message
|
117 |
-
assistant_message = {"role": "assistant", "content": ""}
|
118 |
-
state.conversation.append(assistant_message)
|
119 |
-
|
120 |
-
for text, audio, updated_state in generator:
|
121 |
-
if text:
|
122 |
-
assistant_message["content"] += text
|
123 |
state = updated_state
|
124 |
-
|
|
|
|
|
|
|
|
|
125 |
yield chatbot_output, audio, state
|
126 |
|
127 |
# Reset the audio stream for the next interaction
|
@@ -156,7 +180,7 @@ with gr.Blocks() as demo:
|
|
156 |
|
157 |
with gr.Row():
|
158 |
with gr.Column():
|
159 |
-
input_audio = gr.Audio(label="Input Audio",
|
160 |
with gr.Column():
|
161 |
chatbot = gr.Chatbot(label="Conversation", type="messages")
|
162 |
output_audio = gr.Audio(label="Output Audio", autoplay=True)
|
@@ -166,18 +190,22 @@ with gr.Blocks() as demo:
|
|
166 |
set_key_button.click(set_api_key, inputs=[api_key_input, state], outputs=[api_key_status, state])
|
167 |
format_dropdown.change(update_format, inputs=[format_dropdown, state], outputs=[state])
|
168 |
|
|
|
|
|
|
|
169 |
stream = input_audio.stream(
|
170 |
process_audio,
|
171 |
[input_audio, state],
|
172 |
-
[input_audio, state],
|
173 |
stream_every=0.25, # Reduced to make it more responsive
|
174 |
time_limit=60, # Increased to allow for longer messages
|
175 |
)
|
176 |
|
177 |
-
|
|
|
178 |
response,
|
179 |
-
[state],
|
180 |
-
[chatbot, output_audio, state]
|
181 |
)
|
182 |
|
183 |
# Automatically restart recording after the assistant's response
|
@@ -190,6 +218,6 @@ with gr.Blocks() as demo:
|
|
190 |
# Add a "Stop Conversation" button
|
191 |
cancel = gr.Button("Stop Conversation", variant="stop")
|
192 |
cancel.click(lambda: (AppState(stopped=True), gr.update(recording=False)), None,
|
193 |
-
[state, input_audio], cancels=[
|
194 |
|
195 |
-
demo.launch()
|
|
|
55 |
state.pause_detected = pause_detected
|
56 |
|
57 |
if state.pause_detected:
|
58 |
+
# Stop recording and trigger response
|
59 |
+
return gr.update(recording=False), state, True
|
60 |
else:
|
61 |
+
return None, state, False
|
62 |
+
|
63 |
+
def update_or_append_conversation(conversation, id, role, content):
|
64 |
+
# Find if there's an existing message with the given id
|
65 |
+
for message in conversation:
|
66 |
+
if message.get("id") == id and message.get("role") == role:
|
67 |
+
message["content"] = content
|
68 |
+
return
|
69 |
+
# If not found, append a new message
|
70 |
+
conversation.append({"id": id, "role": role, "content": content})
|
71 |
|
72 |
def generate_response_and_audio(audio_bytes: bytes, state: AppState):
|
73 |
if state.client is None:
|
|
|
86 |
"tts_audio_bitrate": bitrate
|
87 |
},
|
88 |
model="llama3.1-8b",
|
89 |
+
messages=state.conversation + [{"role": "user", "content": [{"type": "audio", "data": audio_data}]}],
|
90 |
temperature=0.7,
|
91 |
max_tokens=256,
|
92 |
stream=True,
|
93 |
)
|
94 |
|
95 |
+
id = str(time.time())
|
96 |
+
full_response = ""
|
97 |
+
asr_result = ""
|
98 |
+
audio_bytes_accumulated = b''
|
99 |
+
|
100 |
for chunk in stream:
|
101 |
if not chunk.choices:
|
102 |
continue
|
103 |
+
delta = chunk.choices[0].delta
|
104 |
+
content = delta.get("content", "")
|
105 |
+
audio = getattr(chunk.choices[0], "audio", [])
|
106 |
+
asr_results = getattr(chunk.choices[0], "asr_results", [])
|
107 |
+
|
108 |
+
if asr_results:
|
109 |
+
asr_result += "".join(asr_results)
|
110 |
+
yield id, None, asr_result, None, state
|
111 |
+
|
112 |
+
if content:
|
113 |
+
full_response += content
|
114 |
+
yield id, full_response, None, None, state
|
115 |
+
|
116 |
+
if audio:
|
117 |
+
# Accumulate audio bytes and yield them
|
118 |
+
audio_bytes_accumulated += b''.join([base64.b64decode(a) for a in audio])
|
119 |
+
yield id, None, None, audio_bytes_accumulated, state
|
120 |
+
|
121 |
+
yield id, full_response, asr_result, audio_bytes_accumulated, state
|
122 |
|
123 |
except Exception as e:
|
124 |
raise gr.Error(f"Error during audio streaming: {e}")
|
|
|
139 |
|
140 |
generator = generate_response_and_audio(audio_buffer.getvalue(), state)
|
141 |
|
142 |
+
for id, text, asr, audio, updated_state in generator:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
state = updated_state
|
144 |
+
if asr:
|
145 |
+
update_or_append_conversation(state.conversation, id, "user", asr)
|
146 |
+
if text:
|
147 |
+
update_or_append_conversation(state.conversation, id, "assistant", text)
|
148 |
+
chatbot_output = state.conversation
|
149 |
yield chatbot_output, audio, state
|
150 |
|
151 |
# Reset the audio stream for the next interaction
|
|
|
180 |
|
181 |
with gr.Row():
|
182 |
with gr.Column():
|
183 |
+
input_audio = gr.Audio(label="Input Audio", source="microphone", type="numpy")
|
184 |
with gr.Column():
|
185 |
chatbot = gr.Chatbot(label="Conversation", type="messages")
|
186 |
output_audio = gr.Audio(label="Output Audio", autoplay=True)
|
|
|
190 |
set_key_button.click(set_api_key, inputs=[api_key_input, state], outputs=[api_key_status, state])
|
191 |
format_dropdown.change(update_format, inputs=[format_dropdown, state], outputs=[state])
|
192 |
|
193 |
+
# Add a dummy output to trigger the response function
|
194 |
+
should_process_response = gr.Variable(False)
|
195 |
+
|
196 |
stream = input_audio.stream(
|
197 |
process_audio,
|
198 |
[input_audio, state],
|
199 |
+
[input_audio, state, should_process_response],
|
200 |
stream_every=0.25, # Reduced to make it more responsive
|
201 |
time_limit=60, # Increased to allow for longer messages
|
202 |
)
|
203 |
|
204 |
+
# When should_process_response is True, call response
|
205 |
+
stream.then(
|
206 |
response,
|
207 |
+
inputs=[state],
|
208 |
+
outputs=[chatbot, output_audio, state]
|
209 |
)
|
210 |
|
211 |
# Automatically restart recording after the assistant's response
|
|
|
218 |
# Add a "Stop Conversation" button
|
219 |
cancel = gr.Button("Stop Conversation", variant="stop")
|
220 |
cancel.click(lambda: (AppState(stopped=True), gr.update(recording=False)), None,
|
221 |
+
[state, input_audio], cancels=[stream, restart])
|
222 |
|
223 |
+
demo.launch(queue=True)
|