akhaliq HF staff committed on
Commit
7fa63ee
1 Parent(s): ba47ac1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -26
app.py CHANGED
@@ -55,9 +55,19 @@ def process_audio(audio: tuple, state: AppState):
55
  state.pause_detected = pause_detected
56
 
57
  if state.pause_detected:
58
- return gr.update(recording=False), state
 
59
  else:
60
- return None, state
 
 
 
 
 
 
 
 
 
61
 
62
  def generate_response_and_audio(audio_bytes: bytes, state: AppState):
63
  if state.client is None:
@@ -76,20 +86,39 @@ def generate_response_and_audio(audio_bytes: bytes, state: AppState):
76
  "tts_audio_bitrate": bitrate
77
  },
78
  model="llama3.1-8b",
79
- messages=[{"role": "user", "content": [{"type": "audio", "data": audio_data}]}],
80
  temperature=0.7,
81
  max_tokens=256,
82
  stream=True,
83
  )
84
 
 
 
 
 
 
85
  for chunk in stream:
86
  if not chunk.choices:
87
  continue
88
- content = chunk.choices[0].delta.content
89
- audio = getattr(chunk.choices[0], 'audio', [])
90
- if content or audio:
91
- audio_bytes = b''.join([base64.b64decode(a) for a in audio]) if audio else None
92
- yield content, audio_bytes, state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  except Exception as e:
95
  raise gr.Error(f"Error during audio streaming: {e}")
@@ -110,18 +139,13 @@ def response(state: AppState):
110
 
111
  generator = generate_response_and_audio(audio_buffer.getvalue(), state)
112
 
113
- # Add the user's audio input to the conversation
114
- state.conversation.append({"role": "user", "content": "Audio input"})
115
-
116
- # Prepare assistant's message
117
- assistant_message = {"role": "assistant", "content": ""}
118
- state.conversation.append(assistant_message)
119
-
120
- for text, audio, updated_state in generator:
121
- if text:
122
- assistant_message["content"] += text
123
  state = updated_state
124
- chatbot_output = state.conversation[-2:] # Get the last two messages
 
 
 
 
125
  yield chatbot_output, audio, state
126
 
127
  # Reset the audio stream for the next interaction
@@ -156,7 +180,7 @@ with gr.Blocks() as demo:
156
 
157
  with gr.Row():
158
  with gr.Column():
159
- input_audio = gr.Audio(label="Input Audio", sources="microphone", type="numpy")
160
  with gr.Column():
161
  chatbot = gr.Chatbot(label="Conversation", type="messages")
162
  output_audio = gr.Audio(label="Output Audio", autoplay=True)
@@ -166,18 +190,22 @@ with gr.Blocks() as demo:
166
  set_key_button.click(set_api_key, inputs=[api_key_input, state], outputs=[api_key_status, state])
167
  format_dropdown.change(update_format, inputs=[format_dropdown, state], outputs=[state])
168
 
 
 
 
169
  stream = input_audio.stream(
170
  process_audio,
171
  [input_audio, state],
172
- [input_audio, state],
173
  stream_every=0.25, # Reduced to make it more responsive
174
  time_limit=60, # Increased to allow for longer messages
175
  )
176
 
177
- respond = input_audio.stop_recording(
 
178
  response,
179
- [state],
180
- [chatbot, output_audio, state],
181
  )
182
 
183
  # Automatically restart recording after the assistant's response
@@ -190,6 +218,6 @@ with gr.Blocks() as demo:
190
  # Add a "Stop Conversation" button
191
  cancel = gr.Button("Stop Conversation", variant="stop")
192
  cancel.click(lambda: (AppState(stopped=True), gr.update(recording=False)), None,
193
- [state, input_audio], cancels=[respond, restart])
194
 
195
- demo.launch()
 
55
  state.pause_detected = pause_detected
56
 
57
  if state.pause_detected:
58
+ # Stop recording and trigger response
59
+ return gr.update(recording=False), state, True
60
  else:
61
+ return None, state, False
62
+
63
+ def update_or_append_conversation(conversation, id, role, content):
64
+ # Find if there's an existing message with the given id
65
+ for message in conversation:
66
+ if message.get("id") == id and message.get("role") == role:
67
+ message["content"] = content
68
+ return
69
+ # If not found, append a new message
70
+ conversation.append({"id": id, "role": role, "content": content})
71
 
72
  def generate_response_and_audio(audio_bytes: bytes, state: AppState):
73
  if state.client is None:
 
86
  "tts_audio_bitrate": bitrate
87
  },
88
  model="llama3.1-8b",
89
+ messages=state.conversation + [{"role": "user", "content": [{"type": "audio", "data": audio_data}]}],
90
  temperature=0.7,
91
  max_tokens=256,
92
  stream=True,
93
  )
94
 
95
+ id = str(time.time())
96
+ full_response = ""
97
+ asr_result = ""
98
+ audio_bytes_accumulated = b''
99
+
100
  for chunk in stream:
101
  if not chunk.choices:
102
  continue
103
+ delta = chunk.choices[0].delta
104
+ content = delta.get("content", "")
105
+ audio = getattr(chunk.choices[0], "audio", [])
106
+ asr_results = getattr(chunk.choices[0], "asr_results", [])
107
+
108
+ if asr_results:
109
+ asr_result += "".join(asr_results)
110
+ yield id, None, asr_result, None, state
111
+
112
+ if content:
113
+ full_response += content
114
+ yield id, full_response, None, None, state
115
+
116
+ if audio:
117
+ # Accumulate audio bytes and yield them
118
+ audio_bytes_accumulated += b''.join([base64.b64decode(a) for a in audio])
119
+ yield id, None, None, audio_bytes_accumulated, state
120
+
121
+ yield id, full_response, asr_result, audio_bytes_accumulated, state
122
 
123
  except Exception as e:
124
  raise gr.Error(f"Error during audio streaming: {e}")
 
139
 
140
  generator = generate_response_and_audio(audio_buffer.getvalue(), state)
141
 
142
+ for id, text, asr, audio, updated_state in generator:
 
 
 
 
 
 
 
 
 
143
  state = updated_state
144
+ if asr:
145
+ update_or_append_conversation(state.conversation, id, "user", asr)
146
+ if text:
147
+ update_or_append_conversation(state.conversation, id, "assistant", text)
148
+ chatbot_output = state.conversation
149
  yield chatbot_output, audio, state
150
 
151
  # Reset the audio stream for the next interaction
 
180
 
181
  with gr.Row():
182
  with gr.Column():
183
+ input_audio = gr.Audio(label="Input Audio", source="microphone", type="numpy")
184
  with gr.Column():
185
  chatbot = gr.Chatbot(label="Conversation", type="messages")
186
  output_audio = gr.Audio(label="Output Audio", autoplay=True)
 
190
  set_key_button.click(set_api_key, inputs=[api_key_input, state], outputs=[api_key_status, state])
191
  format_dropdown.change(update_format, inputs=[format_dropdown, state], outputs=[state])
192
 
193
+ # Add a dummy output to trigger the response function
194
+ should_process_response = gr.Variable(False)
195
+
196
  stream = input_audio.stream(
197
  process_audio,
198
  [input_audio, state],
199
+ [input_audio, state, should_process_response],
200
  stream_every=0.25, # Reduced to make it more responsive
201
  time_limit=60, # Increased to allow for longer messages
202
  )
203
 
204
+ # When should_process_response is True, call response
205
+ stream.then(
206
  response,
207
+ inputs=[state],
208
+ outputs=[chatbot, output_audio, state]
209
  )
210
 
211
  # Automatically restart recording after the assistant's response
 
218
  # Add a "Stop Conversation" button
219
  cancel = gr.Button("Stop Conversation", variant="stop")
220
  cancel.click(lambda: (AppState(stopped=True), gr.update(recording=False)), None,
221
+ [state, input_audio], cancels=[stream, restart])
222
 
223
+ demo.launch(queue=True)