IliaLarchenko committed
Commit 939b9ab
1 Parent(s): 755dbb1

Audio processing refactoring

Files changed (1):
  1. api/audio.py  +253 -129

api/audio.py CHANGED
@@ -1,25 +1,31 @@
 import io
 import wave
-
 import numpy as np
 import requests
-
 from openai import OpenAI
-
-from utils.errors import APIError, AudioConversionError
-from typing import List, Optional, Generator, Tuple
 import webrtcvad
-
 from transformers import pipeline
+from typing import List, Optional, Generator, Tuple, Any
+from utils.errors import APIError, AudioConversionError
 
+SAMPLE_RATE: int = 48000
+FRAME_DURATION: int = 30
 
 
-def detect_voice(audio: np.ndarray, sample_rate: int = 48000, frame_duration: int = 30) -> bool:
-    vad = webrtcvad.Vad()
-    vad.set_mode(3)  # Aggressiveness mode: 0 (least aggressive) to 3 (most aggressive)
-
-    # Convert numpy array to 16-bit PCM bytes
-    audio_bytes = audio.tobytes()
+def detect_voice(audio: np.ndarray, sample_rate: int = SAMPLE_RATE, frame_duration: int = FRAME_DURATION) -> bool:
+    """
+    Detect voice activity in the given audio data.
+
+    Args:
+        audio (np.ndarray): Audio data as a numpy array.
+        sample_rate (int): Sample rate of the audio. Defaults to SAMPLE_RATE.
+        frame_duration (int): Duration of each frame in milliseconds. Defaults to FRAME_DURATION.
+
+    Returns:
+        bool: True if voice activity is detected, False otherwise.
+    """
+    vad = webrtcvad.Vad(3)  # Aggressiveness mode: 3 (most aggressive)
+    audio_bytes = audio.tobytes()
     num_samples_per_frame = int(sample_rate * frame_duration / 1000)
     frames = [audio_bytes[i : i + num_samples_per_frame * 2] for i in range(0, len(audio_bytes), num_samples_per_frame * 2)]
 
@@ -35,34 +41,43 @@ def detect_voice(audio: np.ndarray, sample_rate: int = 48000, frame_duration: in
 
 
 class STTManager:
-    def __init__(self, config):
-        self.SAMPLE_RATE = 48000
-        self.CHUNK_LENGTH = 5
-        self.STEP_LENGTH = 3
-        self.MAX_RELIABILITY_CUTOFF = self.CHUNK_LENGTH - 1
-
-        self.config = config
-        self.status = self.test_stt()
-        self.streaming = self.status
+    """Manages speech-to-text operations."""
+
+    def __init__(self, config: Any):
+        """
+        Initialize the STTManager.
+
+        Args:
+            config (Any): Configuration object containing STT settings.
+        """
+        self.config = config
+        self.SAMPLE_RATE: int = SAMPLE_RATE
+        self.CHUNK_LENGTH: int = 5
+        self.STEP_LENGTH: int = 3
+        self.MAX_RELIABILITY_CUTOFF: int = self.CHUNK_LENGTH - 1
+        self.status: bool = self.test_stt()
+        self.streaming: bool = self.status
         if config.stt.type == "HF_LOCAL":
             self.pipe = pipeline("automatic-speech-recognition", model=config.stt.name)
 
     def numpy_audio_to_bytes(self, audio_data: np.ndarray) -> bytes:
         """
-        Convert a numpy array of audio data to bytes.
-
-        :param audio_data: Numpy array containing audio data.
-        :return: Bytes representation of the audio data.
-        """
-        num_channels = 1
-        sampwidth = 2
+        Convert numpy array audio data to bytes.
+
+        Args:
+            audio_data (np.ndarray): Audio data as a numpy array.
+
+        Returns:
+            bytes: Audio data as bytes.
+
+        Raises:
+            AudioConversionError: If there's an error during conversion.
+        """
         buffer = io.BytesIO()
         try:
             with wave.open(buffer, "wb") as wf:
-                wf.setnchannels(num_channels)
-                wf.setsampwidth(sampwidth)
+                wf.setnchannels(1)
+                wf.setsampwidth(2)
                 wf.setframerate(self.SAMPLE_RATE)
                 wf.writeframes(audio_data.tobytes())
         except Exception as e:
@@ -71,112 +86,164 @@ class STTManager:
 
     def process_audio_chunk(self, audio: Tuple[int, np.ndarray], audio_buffer: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
         """
-        Process streamed audio data to accumulate and transcribe with overlapping segments.
-
-        :param audio: Tuple containing the sample rate and audio data as numpy array.
-        :param audio_buffer: Current audio buffer as numpy array.
-        :return: Updated current audio buffer, audio for transcription
-        """
+        Process an audio chunk and update the audio buffer.
+
+        Args:
+            audio (Tuple[int, np.ndarray]): Audio chunk data.
+            audio_buffer (np.ndarray): Existing audio buffer.
+
+        Returns:
+            Tuple[np.ndarray, np.ndarray]: Updated audio buffer and processed audio.
+        """
         has_voice = detect_voice(audio[1])
         ended = len(audio[1]) % 24000 != 0
-
         if has_voice:
             audio_buffer = np.concatenate((audio_buffer, audio[1]))
-
         is_short = len(audio_buffer) / self.SAMPLE_RATE < 1.0
-
         if is_short or (has_voice and not ended):
             return audio_buffer, np.array([], dtype=np.int16)
-
         return np.array([], dtype=np.int16), audio_buffer
 
     def transcribe_audio(self, audio: np.ndarray, text: str = "") -> str:
         """
-        Convert speech to text from a full audio segment.
-
-        :param audio: Numpy array containing audio data.
-        :param text: Text message to add.
-        :return: Transcribed text.
-        """
-
+        Transcribe audio data and append to existing text.
+
+        Args:
+            audio (np.ndarray): Audio data to transcribe.
+            text (str): Existing text to append to. Defaults to empty string.
+
+        Returns:
+            str: Transcribed text appended to existing text.
+        """
         if len(audio) < 500:
             return text
-        else:
-            transcript = self.transcribe_numpy_array(audio, context=text)
-            return text + " " + transcript
+        transcript = self.transcribe_numpy_array(audio, context=text)
 
-    def add_to_chat(self, text: str, chat: List[List[Optional[str]]], editable_chat: bool = True) -> List[List[Optional[str]]]:
-        """
-        Add a text message to the chat history.
-
-        :param text: Text message to add.
-        :param chat: List of chat messages.
-        :return: Updated chat history.
-        """
-        if not editable_chat or len(text) == 0:
-            return chat
-
-        if len(chat) == 0 or chat[-1][0] is None:
-            chat.append(["", None])
-
-        chat[-1][0] = text
-
-        return chat
-
-    def transcribe_and_add_to_chat(self, audio: np.ndarray, chat: List[List[Optional[str]]]) -> List[List[Optional[str]]]:
-        """
-        Transcribe audio and add the transcription to the chat history.
-
-        :param audio: Numpy array containing audio data.
-        :param chat: List of chat messages.
-        :return: Updated chat history.
-        """
-        text = self.transcribe_audio(audio)
-        chat = self.add_to_chat(text, chat)
+        return f"{text} {transcript}".strip()
+
+    def transcribe_and_add_to_chat(self, audio: np.ndarray, chat: List[List[Optional[str]]]) -> List[List[Optional[str]]]:
+        """
+        Transcribe audio and add the result to the chat history.
+
+        Args:
+            audio (np.ndarray): Audio data to transcribe.
+            chat (List[List[Optional[str]]]): Existing chat history.
+
+        Returns:
+            List[List[Optional[str]]]: Updated chat history with transcribed text.
+        """
+        text = self.transcribe_audio(audio)
+        return self.add_to_chat(text, chat)
+
+    def add_to_chat(self, text: str, chat: List[List[Optional[str]]]) -> List[List[Optional[str]]]:
+        """
+        Add text to the chat history.
+
+        Args:
+            text (str): Text to add to chat.
+            chat (List[List[Optional[str]]]): Existing chat history.
+            editable_chat (bool): Whether the chat is editable. Defaults to True.
+
+        Returns:
+            List[List[Optional[str]]]: Updated chat history.
+        """
+        if not text:
+            return chat
+        if not chat or chat[-1][0] is None:
+            chat.append(["", None])
+        chat[-1][0] = text
         return chat
 
     def transcribe_numpy_array(self, audio: np.ndarray, context: Optional[str] = None) -> str:
         """
-        Convert speech to text from a full audio segment.
-
-        :param audio: Tuple containing the sample rate and audio data as numpy array.
-        :param context: Optional context for the transcription.
-        :return: Transcribed text.
+        Transcribe audio data using the configured STT service.
+
+        Args:
+            audio (np.ndarray): Audio data as a numpy array.
+            context (Optional[str]): Optional context for transcription.
+
+        Returns:
+            str: Transcribed text.
+
+        Raises:
+            APIError: If there's an unexpected error during transcription.
         """
+        transcription_methods = {
+            "OPENAI_API": self._transcribe_openai,
+            "HF_API": self._transcribe_hf_api,
+            "HF_LOCAL": self._transcribe_hf_local,
+        }
+
         try:
-            if self.config.stt.type == "OPENAI_API":
-                audio_bytes = self.numpy_audio_to_bytes(audio)
-                data = ("temp.wav", audio_bytes, "audio/wav")
-                client = OpenAI(base_url=self.config.stt.url, api_key=self.config.stt.key)
-                transcription = client.audio.transcriptions.create(
-                    model=self.config.stt.name, file=data, response_format="text", prompt=context
-                )
-            elif self.config.stt.type == "HF_API":
-                audio_bytes = self.numpy_audio_to_bytes(audio)
-                headers = {"Authorization": "Bearer " + self.config.stt.key}
-                response = requests.post(self.config.stt.url, headers=headers, data=audio_bytes)
-                if response.status_code != 200:
-                    error_details = response.json().get("error", "No error message provided")
-                    raise APIError("STT Error: HF API error", status_code=response.status_code, details=error_details)
-                transcription = response.json().get("text", None)
-                if transcription is None:
-                    raise APIError("STT Error: No transcription returned by HF API")
-            elif self.config.stt.type == "HF_LOCAL":
-                result = self.pipe({"sampling_rate": self.SAMPLE_RATE, "raw": audio.astype(np.float32) / 32768.0})
-                transcription = result["text"]
-        except APIError:
-            raise
+            transcribe_method = transcription_methods.get(self.config.stt.type)
+            if transcribe_method:
+                return transcribe_method(audio, context)
+            else:
+                raise APIError(f"Unsupported STT type: {self.config.stt.type}")
        except Exception as e:
             raise APIError(f"STT Error: Unexpected error: {e}")
 
+    def _transcribe_openai(self, audio: np.ndarray, context: Optional[str]) -> str:
+        """
+        Transcribe audio using OpenAI API.
+
+        Args:
+            audio (np.ndarray): Audio data as a numpy array.
+            context (Optional[str]): Optional context for transcription.
+
+        Returns:
+            str: Transcribed text.
+        """
+        audio_bytes = self.numpy_audio_to_bytes(audio)
+        data = ("temp.wav", audio_bytes, "audio/wav")
+        client = OpenAI(base_url=self.config.stt.url, api_key=self.config.stt.key)
+        return client.audio.transcriptions.create(model=self.config.stt.name, file=data, response_format="text", prompt=context)
+
+    def _transcribe_hf_api(self, audio: np.ndarray, _context: Optional[str]) -> str:
+        """
+        Transcribe audio using Hugging Face API.
+
+        Args:
+            audio (np.ndarray): Audio data as a numpy array.
+            _context (Optional[str]): Unused context parameter.
+
+        Returns:
+            str: Transcribed text.
+
+        Raises:
+            APIError: If there's an error in the API response.
+        """
+        audio_bytes = self.numpy_audio_to_bytes(audio)
+        headers = {"Authorization": f"Bearer {self.config.stt.key}"}
+        response = requests.post(self.config.stt.url, headers=headers, data=audio_bytes)
+        if response.status_code != 200:
+            error_details = response.json().get("error", "No error message provided")
+            raise APIError("STT Error: HF API error", status_code=response.status_code, details=error_details)
+        transcription = response.json().get("text")
+        if transcription is None:
+            raise APIError("STT Error: No transcription returned by HF API")
         return transcription
 
+    def _transcribe_hf_local(self, audio: np.ndarray, _context: Optional[str]) -> str:
+        """
+        Transcribe audio using local Hugging Face model.
+
+        Args:
+            audio (np.ndarray): Audio data as a numpy array.
+            _context (Optional[str]): Unused context parameter.
+
+        Returns:
+            str: Transcribed text.
+        """
+        result = self.pipe({"sampling_rate": self.SAMPLE_RATE, "raw": audio.astype(np.float32) / 32768.0})
+        return result["text"]
+
     def test_stt(self) -> bool:
         """
-        Test if the STT service is working correctly.
+        Test the STT functionality.
 
-        :return: True if the STT service is working, False otherwise.
+        Returns:
+            bool: True if the test is successful, False otherwise.
         """
         try:
             self.transcribe_audio(np.zeros(10000))
@@ -186,15 +253,29 @@ class STTManager:
 
 
 class TTSManager:
-    def __init__(self, config):
+    """Manages text-to-speech operations."""
+
+    def __init__(self, config: Any):
+        """
+        Initialize the TTSManager.
+
+        Args:
+            config (Any): Configuration object containing TTS settings.
+        """
         self.config = config
-        self.status = self.test_tts(stream=False)
-        self.streaming = self.test_tts(stream=True) if self.status else False
+        self.SAMPLE_RATE: int = SAMPLE_RATE
+        self.status: bool = self.test_tts(stream=False)
+        self.streaming: bool = self.test_tts(stream=True) if self.status else False
 
-    def test_tts(self, stream) -> bool:
+    def test_tts(self, stream: bool) -> bool:
         """
-        Test if the TTS service is working correctly.
-        :return: True if the TTS service is working, False otherwise.
+        Test the TTS functionality.
+
+        Args:
+            stream (bool): Whether to test streaming TTS.
+
+        Returns:
+            bool: True if the test is successful, False otherwise.
         """
         try:
             list(self.read_text("Handshake", stream=stream))
@@ -204,52 +285,95 @@ class TTSManager:
 
     def read_text(self, text: str, stream: Optional[bool] = None) -> Generator[bytes, None, None]:
         """
-        Convert text to speech and return the audio bytes, optionally streaming the response.
-        :param text: Text to convert to speech.
-        :param stream: Whether to use streaming or not.
-        :return: Generator yielding chunks of audio bytes.
-        """
+        Convert text to speech using the configured TTS service.
+
+        Args:
+            text (str): Text to convert to speech.
+            stream (Optional[bool]): Whether to stream the audio. Defaults to self.streaming if not provided.
+
+        Yields:
+            bytes: Audio data in bytes.
+
+        Raises:
+            APIError: If there's an unexpected error during text-to-speech conversion.
+        """
         if not text:
             yield b""
             return
 
-        if stream is None:
-            stream = self.streaming
+        stream = self.streaming if stream is None else stream
 
-        headers = {"Authorization": "Bearer " + self.config.tts.key}
+        headers = {"Authorization": f"Bearer {self.config.tts.key}"}
         data = {"model": self.config.tts.name, "input": text, "voice": "alloy", "response_format": "opus"}
 
         try:
-            if not stream:
-                if self.config.tts.type == "OPENAI_API":
-                    response = requests.post(self.config.tts.url + "/audio/speech", headers=headers, json=data)
-                elif self.config.tts.type == "HF_API":
-                    response = requests.post(self.config.tts.url, headers=headers, json={"inputs": text})
-
-                if response.status_code != 200:
-                    error_details = response.json().get("error", "No error message provided")
-                    raise APIError(f"TTS Error: {self.config.tts.type} error", status_code=response.status_code, details=error_details)
-                yield response.content
-            else:
-                if self.config.tts.type != "OPENAI_API":
-                    raise APIError("TTS Error: Streaming not supported for this TTS type")
-
-                with requests.post(self.config.tts.url + "/audio/speech", headers=headers, json=data, stream=True) as response:
-                    if response.status_code != 200:
-                        error_details = response.json().get("error", "No error message provided")
-                        raise APIError("TTS Error: OPENAI API error", status_code=response.status_code, details=error_details)
-                    yield from response.iter_content(chunk_size=1024)
+            yield from self._read_text_stream(headers, data) if stream else self._read_text_non_stream(headers, data)
         except APIError:
             raise
         except Exception as e:
             raise APIError(f"TTS Error: Unexpected error: {e}")
 
+    def _read_text_non_stream(self, headers: dict, data: dict) -> Generator[bytes, None, None]:
+        """
+        Handle non-streaming TTS requests.
+
+        Args:
+            headers (dict): Request headers.
+            data (dict): Request data.
+
+        Yields:
+            bytes: Audio data in bytes.
+
+        Raises:
+            APIError: If there's an error in the API response.
+        """
+        if self.config.tts.type == "OPENAI_API":
+            url = f"{self.config.tts.url}/audio/speech"
+        elif self.config.tts.type == "HF_API":
+            url = self.config.tts.url
+            data = {"inputs": data["input"]}
+        else:
+            raise APIError(f"TTS Error: Unsupported TTS type: {self.config.tts.type}")
+
+        response = requests.post(url, headers=headers, json=data)
+        if response.status_code != 200:
+            error_details = response.json().get("error", "No error message provided")
+            raise APIError(f"TTS Error: {self.config.tts.type} error", status_code=response.status_code, details=error_details)
+        yield response.content
+
+    def _read_text_stream(self, headers: dict, data: dict) -> Generator[bytes, None, None]:
+        """
+        Handle streaming TTS requests.
+
+        Args:
+            headers (dict): Request headers.
+            data (dict): Request data.
+
+        Yields:
+            bytes: Audio data in bytes.
+
+        Raises:
+            APIError: If there's an error in the API response or if streaming is not supported.
+        """
+        if self.config.tts.type != "OPENAI_API":
+            raise APIError("TTS Error: Streaming not supported for this TTS type")
+
+        url = f"{self.config.tts.url}/audio/speech"
+        with requests.post(url, headers=headers, json=data, stream=True) as response:
+            if response.status_code != 200:
+                error_details = response.json().get("error", "No error message provided")
+                raise APIError("TTS Error: OPENAI API error", status_code=response.status_code, details=error_details)
+            yield from response.iter_content(chunk_size=1024)
+
     def read_last_message(self, chat_history: List[List[Optional[str]]]) -> Generator[bytes, None, None]:
         """
-        Read the last message in the chat history and convert it to speech.
-        :param chat_history: List of chat messages.
-        :return: Generator yielding chunks of audio bytes.
+        Read the last message in the chat history.
+
+        Args:
+            chat_history (List[List[Optional[str]]]): Chat history.
+
+        Yields:
+            bytes: Audio data for the last message.
         """
-        if len(chat_history) > 0 and chat_history[-1][1]:
+        if chat_history and chat_history[-1][1]:
             yield from self.read_text(chat_history[-1][1])
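
For orientation, a minimal usage sketch of the refactored module follows. It is not part of the commit: the SimpleNamespace config, the model names, the placeholder keys, and the silent test audio are illustrative assumptions, and it assumes the repository root is on PYTHONPATH so that api/audio.py imports as api.audio. Only the class names, method names, and the config attribute paths (config.stt.type/name/url/key and config.tts.type/name/url/key) come from the diff above.

# Hypothetical usage sketch (not part of the commit). Assumes config objects that
# expose the attributes referenced in api/audio.py: config.stt.{type, name, url, key}
# and config.tts.{type, name, url, key}.
from types import SimpleNamespace

import numpy as np

from api.audio import STTManager, TTSManager, detect_voice

config = SimpleNamespace(
    stt=SimpleNamespace(type="OPENAI_API", name="whisper-1", url="https://api.openai.com/v1", key="YOUR_KEY"),
    tts=SimpleNamespace(type="OPENAI_API", name="tts-1", url="https://api.openai.com/v1", key="YOUR_KEY"),
)

stt = STTManager(config)  # __init__ runs test_stt() and sets .status / .streaming
tts = TTSManager(config)  # __init__ runs test_tts() for non-streaming and streaming modes

# One second of silence at the module's 48 kHz sample rate, as 16-bit PCM.
silence = np.zeros(48000, dtype=np.int16)
print(detect_voice(silence))  # expected False: no voiced frames in silence

# Transcribe an utterance into a chat history (list of [user, assistant] pairs),
# then synthesize the last assistant message, if any, back to audio bytes.
chat = stt.transcribe_and_add_to_chat(silence, chat=[])
for audio_chunk in tts.read_last_message(chat):
    pass  # e.g. write the opus bytes to a file or stream them to the client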