Commit 939b9ab by IliaLarchenko
Parent(s): 755dbb1

Audio processing refactoring

api/audio.py  CHANGED  (+253, -129)
Removed lines (old version, excerpt):

@@ -1,25 +1,31 @@
-from utils.errors import APIError, AudioConversionError
-from typing import List, Optional, Generator, Tuple
-def detect_voice(audio: np.ndarray, sample_rate: int = 48000, frame_duration: int = 30) -> bool:
-    vad = webrtcvad.Vad()
-    vad.set_mode(3)  # Aggressiveness mode: 0 (least aggressive) to 3 (most aggressive)

@@ -35,34 +41,43 @@ def detect_voice(audio: np.ndarray, sample_rate: int = 48000, frame_duration: int = 30) -> bool:
-        self.SAMPLE_RATE = 48000
-        self.CHUNK_LENGTH = 5
-        self.STEP_LENGTH = 3
-        self.MAX_RELIABILITY_CUTOFF = self.CHUNK_LENGTH - 1

@@ -71,112 +86,164 @@ class STTManager:
-        transcript = self.transcribe_numpy_array(audio, context=text)
-        return text + " " + transcript
-            elif self.config.stt.type == "HF_API":
-                audio_bytes = self.numpy_audio_to_bytes(audio)
-                headers = {"Authorization": "Bearer " + self.config.stt.key}
-                response = requests.post(self.config.stt.url, headers=headers, data=audio_bytes)
-                if response.status_code != 200:
-                    error_details = response.json().get("error", "No error message provided")
-                    raise APIError("STT Error: HF API error", status_code=response.status_code, details=error_details)
-                transcription = response.json().get("text", None)
-                if transcription is None:
-                    raise APIError("STT Error: No transcription returned by HF API")
-            elif self.config.stt.type == "HF_LOCAL":
-                result = self.pipe({"sampling_rate": self.SAMPLE_RATE, "raw": audio.astype(np.float32) / 32768.0})
-                transcription = result["text"]
-        except APIError:
-            raise

@@ -186,15 +253,29 @@ class STTManager:
-    def test_tts(self, stream) -> bool:

@@ -204,52 +285,95 @@ class TTSManager:
-                if self.config.tts.type == "OPENAI_API":
-                    response = requests.post(self.config.tts.url + "/audio/speech", headers=headers, json=data)
-                elif self.config.tts.type == "HF_API":
-                    response = requests.post(self.config.tts.url, headers=headers, json={"inputs": text})
-                if response.status_code != 200:
-                    error_details = response.json().get("error", "No error message provided")
-                    raise APIError(f"TTS Error: {self.config.tts.type} error", status_code=response.status_code, details=error_details)
-                yield response.content
-            else:
-                if self.config.tts.type != "OPENAI_API":
-                    raise APIError("TTS Error: Streaming not supported for this TTS type")
-                with requests.post(self.config.tts.url + "/audio/speech", headers=headers, json=data, stream=True) as response:
-                    if response.status_code != 200:
-                        error_details = response.json().get("error", "No error message provided")
-                        raise APIError("TTS Error: OPENAI API error", status_code=response.status_code, details=error_details)
-                    yield from response.iter_content(chunk_size=1024)

api/audio.py after this commit:
import io
import wave
import numpy as np
import requests
from openai import OpenAI
import webrtcvad
from transformers import pipeline
from typing import List, Optional, Generator, Tuple, Any
from utils.errors import APIError, AudioConversionError

SAMPLE_RATE: int = 48000
FRAME_DURATION: int = 30


def detect_voice(audio: np.ndarray, sample_rate: int = SAMPLE_RATE, frame_duration: int = FRAME_DURATION) -> bool:
    """
    Detect voice activity in the given audio data.

    Args:
        audio (np.ndarray): Audio data as a numpy array.
        sample_rate (int): Sample rate of the audio. Defaults to SAMPLE_RATE.
        frame_duration (int): Duration of each frame in milliseconds. Defaults to FRAME_DURATION.

    Returns:
        bool: True if voice activity is detected, False otherwise.
    """
    vad = webrtcvad.Vad(3)  # Aggressiveness mode: 3 (most aggressive)
    audio_bytes = audio.tobytes()
    num_samples_per_frame = int(sample_rate * frame_duration / 1000)
    frames = [audio_bytes[i : i + num_samples_per_frame * 2] for i in range(0, len(audio_bytes), num_samples_per_frame * 2)]
    # ... (lines 32-40 unchanged, not shown in this diff)

class STTManager:
    """Manages speech-to-text operations."""

    def __init__(self, config: Any):
        """
        Initialize the STTManager.

        Args:
            config (Any): Configuration object containing STT settings.
        """
        self.config = config
        self.SAMPLE_RATE: int = SAMPLE_RATE
        self.CHUNK_LENGTH: int = 5
        self.STEP_LENGTH: int = 3
        self.MAX_RELIABILITY_CUTOFF: int = self.CHUNK_LENGTH - 1
        self.status: bool = self.test_stt()
        self.streaming: bool = self.status
        if config.stt.type == "HF_LOCAL":
            self.pipe = pipeline("automatic-speech-recognition", model=config.stt.name)

    def numpy_audio_to_bytes(self, audio_data: np.ndarray) -> bytes:
        """
        Convert numpy array audio data to bytes.

        Args:
            audio_data (np.ndarray): Audio data as a numpy array.

        Returns:
            bytes: Audio data as bytes.

        Raises:
            AudioConversionError: If there's an error during conversion.
        """
        buffer = io.BytesIO()
        try:
            with wave.open(buffer, "wb") as wf:
                wf.setnchannels(1)
                wf.setsampwidth(2)
                wf.setframerate(self.SAMPLE_RATE)
                wf.writeframes(audio_data.tobytes())
        except Exception as e:
            # ... (lines 84-85 unchanged, not shown in this diff)

    def process_audio_chunk(self, audio: Tuple[int, np.ndarray], audio_buffer: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Process an audio chunk and update the audio buffer.

        Args:
            audio (Tuple[int, np.ndarray]): Audio chunk data.
            audio_buffer (np.ndarray): Existing audio buffer.

        Returns:
            Tuple[np.ndarray, np.ndarray]: Updated audio buffer and processed audio.
        """
        has_voice = detect_voice(audio[1])
        ended = len(audio[1]) % 24000 != 0
        if has_voice:
            audio_buffer = np.concatenate((audio_buffer, audio[1]))
        is_short = len(audio_buffer) / self.SAMPLE_RATE < 1.0
        if is_short or (has_voice and not ended):
            return audio_buffer, np.array([], dtype=np.int16)
        return np.array([], dtype=np.int16), audio_buffer

    def transcribe_audio(self, audio: np.ndarray, text: str = "") -> str:
        """
        Transcribe audio data and append to existing text.

        Args:
            audio (np.ndarray): Audio data to transcribe.
            text (str): Existing text to append to. Defaults to empty string.

        Returns:
            str: Transcribed text appended to existing text.
        """
        if len(audio) < 500:
            return text
        transcript = self.transcribe_numpy_array(audio, context=text)

        return f"{text} {transcript}".strip()

    def transcribe_and_add_to_chat(self, audio: np.ndarray, chat: List[List[Optional[str]]]) -> List[List[Optional[str]]]:
        """
        Transcribe audio and add the result to the chat history.

        Args:
            audio (np.ndarray): Audio data to transcribe.
            chat (List[List[Optional[str]]]): Existing chat history.

        Returns:
            List[List[Optional[str]]]: Updated chat history with transcribed text.
        """
        text = self.transcribe_audio(audio)
        return self.add_to_chat(text, chat)

    def add_to_chat(self, text: str, chat: List[List[Optional[str]]]) -> List[List[Optional[str]]]:
        """
        Add text to the chat history.

        Args:
            text (str): Text to add to chat.
            chat (List[List[Optional[str]]]): Existing chat history.

        Returns:
            List[List[Optional[str]]]: Updated chat history.
        """
        if not text:
            return chat
        if not chat or chat[-1][0] is None:
            chat.append(["", None])
        chat[-1][0] = text
        return chat

    def transcribe_numpy_array(self, audio: np.ndarray, context: Optional[str] = None) -> str:
        """
        Transcribe audio data using the configured STT service.

        Args:
            audio (np.ndarray): Audio data as a numpy array.
            context (Optional[str]): Optional context for transcription.

        Returns:
            str: Transcribed text.

        Raises:
            APIError: If there's an unexpected error during transcription.
        """
        transcription_methods = {
            "OPENAI_API": self._transcribe_openai,
            "HF_API": self._transcribe_hf_api,
            "HF_LOCAL": self._transcribe_hf_local,
        }

        try:
            transcribe_method = transcription_methods.get(self.config.stt.type)
            if transcribe_method:
                return transcribe_method(audio, context)
            else:
                raise APIError(f"Unsupported STT type: {self.config.stt.type}")
        except Exception as e:
            raise APIError(f"STT Error: Unexpected error: {e}")

    def _transcribe_openai(self, audio: np.ndarray, context: Optional[str]) -> str:
        """
        Transcribe audio using OpenAI API.

        Args:
            audio (np.ndarray): Audio data as a numpy array.
            context (Optional[str]): Optional context for transcription.

        Returns:
            str: Transcribed text.
        """
        audio_bytes = self.numpy_audio_to_bytes(audio)
        data = ("temp.wav", audio_bytes, "audio/wav")
        client = OpenAI(base_url=self.config.stt.url, api_key=self.config.stt.key)
        return client.audio.transcriptions.create(model=self.config.stt.name, file=data, response_format="text", prompt=context)

    def _transcribe_hf_api(self, audio: np.ndarray, _context: Optional[str]) -> str:
        """
        Transcribe audio using Hugging Face API.

        Args:
            audio (np.ndarray): Audio data as a numpy array.
            _context (Optional[str]): Unused context parameter.

        Returns:
            str: Transcribed text.

        Raises:
            APIError: If there's an error in the API response.
        """
        audio_bytes = self.numpy_audio_to_bytes(audio)
        headers = {"Authorization": f"Bearer {self.config.stt.key}"}
        response = requests.post(self.config.stt.url, headers=headers, data=audio_bytes)
        if response.status_code != 200:
            error_details = response.json().get("error", "No error message provided")
            raise APIError("STT Error: HF API error", status_code=response.status_code, details=error_details)
        transcription = response.json().get("text")
        if transcription is None:
            raise APIError("STT Error: No transcription returned by HF API")
        return transcription

    def _transcribe_hf_local(self, audio: np.ndarray, _context: Optional[str]) -> str:
        """
        Transcribe audio using local Hugging Face model.

        Args:
            audio (np.ndarray): Audio data as a numpy array.
            _context (Optional[str]): Unused context parameter.

        Returns:
            str: Transcribed text.
        """
        result = self.pipe({"sampling_rate": self.SAMPLE_RATE, "raw": audio.astype(np.float32) / 32768.0})
        return result["text"]

    def test_stt(self) -> bool:
        """
        Test the STT functionality.

        Returns:
            bool: True if the test is successful, False otherwise.
        """
        try:
            self.transcribe_audio(np.zeros(10000))
        # ... (lines 250-252 unchanged, not shown in this diff)

class TTSManager:
    """Manages text-to-speech operations."""

    def __init__(self, config: Any):
        """
        Initialize the TTSManager.

        Args:
            config (Any): Configuration object containing TTS settings.
        """
        self.config = config
        self.SAMPLE_RATE: int = SAMPLE_RATE
        self.status: bool = self.test_tts(stream=False)
        self.streaming: bool = self.test_tts(stream=True) if self.status else False

    def test_tts(self, stream: bool) -> bool:
        """
        Test the TTS functionality.

        Args:
            stream (bool): Whether to test streaming TTS.

        Returns:
            bool: True if the test is successful, False otherwise.
        """
        try:
            list(self.read_text("Handshake", stream=stream))
        # ... (lines 282-284 unchanged, not shown in this diff)

    def read_text(self, text: str, stream: Optional[bool] = None) -> Generator[bytes, None, None]:
        """
        Convert text to speech using the configured TTS service.

        Args:
            text (str): Text to convert to speech.
            stream (Optional[bool]): Whether to stream the audio. Defaults to self.streaming if not provided.

        Yields:
            bytes: Audio data in bytes.

        Raises:
            APIError: If there's an unexpected error during text-to-speech conversion.
        """
        if not text:
            yield b""
            return

        stream = self.streaming if stream is None else stream

        headers = {"Authorization": f"Bearer {self.config.tts.key}"}
        data = {"model": self.config.tts.name, "input": text, "voice": "alloy", "response_format": "opus"}

        try:
            yield from self._read_text_stream(headers, data) if stream else self._read_text_non_stream(headers, data)
        except APIError:
            raise
        except Exception as e:
            raise APIError(f"TTS Error: Unexpected error: {e}")

    def _read_text_non_stream(self, headers: dict, data: dict) -> Generator[bytes, None, None]:
        """
        Handle non-streaming TTS requests.

        Args:
            headers (dict): Request headers.
            data (dict): Request data.

        Yields:
            bytes: Audio data in bytes.

        Raises:
            APIError: If there's an error in the API response.
        """
        if self.config.tts.type == "OPENAI_API":
            url = f"{self.config.tts.url}/audio/speech"
        elif self.config.tts.type == "HF_API":
            url = self.config.tts.url
            data = {"inputs": data["input"]}
        else:
            raise APIError(f"TTS Error: Unsupported TTS type: {self.config.tts.type}")

        response = requests.post(url, headers=headers, json=data)
        if response.status_code != 200:
            error_details = response.json().get("error", "No error message provided")
            raise APIError(f"TTS Error: {self.config.tts.type} error", status_code=response.status_code, details=error_details)
        yield response.content

    def _read_text_stream(self, headers: dict, data: dict) -> Generator[bytes, None, None]:
        """
        Handle streaming TTS requests.

        Args:
            headers (dict): Request headers.
            data (dict): Request data.

        Yields:
            bytes: Audio data in bytes.

        Raises:
            APIError: If there's an error in the API response or if streaming is not supported.
        """
        if self.config.tts.type != "OPENAI_API":
            raise APIError("TTS Error: Streaming not supported for this TTS type")

        url = f"{self.config.tts.url}/audio/speech"
        with requests.post(url, headers=headers, json=data, stream=True) as response:
            if response.status_code != 200:
                error_details = response.json().get("error", "No error message provided")
                raise APIError("TTS Error: OPENAI API error", status_code=response.status_code, details=error_details)
            yield from response.iter_content(chunk_size=1024)

    def read_last_message(self, chat_history: List[List[Optional[str]]]) -> Generator[bytes, None, None]:
        """
        Read the last message in the chat history.

        Args:
            chat_history (List[List[Optional[str]]]): Chat history.

        Yields:
            bytes: Audio data for the last message.
        """
        if chat_history and chat_history[-1][1]:
            yield from self.read_text(chat_history[-1][1])
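
For orientation, here is a minimal usage sketch of the two refactored classes. It is not part of the commit: the Config import, the file name reply.opus, and the synthetic microphone chunk are assumptions for illustration; only STTManager, TTSManager, and the methods listed above come from api/audio.py.

# Illustrative usage sketch only -- not part of this commit.
# Assumes a config object shaped like the one api/audio.py expects
# (config.stt.type/name/url/key and config.tts.type/name/url/key).
import numpy as np

from api.audio import STTManager, TTSManager
from utils.config import Config  # hypothetical config loader, adjust to the actual repo

config = Config()
stt = STTManager(config)
tts = TTSManager(config)

# Push microphone chunks (sample_rate, int16 samples) through the VAD-gated buffer.
audio_buffer = np.array([], dtype=np.int16)
chat = []
mic_chunks = [(48000, np.zeros(24000, dtype=np.int16))]  # stand-in for a real mic stream
for chunk in mic_chunks:
    audio_buffer, ready = stt.process_audio_chunk(chunk, audio_buffer)
    if len(ready) > 0:
        chat = stt.transcribe_and_add_to_chat(ready, chat)

# Speak an assistant reply with the configured TTS backend.
chat.append(["", "Hello! Let's begin the interview."])
with open("reply.opus", "wb") as f:
    for audio_bytes in tts.read_last_message(chat):
        f.write(audio_bytes)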