Spaces:
Sleeping
Sleeping
Ntabukiraniro
commited on
Commit
•
cd5981c
1
Parent(s):
1acb220
Create utils.py
Browse files
utils.py
ADDED
@@ -0,0 +1,438 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import io
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
import subprocess
|
7 |
+
import textwrap
|
8 |
+
import time
|
9 |
+
import uuid
|
10 |
+
import wave
|
11 |
+
|
12 |
+
import emoji
|
13 |
+
import gradio as gr
|
14 |
+
import langid
|
15 |
+
import nltk
|
16 |
+
import numpy as np
|
17 |
+
import noisereduce as nr
|
18 |
+
from huggingface_hub import HfApi
|
19 |
+
|
20 |
+
# Fetch the NLTK 'punkt' tokenizer data when this module is imported
nltk.download("punkt")

# Hugging Face API client; it is used to restart the Space after an
# unrecoverable error. Token and repo id come from the environment.
HF_TOKEN = os.getenv("HF_TOKEN")
REPO_ID = os.getenv("REPO_ID")
api = HfApi(token=HF_TOKEN)

# Cache of conditioning latents, keyed by chatbot voice name
latent_map = dict()
|
29 |
+
|
30 |
+
def get_latents(chatbot_voice, xtts_model, voice_cleanup=False):
    """Return the conditioning latents for a chatbot voice, computing them once.

    Results are cached in the module-level ``latent_map`` keyed by
    ``chatbot_voice`` only, so ``voice_cleanup`` has an effect only on the
    first call for a given voice.

    Args:
        chatbot_voice: Voice name; the reference audio is read from
            ``examples/<chatbot_voice>.wav``.
        xtts_model: Object exposing ``get_conditioning_latents(audio_path=...)``.
        voice_cleanup: When true, the reference audio is first run through an
            ffmpeg filter chain; if that fails the unfiltered file is used.

    Returns:
        Whatever ``xtts_model.get_conditioning_latents`` returned for this
        voice (callers in this file unpack it as
        ``(gpt_cond_latent, speaker_embedding)``).
    """
    if chatbot_voice not in latent_map:
        speaker_wav = f"examples/{chatbot_voice}.wav"
        if voice_cleanup:
            cleanup_filter = "lowpass=8000,highpass=75,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
            # The explicit ".wav" suffix lets ffmpeg infer the output format
            out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"
            # Build argv as a list (not by splitting a formatted string) so a
            # voice name containing spaces cannot break the command line.
            ffmpeg_command = [
                "ffmpeg",
                "-y",
                "-i",
                speaker_wav,
                "-af",
                cleanup_filter,
                "-ac",
                "1",
                "-ar",
                "22050",
                out_filename,
            ]
            try:
                subprocess.run(ffmpeg_command, capture_output=False, text=True, check=True)
            except (subprocess.CalledProcessError, OSError):
                # Non-zero exit code, or ffmpeg could not be started at all:
                # best effort, keep the unfiltered reference audio.
                print("Error: failed filtering, use original microphone input")
            else:
                speaker_wav = out_filename
                print("Filtered microphone input")

        # gets condition latents from the model
        # returns tuple (gpt_cond_latent, speaker_embedding)
        latent_map[chatbot_voice] = xtts_model.get_conditioning_latents(audio_path=speaker_wav)
    return latent_map[chatbot_voice]
|
53 |
+
|
54 |
+
|
55 |
+
def detect_language(prompt, xtts_supported_languages=None):
    """Pick the language code to synthesize ``prompt`` with.

    Prompts of 15 characters or fewer are not classified and always map to
    ``"en"``. Longer prompts are classified with ``langid``; ``"zh"`` is
    renamed to ``"zh-cn"``, and any prediction outside the supported list
    falls back to ``"en"`` after a console message and a Gradio warning.

    Args:
        prompt: Text whose language should be detected.
        xtts_supported_languages: Accepted language codes; a built-in list is
            used when ``None``.

    Returns:
        The language code to use.
    """
    supported = xtts_supported_languages
    if supported is None:
        supported = ["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn","ja"]

    # Hard to detect language fast in a short sentence, use english default
    if len(prompt) <= 15:
        print("Language: Prompt is short or autodetect language disabled using english for xtts")
        return "en"

    # Fast language autodetection; strip because the label can carry a trailing space
    language_predicted = langid.classify(prompt)[0].strip()
    if language_predicted == "zh":
        # xtts names this language zh-cn
        language_predicted = "zh-cn"

    if language_predicted in supported:
        language = language_predicted
    else:
        print(f"Detected a language not supported by xtts :{language_predicted}, switching to english for now")
        gr.Warning(f"Language detected '{language_predicted}' can not be spoken properly 'yet' ")
        language = "en"

    print(f"Language: Predicted sentence language:{language_predicted} , using language for xtts:{language}")
    return language
|
79 |
+
|
80 |
+
def get_voice_streaming(prompt, language, chatbot_voice, xtts_model, suffix="0"):
    """Stream synthesized speech for ``prompt`` as raw 16-bit PCM byte chunks.

    This is a generator. Each chunk produced by
    ``xtts_model.inference_stream`` is moved to the CPU, converted to a NumPy
    array, scaled to int16 and yielded as ``bytes``.

    Args:
        prompt: Text to synthesize.
        language: Language code passed to the model.
        chatbot_voice: Voice name used to look up conditioning latents.
        xtts_model: Object exposing ``inference_stream`` (and whatever
            ``get_latents`` needs).
        suffix: Unused; kept so existing callers keep working.

    Yields:
        ``bytes`` holding int16 samples for one audio chunk.

    On a ``RuntimeError`` mentioning "device-side assert" the Space is
    restarted through the Hugging Face API; any other exception raised during
    synthesis is logged and ends the stream early.
    """
    gpt_cond_latent, speaker_embedding = get_latents(chatbot_voice, xtts_model)
    try:
        chunks = xtts_model.inference_stream(
            prompt,
            language,
            gpt_cond_latent,
            speaker_embedding,
            repetition_penalty=7.0,
            temperature=0.85,
        )

        for chunk in chunks:
            # directly return chunk as bytes for streaming
            samples = chunk.detach().cpu().numpy().squeeze()
            # Clamp before scaling so out-of-range floats cannot wrap around
            # when cast to int16.
            samples = np.clip(samples, -1.0, 1.0)
            yield (samples * 32767).astype(np.int16).tobytes()

    except RuntimeError as e:
        if "device-side assert" in str(e):
            # cannot do anything on cuda device side error, need to restart
            print(
                f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
                flush=True,
            )
            gr.Warning("Unhandled Exception encounter, please retry in a minute")
            print("Cuda device-assert Runtime encountered need restart")

            # HF Space specific.. This error is unrecoverable need to restart space
            api.restart_space(repo_id=REPO_ID)
        else:
            # Does not require a user-facing warning: happens on empty chunk and at end
            print("RuntimeError: non device-side assert error:", str(e))
        return
    except Exception as e:
        # End the stream on any other failure, but leave a trace in the log.
        # GeneratorExit / KeyboardInterrupt are deliberately not caught here.
        print("Unhandled exception while streaming voice:", str(e))
        return
|
132 |
+
|
133 |
+
def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
    """Build an in-memory WAV: a wave header followed by ``frame_input``.

    Meant to be the first piece of a streamed wav file; later pieces should
    be sent without a header, otherwise artifacts are heard at each chunk
    start.

    Args:
        frame_input: Raw audio frames appended after the header.
        channels: Number of audio channels.
        sample_width: Bytes per sample.
        sample_rate: Frames per second.

    Returns:
        The header plus frames as ``bytes``.
    """
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as writer:
        writer.setnchannels(channels)
        writer.setsampwidth(sample_width)
        writer.setframerate(sample_rate)
        writer.writeframes(frame_input)

    # Everything written so far, independent of the current stream position
    return buffer.getvalue()
|
146 |
+
|
147 |
+
def format_prompt(message, history):
    """Assemble the full ``[INST]``-tagged prompt for the interviewer model.

    The prompt starts with the fixed interviewer system message, followed by
    every ``(user_prompt, bot_response)`` pair from ``history`` and finally
    the new ``message``.

    Args:
        message: Latest user message; an empty string is replaced by "Hello".
        history: Iterable of ``(user_prompt, bot_response)`` pairs. A pair
            whose ``user_prompt`` is ``None`` contributes only its response.

    Returns:
        The prompt string.
    """
    # NOTE: the doubled braces make the text end with a literal "{context}".
    system_message = f"""
You are Interviewer, Your task is to conduct interviews. Remember, you are the interviewer, not the candidate.

Rules:

-Set a counter for the number of questions asked: num_questions = 0
-After asking each question, increment the counter: num_questions += 1
If num_questions >= 6:
You may ask additional questions as long as num_questions <= 11
If num_questions > 11:
Do not ask any further questions
-You should ask one question at a time and wait for the applicant's response before asking the next question.

-Your questions should be short and precise, including a mix of behavioral, technical, and scenario-based inquiries relevant to the job.
-If the applicant's response does not directly address the question asked or if they are not engaging, you should politely say: "Thank you for your response. However, I would appreciate if you could more directly address [restate the original question]."

-If the applicant consistently fails to provide appropriate responses after redirection, you may end the interview early by saying: "Thank you for your time, but I don't believe we'll be able to continue this interview productively."
-When concluding, ask: "Before we wrap up, is there anything else you'd like to share or any questions you have for me?" Listen to their final thoughts or questions.
-Thank the applicant again for their time and participation, appreciate their engagement, and wish them the best in their career pursuits.
-Based on the chat history, you will evaluate the applicant using the following format:
Summarization: [Summarize the conversation objectively in a short paragraph, noting if redirection was required.]

Strengths: [Highlight the applicant's strengths demonstrated across behavioral, technical, and scenario-based responses.]

Areas for Improvement: [Suggest areas where the applicant could further develop skills or knowledge, across different categories. If responses were consistently off-topic, note this.]

Score: [Provide a score out of 10 based on the applicant's overall fit for the role.]

Send the summarization to the applicant after concluding the interview.
Additional Guidelines:

-Maintain a professional and unbiased tone throughout.
-Ask open-ended questions and encourage the applicant to provide detailed responses.
-Avoid referring to the applicant as "candidate."
{{context}}
"""
    segments = ["<s>[INST]", system_message, "[/INST]"]

    # Replay earlier turns in order
    for user_prompt, bot_response in history:
        if user_prompt is not None:
            segments.append(f"[INST] {user_prompt} [/INST]")
        segments.append(f" {bot_response}</s> ")

    # An empty message is sent as a greeting
    final_message = "Hello" if message == "" else message
    segments.append(f"[INST] {final_message} [/INST]")
    return "".join(segments)
|
196 |
+
|
197 |
+
def generate_llm_output(
    prompt,
    history,
    llm,
    temperature=0.8,
    max_tokens=256,
    top_p=0.95,
    stop_words=None,
):
    """Stream the LLM reply for ``prompt`` as a growing string.

    This is a generator: after every streamed piece it yields the whole
    output accumulated so far (not just the new piece).

    Args:
        prompt: Latest user message.
        history: Earlier ``(user, bot)`` turns, passed to ``format_prompt``.
        llm: Callable invoked as ``llm(text, temperature=..., max_tokens=...,
            top_p=..., stop=..., stream=True)`` that returns an iterable of
            responses indexable as ``response["choices"][0]["text"]``.
        temperature: Sampling temperature; values below 0.01 are raised to 0.01.
        max_tokens: Generation length limit forwarded to ``llm``.
        top_p: Nucleus sampling value forwarded to ``llm``.
        stop_words: Stop strings forwarded to ``llm``; defaults to
            ``["<s>", "[/INST]", "</s>"]`` when ``None``.

    Yields:
        The accumulated output text. If generation fails, a Gradio warning is
        raised and a fixed apology sentence is yielded so the consumer still
        receives something to show or speak.
    """
    if stop_words is None:
        # Fresh list per call instead of a shared mutable default
        stop_words = ["<s>", "[/INST]", "</s>"]

    temperature = max(float(temperature), 1e-2)
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        stop=stop_words,
    )
    formatted_prompt = format_prompt(prompt, history)
    try:
        print("LLM Input:", formatted_prompt)
        # Local GGUF
        stream = llm(
            formatted_prompt,
            **generate_kwargs,
            stream=True,
        )
        output = ""
        for response in stream:
            character = response["choices"][0]["text"]

            if character in stop_words:
                # end of context
                return

            if emoji.is_emoji(character):
                # An emoji piece tends to derail the following lines; stop here
                return

            output += character
            yield output

    except Exception as e:
        print("Unhandled Exception: ", str(e))
        gr.Warning("Unfortunately Mistral is unable to process")
        # Must be yielded: a value passed to `return` in a generator is
        # invisible to a for-loop consumer.
        yield "I do not know what happened but I could not understand you ."
|
246 |
+
|
247 |
+
def get_sentence(history, llm):
    """Yield the LLM reply sentence by sentence while it is being generated.

    This is a generator. It consumes ``generate_llm_output`` (which yields the
    cumulative reply text), keeps ``history[-1][1]`` updated in place with the
    latest text, and yields ``(sentence, history)`` each time a new complete
    sentence is available. A first sentence of 15 characters or fewer that
    ends in ".", "!" or "?" is held back and prepended to the following one,
    so that language auto-detection gets enough text.

    Args:
        history: List of ``[user, bot]`` pairs; the last pair is the turn being
            answered and is mutated. ``None`` is treated as ``[["", None]]``.
        llm: Model callable forwarded to ``generate_llm_output``.

    Yields:
        ``(sentence, history)`` tuples.
    """
    history = [["", None]] if history is None else history
    history[-1][1] = ""
    sentence_list = []
    sentence_hash_list = []

    text_to_generate = ""
    stored_sentence = None
    stored_sentence_hash = None

    for character in generate_llm_output(history[-1][0], history[:-1], llm):
        history[-1][1] = character.replace("<|assistant|>","")
        # It is coming word by word
        # NOTE(review): the trailing replace("","") is a no-op as written;
        # presumably a marker token was lost from the first argument - confirm.
        text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|assistant|>"," ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())
        if len(text_to_generate) > 1:

            # Number of tokenized sentences not emitted yet; the last one is
            # assumed to be still growing.
            dif = len(text_to_generate) - len(sentence_list)

            if dif == 1 and len(sentence_list) != 0:
                continue

            if dif == 2 and len(sentence_list) != 0 and stored_sentence is not None:
                continue

            # All this complexity due to trying append first short sentence to next one for proper language auto-detect
            if stored_sentence is not None and stored_sentence_hash is None and dif>1:
                # means we consumed stored sentence and should look at next sentence to generate
                sentence = text_to_generate[len(sentence_list)+1]
            elif stored_sentence is not None and len(text_to_generate)>2 and stored_sentence_hash is not None:
                print("Appending stored")
                sentence = stored_sentence + text_to_generate[len(sentence_list)+1]
                stored_sentence_hash = None
            else:
                sentence = text_to_generate[len(sentence_list)]

            # too short sentence just append to next one if there is any
            # this is for proper language detection
            if len(sentence)<=15 and stored_sentence_hash is None and stored_sentence is None:
                if sentence[-1] in [".","!","?"]:
                    if stored_sentence_hash != hash(sentence):
                        stored_sentence = sentence
                        stored_sentence_hash = hash(sentence)
                        print("Storing:",stored_sentence)
                        continue

            sentence_hash = hash(sentence)
            if stored_sentence_hash is not None and sentence_hash == stored_sentence_hash:
                continue

            if sentence_hash not in sentence_hash_list:
                sentence_hash_list.append(sentence_hash)
                sentence_list.append(sentence)
                print("New Sentence: ", sentence)
                yield (sentence, history)

    # return that final sentence token
    try:
        last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())[-1]
        sentence_hash = hash(last_sentence)
        if sentence_hash not in sentence_hash_list:
            if stored_sentence is not None and stored_sentence_hash is not None:
                last_sentence = stored_sentence + last_sentence
                stored_sentence = stored_sentence_hash = None
                print("Last Sentence with stored:",last_sentence)

            sentence_hash_list.append(sentence_hash)
            sentence_list.append(last_sentence)
            print("Last Sentence: ", last_sentence)

            yield (last_sentence, history)
    except Exception:
        # e.g. IndexError when the reply tokenizes to no sentences at all.
        # `Exception` (not a bare except) so that closing the generator or a
        # Ctrl-C at the yield above is not misreported as this error.
        print("ERROR on last sentence history is :", history)
|
320 |
+
|
321 |
+
# will generate speech audio file per sentence
def generate_speech_for_sentence(history, chatbot_voice, sentence, xtts_model, xtts_supported_languages=None, filter_output=True, return_as_byte=False):
    """Synthesize one sentence to audio and wrap it in a Gradio audio update.

    The sentence is first cleaned for speech (end-of-sequence tags, backtick
    code spans and parenthesised text are removed). Text of 350 characters or
    more is split with ``textwrap.wrap`` and the pieces are synthesized one
    after another into a single 24 kHz mono 16-bit stream, optionally passed
    through noise reduction.

    Args:
        history: Chat history, returned unchanged next to the audio.
        chatbot_voice: Voice name forwarded to ``get_voice_streaming``.
        sentence: Text to speak.
        xtts_model: TTS model forwarded to ``get_voice_streaming``.
        xtts_supported_languages: Forwarded to ``detect_language``.
        filter_output: Apply ``noisereduce`` to the collected audio.
        return_as_byte: Return raw PCM bytes in the update instead of writing
            a ``/tmp/<uuid>.wav`` file and returning its path.

    Returns:
        ``(history, gr.Audio.update(...))`` when speech was produced, else
        ``None`` (empty input, nothing speakable, or after a handled CUDA
        device-side assert).

    Raises:
        RuntimeError: Re-raised when it is not a CUDA device-side assert.
    """
    language = "autodetect"

    if len(sentence) == 0:
        print("EMPTY SENTENCE")
        return

    # Sometimes prompt </s> coming on output remove it
    # Some post process for speech only
    sentence = sentence.replace("</s>", "")
    # remove code from speech
    sentence = re.sub(r"```.*```", "", sentence, flags=re.DOTALL)
    sentence = re.sub(r"`.*`", "", sentence, flags=re.DOTALL)

    # remove parenthesised asides
    sentence = re.sub(r"\(.*\)", "", sentence, flags=re.DOTALL)

    sentence = sentence.replace("```", "")
    sentence = sentence.replace("...", " ")
    sentence = sentence.replace("(", " ")
    sentence = sentence.replace(")", " ")
    sentence = sentence.replace("<|assistant|>", "")

    if len(sentence) == 0:
        print("EMPTY SENTENCE after processing")
        return

    # A fast fix for the last character, which may produce weird sounds if it
    # is attached to text: put a space before sentence-ending punctuation and
    # double it.
    sentence = re.sub(r"([^\x00-\x7F]|\w)([.。?!])", r"\1 \2\2", sentence)

    print("Sentence for speech:", sentence)

    try:
        SENTENCE_SPLIT_LENGTH = 350
        if len(sentence) < SENTENCE_SPLIT_LENGTH:
            # no problem continue on
            sentence_list = [sentence]
        else:
            # Until now nltk likely split sentences properly but we need additional
            # check for longer sentence and split at last possible position
            # Do whatever necessary, first break at hyphens then spaces and then even split very long words
            sentence_list = textwrap.wrap(sentence, SENTENCE_SPLIT_LENGTH)
            print("SPLITTED LONG SENTENCE:", sentence_list)

        audio_chunks = []
        got_stream = False
        for piece in sentence_list:
            if not any(c.isalnum() for c in piece):
                # likely got a ' or " or some other text without alphanumeric in it
                continue

            if language == "autodetect":
                # on first call autodetect, next sentence calls will use same language
                language = detect_language(piece, xtts_supported_languages)

            audio_stream = get_voice_streaming(
                piece, language, chatbot_voice, xtts_model
            )
            if audio_stream is None:
                continue
            got_stream = True

            # XTTS is actually using streaming response but we are playing audio by sentence
            for chunk in audio_stream:
                # sometimes the last chunk is empty; skip it
                if chunk:
                    audio_chunks.append(chunk)

        wav_bytestream = b"".join(audio_chunks)

        # Filter output for better voice (nothing to filter when no audio came back)
        if filter_output and wav_bytestream:
            data_s16 = np.frombuffer(wav_bytestream, dtype=np.int16, count=len(wav_bytestream)//2, offset=0)
            float_data = data_s16 * 0.5**15
            reduced_noise = nr.reduce_noise(y=float_data, sr=24000, prop_decrease=0.8, n_fft=1024)
            wav_bytestream = (reduced_noise * 32767).astype(np.int16).tobytes()

        # Keyed on "any piece was synthesized" rather than on the last piece,
        # so trailing punctuation-only text does not discard collected audio.
        if got_stream:
            if not return_as_byte:
                audio_unique_filename = "/tmp/" + str(uuid.uuid4()) + ".wav"
                with wave.open(audio_unique_filename, "w") as f:
                    f.setnchannels(1)
                    # 2 bytes per sample.
                    f.setsampwidth(2)
                    f.setframerate(24000)
                    f.writeframes(wav_bytestream)

                return (history, gr.Audio.update(value=audio_unique_filename, autoplay=True))
            else:
                return (history, gr.Audio.update(value=wav_bytestream, autoplay=True))
    except RuntimeError as e:
        if "device-side assert" in str(e):
            # cannot do anything on cuda device side error, need to restart
            print(
                f"Exit due to: Unrecoverable exception caused by prompt:{sentence}",
                flush=True,
            )
            gr.Warning("Unhandled Exception encounter, please retry in a minute")
            print("Cuda device-assert Runtime encountered need restart")

            # HF Space specific.. This error is unrecoverable need to restart space
            api.restart_space(repo_id=REPO_ID)
        else:
            print("RuntimeError: non device-side assert error:", str(e))
            raise

    print("All speech ended")
    return
|