Ntabukiraniro committed on
Commit
cd5981c
1 Parent(s): 1acb220

Create utils.py

Files changed (1)
  1. utils.py +438 -0
utils.py ADDED
@@ -0,0 +1,438 @@
+ from __future__ import annotations
+
+ import io
+ import os
+ import re
+ import subprocess
+ import textwrap
+ import time
+ import uuid
+ import wave
+
+ import emoji
+ import gradio as gr
+ import langid
+ import nltk
+ import numpy as np
+ import noisereduce as nr
+ from huggingface_hub import HfApi
+
+ # Download the 'punkt' tokenizer for the NLTK library
+ nltk.download("punkt")
+
+ # The API is used to restart the Space on an unrecoverable error
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ REPO_ID = os.environ.get("REPO_ID")
+ api = HfApi(token=HF_TOKEN)
+
+ latent_map = {}
+
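+ # get_latents computes (gpt_cond_latent, speaker_embedding) for a voice and
+ # memoizes it in latent_map, keyed by voice name; the optional ffmpeg pass
+ # denoises and trims silence from the reference wav first.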
+ def get_latents(chatbot_voice, xtts_model, voice_cleanup=False):
+     global latent_map
+     if chatbot_voice not in latent_map:
+         speaker_wav = f"examples/{chatbot_voice}.wav"
+         if voice_cleanup:
+             try:
+                 cleanup_filter = "lowpass=8000,highpass=75,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
+                 resample_filter = "-ac 1 -ar 22050"
+                 out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"  # .wav extension lets ffmpeg infer the output format
+                 # a newer ffmpeg build is preferred, as it ships the afftdn denoise filter
+                 shell_command = f"ffmpeg -y -i {speaker_wav} -af {cleanup_filter} {resample_filter} {out_filename}".split(" ")
+                 subprocess.run(shell_command, capture_output=False, text=True, check=True)
+                 speaker_wav = out_filename
+                 print("Filtered microphone input")
+             except subprocess.CalledProcessError:
+                 # The command exited with a non-zero code; fall back to the original audio
+                 print("Error: failed filtering, using original microphone input")
+         # gets the conditioning latents from the model
+         # returns tuple (gpt_cond_latent, speaker_embedding)
+         latent_map[chatbot_voice] = xtts_model.get_conditioning_latents(audio_path=speaker_wav)
+     return latent_map[chatbot_voice]
+
+
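+ # Hedged example: detect_language("Bonjour, comment allez-vous aujourd'hui ?")
+ # should return "fr"; prompts of 15 characters or fewer fall back to "en".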
+ def detect_language(prompt, xtts_supported_languages=None):
+     if xtts_supported_languages is None:
+         xtts_supported_languages = ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]
+
+     # Fast language autodetection
+     if len(prompt) > 15:
+         language_predicted = langid.classify(prompt)[0].strip()  # strip is needed, as there is a trailing space
+         if language_predicted == "zh":
+             # we use zh-cn on xtts
+             language_predicted = "zh-cn"
+
+         if language_predicted not in xtts_supported_languages:
+             print(f"Detected a language not supported by xtts: {language_predicted}, switching to English for now")
+             gr.Warning(f"Language detected '{language_predicted}' cannot be spoken properly yet")
+             language = "en"
+         else:
+             language = language_predicted
+         print(f"Language: predicted sentence language: {language_predicted}, using language for xtts: {language}")
+     else:
+         # Hard to detect language quickly in a short sentence; default to English
+         language = "en"
+         print("Language: prompt is short or language autodetection is disabled, using English for xtts")
+
+     return language
+
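+ # get_voice_streaming yields raw 16-bit PCM chunks (24 kHz mono) as bytes;
+ # prepend wave_header_chunk() (defined below) to obtain a playable WAV stream.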
+ def get_voice_streaming(prompt, language, chatbot_voice, xtts_model, suffix="0"):
+     gpt_cond_latent, speaker_embedding = get_latents(chatbot_voice, xtts_model)
+     try:
+         t0 = time.time()
+         chunks = xtts_model.inference_stream(
+             prompt,
+             language,
+             gpt_cond_latent,
+             speaker_embedding,
+             repetition_penalty=7.0,
+             temperature=0.85,
+         )
+
+         first_chunk = True
+         for i, chunk in enumerate(chunks):
+             if first_chunk:
+                 first_chunk_time = time.time() - t0
+                 metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
+                 first_chunk = False
+             # print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+
+             # In case output is required to be multiple voice files
+             # out_file = f'{char}_{i}.wav'
+             # write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
+             # audio = AudioSegment.from_file(out_file)
+             # audio.export(out_file, format='wav')
+             # return out_file
+             # directly return the chunk as bytes for streaming
+             chunk = chunk.detach().cpu().numpy().squeeze()
+             chunk = (chunk * 32767).astype(np.int16)
+             yield chunk.tobytes()
+
+     except RuntimeError as e:
+         if "device-side assert" in str(e):
+             # nothing can be done about a CUDA device-side assert; a restart is needed
+             print(
+                 f"Exit due to: unrecoverable exception caused by prompt: {prompt}",
+                 flush=True,
+             )
+             gr.Warning("Unhandled exception encountered, please retry in a minute")
+             print("CUDA device-side assert encountered, restart needed")
+
+             # HF Space specific: this error is unrecoverable, the Space must be restarted
+             api.restart_space(repo_id=REPO_ID)
+         else:
+             print("RuntimeError: non device-side assert error:", str(e))
+             # No warning required; this happens on an empty chunk and at the end
+             # gr.Warning("Unhandled exception encountered, please retry in a minute")
+             return None
+         return None
+     except Exception:
+         return None
+
+ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
+     # Creates a WAV header, then appends the frame input.
+     # The header must come first in a streaming wav file; subsequent frames
+     # should not include it, or you will hear artifacts at the start of each chunk.
+     wav_buf = io.BytesIO()
+     with wave.open(wav_buf, "wb") as vfout:
+         vfout.setnchannels(channels)
+         vfout.setsampwidth(sample_width)
+         vfout.setframerate(sample_rate)
+         vfout.writeframes(frame_input)
+
+     wav_buf.seek(0)
+     return wav_buf.read()
+
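+ # Hedged usage sketch (assumes an initialized `xtts_model` and a voice wav under
+ # examples/): a streaming WAV is the header followed by raw PCM chunks, e.g.
+ #   stream = wave_header_chunk()
+ #   for chunk in get_voice_streaming("Hello there", "en", "female", xtts_model):
+ #       stream += chunk
+ # ("female" is a hypothetical voice name mapping to examples/female.wav.)
+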
+ def format_prompt(message, history):
+     system_message = f"""
+     You are the Interviewer. Your task is to conduct interviews. Remember, you are the interviewer, not the candidate.
+
+     Rules:
+
+     -Set a counter for the number of questions asked: num_questions = 0
+     -After asking each question, increment the counter: num_questions += 1
+     -If num_questions >= 6:
+         You may ask additional questions as long as num_questions <= 11
+     -If num_questions > 11:
+         Do not ask any further questions
+     -You should ask one question at a time and wait for the applicant's response before asking the next question.
+
+     -Your questions should be short and precise, including a mix of behavioral, technical, and scenario-based inquiries relevant to the job.
+     -If the applicant's response does not directly address the question asked or if they are not engaging, you should politely say: "Thank you for your response. However, I would appreciate it if you could more directly address [restate the original question]."
+
+     -If the applicant consistently fails to provide appropriate responses after redirection, you may end the interview early by saying: "Thank you for your time, but I don't believe we'll be able to continue this interview productively."
+     -When concluding, ask: "Before we wrap up, is there anything else you'd like to share or any questions you have for me?" Listen to their final thoughts or questions.
+     -Thank the applicant again for their time and participation, appreciate their engagement, and wish them the best in their career pursuits.
+     -Based on the chat history, you will evaluate the applicant using the following format:
+     Summarization: [Summarize the conversation objectively in a short paragraph, noting if redirection was required.]
+
+     Strengths: [Highlight the applicant's strengths demonstrated across behavioral, technical, and scenario-based responses.]
+
+     Areas for Improvement: [Suggest areas where the applicant could further develop skills or knowledge, across different categories. If responses were consistently off-topic, note this.]
+
+     Score: [Provide a score out of 10 based on the applicant's overall fit for the role.]
+
+     Send the summarization to the applicant after concluding the interview.
+     Additional Guidelines:
+
+     -Maintain a professional and unbiased tone throughout.
+     -Ask open-ended questions and encourage the applicant to provide detailed responses.
+     -Avoid referring to the applicant as "candidate."
+     {{context}}
+     """
+     prompt = "<s>[INST]" + system_message + "[/INST]"
+     for user_prompt, bot_response in history:
+         if user_prompt is not None:
+             prompt += f"[INST] {user_prompt} [/INST]"
+         prompt += f" {bot_response}</s> "
+
+     if message == "":
+         message = "Hello"
+     prompt += f"[INST] {message} [/INST]"
+     return prompt
+
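+ # Hedged example of the string format_prompt produces for one prior turn
+ # (system text abbreviated):
+ #   "<s>[INST]You are the Interviewer...[/INST][INST] Hi [/INST] Hello!</s> [INST] Tell me more [/INST]"
+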
+ def generate_llm_output(
+     prompt,
+     history,
+     llm,
+     temperature=0.8,
+     max_tokens=256,
+     top_p=0.95,
+     stop_words=["<s>", "[/INST]", "</s>"]
+ ):
+     temperature = float(temperature)
+     if temperature < 1e-2:
+         temperature = 1e-2
+     top_p = float(top_p)
+
+     generate_kwargs = dict(
+         temperature=temperature,
+         max_tokens=max_tokens,
+         top_p=top_p,
+         stop=stop_words
+     )
+     formatted_prompt = format_prompt(prompt, history)
+     try:
+         print("LLM Input:", formatted_prompt)
+         # Local GGUF
+         stream = llm(
+             formatted_prompt,
+             **generate_kwargs,
+             stream=True,
+         )
+         output = ""
+         for response in stream:
+             character = response["choices"][0]["text"]
+
+             if character in stop_words:
+                 # end of context
+                 return
+
+             if emoji.is_emoji(character):
+                 # a stray emoji token carries no meaning and derails the chat on following lines
+                 return
+
+             output += response["choices"][0]["text"]
+             yield output
+
+     except Exception as e:
+         print("Unhandled exception:", str(e))
+         gr.Warning("Unfortunately Mistral is unable to process")
+         # yield, not return: a plain `return` inside a generator would swallow the fallback message
+         yield "I do not know what happened, but I could not understand you."
+
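+ # Hedged usage sketch: generate_llm_output yields the cumulative response text,
+ # so a UI can render partial output as it streams in:
+ #   for partial in generate_llm_output("Hi", [], llm):
+ #       print(partial)
+
+ # get_sentence buffers that stream and yields one complete sentence at a time
+ # (with the updated history), deduplicating via hashes so no sentence is spoken twice.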
+ def get_sentence(history, llm):
+     history = [["", None]] if history is None else history
+     history[-1][1] = ""
+     sentence_list = []
+     sentence_hash_list = []
+
+     text_to_generate = ""
+     stored_sentence = None
+     stored_sentence_hash = None
+
+     for character in generate_llm_output(history[-1][0], history[:-1], llm):
+         history[-1][1] = character.replace("<|assistant|>", "")
+         # The output arrives word by word
+         text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|assistant|>", " ").replace("<|ass>", "").replace("[/ASST]", "").replace("[/ASSI]", "").replace("[/ASS]", "").strip())
+         if len(text_to_generate) > 1:
+
+             dif = len(text_to_generate) - len(sentence_list)
+
+             if dif == 1 and len(sentence_list) != 0:
+                 continue
+
+             if dif == 2 and len(sentence_list) != 0 and stored_sentence is not None:
+                 continue
+
+             # All this complexity comes from appending a first short sentence to the
+             # next one, so that language autodetection works properly
+             if stored_sentence is not None and stored_sentence_hash is None and dif > 1:
+                 # means we consumed the stored sentence and should look at the next sentence to generate
+                 sentence = text_to_generate[len(sentence_list) + 1]
+             elif stored_sentence is not None and len(text_to_generate) > 2 and stored_sentence_hash is not None:
+                 print("Appending stored")
+                 sentence = stored_sentence + text_to_generate[len(sentence_list) + 1]
+                 stored_sentence_hash = None
+             else:
+                 sentence = text_to_generate[len(sentence_list)]
+
+             # a too-short sentence is just appended to the next one, if there is any;
+             # this is needed for proper language detection
+             if len(sentence) <= 15 and stored_sentence_hash is None and stored_sentence is None:
+                 if sentence[-1] in [".", "!", "?"]:
+                     if stored_sentence_hash != hash(sentence):
+                         stored_sentence = sentence
+                         stored_sentence_hash = hash(sentence)
+                         print("Storing:", stored_sentence)
+                         continue
+
+             sentence_hash = hash(sentence)
+             if stored_sentence_hash is not None and sentence_hash == stored_sentence_hash:
+                 continue
+
+             if sentence_hash not in sentence_hash_list:
+                 sentence_hash_list.append(sentence_hash)
+                 sentence_list.append(sentence)
+                 print("New sentence:", sentence)
+                 yield (sentence, history)
+
+     # yield the final sentence
+     try:
+         last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|ass>", "").replace("[/ASST]", "").replace("[/ASSI]", "").replace("[/ASS]", "").strip())[-1]
+         sentence_hash = hash(last_sentence)
+         if sentence_hash not in sentence_hash_list:
+             if stored_sentence is not None and stored_sentence_hash is not None:
+                 last_sentence = stored_sentence + last_sentence
+                 stored_sentence = stored_sentence_hash = None
+                 print("Last sentence with stored:", last_sentence)
+
+             sentence_hash_list.append(sentence_hash)
+             sentence_list.append(last_sentence)
+             print("Last sentence:", last_sentence)
+
+             yield (last_sentence, history)
+     except Exception:
+         print("ERROR on last sentence; history is:", history)
+
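+ # Hedged end-to-end sketch tying the pieces together:
+ #   for sentence, updated_history in get_sentence(history, llm):
+ #       generate_speech_for_sentence(updated_history, "female", sentence, xtts_model)
+ # ("female" is again a hypothetical voice name; any wav under examples/ works.)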
+ # generates a speech audio file per sentence
+ def generate_speech_for_sentence(history, chatbot_voice, sentence, xtts_model, xtts_supported_languages=None, filter_output=True, return_as_byte=False):
+     language = "autodetect"
+
+     wav_bytestream = b""
+
+     if len(sentence) == 0:
+         print("EMPTY SENTENCE")
+         return
+
+     # Sometimes a prompt </s> leaks into the output; remove it
+     # Some post-processing for speech only
+     sentence = sentence.replace("</s>", "")
+     # remove code from speech
+     sentence = re.sub(r"```.*```", "", sentence, flags=re.DOTALL)
+     sentence = re.sub(r"`.*`", "", sentence, flags=re.DOTALL)
+
+     sentence = re.sub(r"\(.*\)", "", sentence, flags=re.DOTALL)
+
+     sentence = sentence.replace("```", "")
+     sentence = sentence.replace("...", " ")
+     sentence = sentence.replace("(", " ")
+     sentence = sentence.replace(")", " ")
+     sentence = sentence.replace("<|assistant|>", "")
+
+     if len(sentence) == 0:
+         print("EMPTY SENTENCE after processing")
+         return
+
+     # A fast fix for the last character; may produce weird sounds if it is within text
+     # if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
+     #     # just add a space
+     #     sentence = sentence[:-1] + " " + sentence[-1]
+
+     # the regex does the job well: double the final punctuation and pad it with a space
+     sentence = re.sub(r"([^\x00-\x7F]|\w)([.。?!])", r"\1 \2\2", sentence)
+
+     print("Sentence for speech:", sentence)
+
+     try:
+         SENTENCE_SPLIT_LENGTH = 350
+         if len(sentence) < SENTENCE_SPLIT_LENGTH:
+             # no problem, continue on
+             sentence_list = [sentence]
+         else:
+             # nltk has likely split sentences properly up to this point, but longer
+             # sentences need an additional check, splitting at the last possible position:
+             # first break at hyphens, then at spaces, and then even split very long words
+             sentence_list = textwrap.wrap(sentence, SENTENCE_SPLIT_LENGTH)
+             print("SPLIT LONG SENTENCE:", sentence_list)
+
+         for sentence in sentence_list:
+
+             if any(c.isalnum() for c in sentence):
+                 if language == "autodetect":
+                     # autodetect on the first call; subsequent sentence calls reuse the same language
+                     language = detect_language(sentence, xtts_supported_languages)
+
+                 # at least one alphanumeric (utf-8) character exists
+                 audio_stream = get_voice_streaming(
+                     sentence, language, chatbot_voice, xtts_model
+                 )
+             else:
+                 # likely got a ' or " or some other text without any alphanumeric in it
+                 audio_stream = None
+
+             # XTTS actually streams its response, but we play audio sentence by sentence
+             # For direct XTTS voice streaming (sending each chunk to the voice output), set the DIRECT_STREAM=1 environment variable
+             if audio_stream is not None:
+                 frame_length = 0
+                 for chunk in audio_stream:
+                     try:
+                         wav_bytestream += chunk
+                         frame_length += len(chunk)
+                     except Exception:
+                         # hack to continue playing; sometimes the last chunk is empty, this will be fixed on the next TTS call
+                         continue
+
+         # Filter output for a better voice
+         if filter_output:
+             data_s16 = np.frombuffer(wav_bytestream, dtype=np.int16, count=len(wav_bytestream) // 2, offset=0)
+             float_data = data_s16 * 0.5**15
+             reduced_noise = nr.reduce_noise(y=float_data, sr=24000, prop_decrease=0.8, n_fft=1024)
+             wav_bytestream = (reduced_noise * 32767).astype(np.int16)
+             wav_bytestream = wav_bytestream.tobytes()
+
+         if audio_stream is not None:
+             if not return_as_byte:
+                 audio_unique_filename = "/tmp/" + str(uuid.uuid4()) + ".wav"
+                 with wave.open(audio_unique_filename, "w") as f:
+                     f.setnchannels(1)
+                     # 2 bytes per sample
+                     f.setsampwidth(2)
+                     f.setframerate(24000)
+                     f.writeframes(wav_bytestream)
+
+                 return (history, gr.Audio.update(value=audio_unique_filename, autoplay=True))
+             else:
+                 return (history, gr.Audio.update(value=wav_bytestream, autoplay=True))
+     except RuntimeError as e:
+         if "device-side assert" in str(e):
+             # nothing can be done about a CUDA device-side assert; a restart is needed
+             print(
+                 f"Exit due to: unrecoverable exception caused by prompt: {sentence}",
+                 flush=True,
+             )
+             gr.Warning("Unhandled exception encountered, please retry in a minute")
+             print("CUDA device-side assert encountered, restart needed")
+
+             # HF Space specific: this error is unrecoverable, the Space must be restarted
+             api.restart_space(repo_id=REPO_ID)
+         else:
+             print("RuntimeError: non device-side assert error:", str(e))
+             raise e
+
+     print("All speech ended")
+     return