import speech_recognition as sr from gtts import gTTS import gradio as gr from io import BytesIO import numpy as np from dataclasses import dataclass, field import time from pydub import AudioSegment import librosa from utils.vad import get_speech_timestamps, collect_chunks, VadOptions from PIL import Image from ClassPrompt import PromptClass import render creator_prompt = PromptClass() r = sr.Recognizer() @dataclass class AppState: stream: np.ndarray | None = None sampling_rate: int = 0 pause_detected: bool = False started_talking: bool = False stopped: bool = False history: list = field(default_factory=list) typing: bool = False painting:bool = False image_out:Image.Image = None image_in:Image = None conversation:list = field(default_factory=list) recording: bool = False # Thêm thuộc tính recording pause_threshold: float = 1 # Thêm thuộc tính pause_threshold def run_vad(ori_audio, sr): _st = time.time() try: audio = ori_audio audio = audio.astype(np.float32) / 32768.0 sampling_rate = 16000 audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate) vad_parameters = {} vad_parameters = VadOptions(**vad_parameters) speech_chunks = get_speech_timestamps(audio, vad_parameters) audio = collect_chunks(audio, speech_chunks) duration_after_vad = audio.shape[0] / sampling_rate # Khai báo và tính toán duration_after_vad vad_audio = audio vad_audio = np.round(vad_audio * 32768.0).astype(np.int16) vad_audio_bytes = vad_audio.tobytes() return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4) except Exception as e: return -1, ori_audio, round(time.time() - _st, 4) def determine_pause(audio:np.ndarray,sampling_rate:int,state:AppState) -> bool: """Phát hiện tạm dừng trong âm thanh.""" temp_audio = audio dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate) duration = len(audio) / sampling_rate if dur_vad > 0.5 and not state.started_talking: print("started talking") state.started_talking = True return False print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s") return (duration - dur_vad) > state.pause_threshold # Sử dụng state.pause_threshold def process_audio(audio:tuple,state:AppState,image:Image): if state.recording: # Kiểm tra state.stream: if state.stream: state.stream = np.concatenate((state.stream, audio[1])) else: state.stream = audio[1] state.sampling_rate = audio[0] state.image_in=image pause_detected = determine_pause(state.stream, state.sampling_rate, state) state.pause_detected = pause_detected if state.pause_detected and state.started_talking: state.started_talking = False state.recording = False return state, gr.Audio(recording=False) return state, None def transcribe_audio(audio_segment): audio_buffer = BytesIO() audio_segment.export(audio_buffer, format="wav") audio_buffer.seek(0) try: with sr.AudioFile(audio_buffer) as source: r.adjust_for_ambient_noise(source) text = r.recognize_google(r.record(source), language='vi') return text except sr.UnknownValueError: print("Could not understand audio.") except sr.RequestError as e: print(f"Could not request results from Google Speech Recognition service; {e}") return "" def chat_with_onlinemodel(user_input, state:AppState): state.history.append({"role": "user", "content": user_input}) response = creator_prompt.chat(provider="SambaNova", model="Meta-Llama-3.1-405B-Instruct", input_text=state.history) bot_response = response characters = bot_response.replace("*","") state.history.append({"role": "assistant", "content": characters}) state.conversation.append({"role": "user", "content":"Bạn: " + user_input}) state.conversation.append({"role": "assistant", "content":"Bot: " + characters}) return characters, state def synthesize_speech(text): """Chuyển đổi text sang giọng nói bằng gTTS.""" try: mp3 = gTTS(text, tld='com.vn', lang='vi', slow=False) mp3_fp = BytesIO() mp3.write_to_fp(mp3_fp) audio_bytes = mp3_fp.getvalue() mp3_fp.close() return audio_bytes # Chỉ trả về audio_bytes except Exception as e: print(f"Lỗi tổng hợp giọng nói: {e}") return None def response_audio(state:AppState): """Xử lý yêu cầu và tạo phản hồi.""" if not state.pause_detected and not state.started_talking: return state, None textin="" audio_segment = AudioSegment( state.stream.tobytes(), frame_rate=state.sampling_rate, sample_width=state.stream.dtype.itemsize, channels=1 if state.stream.ndim == 1 else state.stream.shape[1] ) textin = transcribe_audio(audio_segment) state.stream = None if state.typing is False: txt,state = chuyen_trangthai(textin, state) if txt == True: return state, synthesize_speech("chuyển sang trạng thái dùng bàn phím") if textin != "": paint=state.painting state.painting = text_check(textin, state.painting) if paint != state.painting: return state, synthesize_speech("Đã chuyển sang chế độ " + ("vẽ" if state.painting else "nói chuyện")) if state.painting is True: promptx = prompt_hugingface(textin,"Hugging Face","Qwen/Qwen2.5-72B-Instruct","Medium") if state.image_in: img=resize(state.image_in) else: img=None state.image_out = render.generate_images(textin, img) audio_bytes = synthesize_speech("Bạn thấy tôi vẽ "+textin+" có đẹp không") return state, audio_bytes else: print("Đang nghĩ...") text_out, state = chat_with_onlinemodel(textin,state) audio_bytes = synthesize_speech(text_out) return state, audio_bytes else: return state, synthesize_speech("Tôi nghe không rõ") # Trả về thông báo lỗi nếu synthesize_speech thất bại def response_text(state:AppState,textin,image:Image, prompt, progress=gr.Progress(track_tqdm=True)): """Xử lý yêu cầu và tạo phản hồi.""" #state.recording = False # Dừng ghi âm if state.typing is True: txt,state = chuyen_trangthai(textin, state) if txt == False: return state, synthesize_speech("chuyển sang trạng thái nói") if textin != "": paint=state.painting state.painting = text_check(textin, state.painting) if paint != state.painting: return state, synthesize_speech("Đã chuyển sang chế độ " + ("vẽ" if state.painting else "nói chuyện")) if state.painting is True: state.conversation.append({"role": "user", "content":"Bạn: " + textin}) #state.image_out = generate_image(textin, image, streng, ckpt,guidance) if image: img=resize(image) else: img=None image_out = render.generate_images(textin, img) state.image_out = image_out audio_bytes = synthesize_speech("Bạn thấy tôi vẽ "+prompt+" có đẹp không") return state, audio_bytes else: print("Đang nghĩ...") text_out, state = chat_with_onlinemodel(textin,state=state) audio_bytes = synthesize_speech(text_out) return state, audio_bytes else: return state, synthesize_speech("Hãy gõ nội dung") # Trả về thông báo lỗi nếu synthesize_speech thất bại def text_check(textin, painting): if not painting: return "sang chế độ vẽ" in textin return "sang chế độ nói" not in textin def chuyen_trangthai(textin, state:AppState): if "muốn nói chuyện" in textin: state.started_talking = False state.recording = True state.stopped=False state.typing = False return False, state elif "dùng bàn phím" in textin: state.started_talking = False state.recording = False state.stopped=True state.typing = True return True, state else: return state.typing, state def start_recording_user(state:AppState,progress=gr.Progress(track_tqdm=True)): # Sửa lỗi tại đây state.stopped = False # Cho phép bắt đầu ghi âm lại nếu đang ở trạng thái recording state.started_talking = False state.recording = True return gr.Audio(recording=True), state def restart_recording(state:AppState): # Sửa lỗi tại đây if not state.stopped: # Cho phép bắt đầu ghi âm lại nếu đang ở trạng thái recording state.started_talking = False state.recording = True return gr.Audio(recording=True), state else: state.started_talking = False state.recording = False return gr.Audio(recording=False), state def prompt_hugingface(prompt,llm_provider,model,type): result = creator_prompt.generate( input_text=prompt, long_talk=True, compress=True, compression_level="hard", poster=False, prompt_type=type, # Use the updated prompt_type here custom_base_prompt="", provider=llm_provider, model=model ) output = result return output def resize(img:Image.Image): height = (img.height // 8) * 8 width = (img.width // 8) * 8 imgre = img.resize((width,height)) return imgre loaded = "" steps = 50 def update_model_choices(provider): provider_models = { "Hugging Face": [ "Qwen/Qwen2.5-72B-Instruct", "meta-llama/Meta-Llama-3.1-70B-Instruct", "mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.3" ], "SambaNova": [ "Meta-Llama-3.1-70B-Instruct", "Meta-Llama-3.1-405B-Instruct", "Meta-Llama-3.1-8B-Instruct" ], } models = provider_models.get(provider, []) return gr.Dropdown(choices=models, value=models[0] if models else "") prompt_types = ["Long", "Short", "Medium", "OnlyObjects", "NoFigure", "Landscape", "Fantasy"] title = "Chat tiếng việt by tuphamkts" description = "Muốn vẽ nói: Chuyển sang chế độ vẽ. Muốn chat nói: Chuyển sang chế độ nói. Chế độ gõ: Tôi muốn dùng bàn phím, chế độ nói: Tôi muốn nói chuyện. Ghi chú: Chỉ dừng chương trình khi tôi đang nói (lịch sử chat sẽ bị xóa khi dừng chương trình)." examples = ["Chuyển sang chế độ vẽ","Chuyển sang chế độ nói"] with gr.Blocks(title=title) as demo: gr.HTML(f"
{description}