Spaces:

phamngoctukts
/

assistant

Runtime error

App Files Files Community

assistant / Chatweb.py

phamngoctukts

Upload 6 files

39a832e verified 3 months ago

raw

history blame

5.68 kB

	import speech_recognition as sr
	import ollama
	from gtts import gTTS
	import gradio as gr
	from io import BytesIO
	import numpy as np
	from dataclasses import dataclass, field
	import time
	import traceback
	from pydub import AudioSegment
	import librosa
	from utils.vad import get_speech_timestamps, collect_chunks, VadOptions

	# Module-level speech_recognition recognizer; response() uses it to read the
	# recorded WAV buffer and to call Google speech recognition.
	r = sr.Recognizer()

	@dataclass
	class AppState:
	stream: np.ndarray \| None = None
	sampling_rate: int = 0
	pause_detected: bool = False
	started_talking: bool = False
	stopped: bool = False
	conversation: list = field(default_factory=list)

	def run_vad(ori_audio, sr):
	    """Remove non-speech from PCM audio with the project's VAD helpers.

	    Args:
	        ori_audio: NumPy array of samples. It is scaled by 1/32768, so
	            int16-range values are assumed.
	        sr: Sampling rate of ``ori_audio`` in Hz. Inside this function the
	            name shadows the module-level ``speech_recognition`` alias; it
	            is kept unchanged for caller compatibility.

	    Returns:
	        A tuple ``(duration_after_vad, vad_audio_bytes, elapsed)``:
	        seconds of audio kept by the VAD, the kept audio as int16 bytes at
	        the original sampling rate, and the processing time in seconds.
	        On any failure the tuple is ``(-1, ori_audio, elapsed)`` -- note
	        that the second element is then the untouched input, not bytes.
	    """
	    _st = time.time()
	    vad_rate = 16000  # the audio is resampled to this rate before running the VAD
	    try:
	        audio = ori_audio.astype(np.float32) / 32768.0
	        if sr != vad_rate:
	            audio = librosa.resample(audio, orig_sr=sr, target_sr=vad_rate)
	        vad_parameters = VadOptions()
	        speech_chunks = get_speech_timestamps(audio, vad_parameters)
	        audio = collect_chunks(audio, speech_chunks)
	        duration_after_vad = audio.shape[0] / vad_rate
	        if sr != vad_rate:
	            # resample to original sampling rate
	            vad_audio = librosa.resample(audio, orig_sr=vad_rate, target_sr=sr)
	        else:
	            vad_audio = audio
	        # Clip before the cast: a full-scale sample (1.0 * 32768) would
	        # otherwise wrap around to -32768 in int16.
	        vad_audio = np.clip(np.round(vad_audio * 32768.0), -32768, 32767).astype(np.int16)
	        vad_audio_bytes = vad_audio.tobytes()
	        return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
	    except Exception:
	        # The handler itself must never raise: guard against a zero rate and
	        # against inputs without a length. ori_audio holds samples (not
	        # bytes), so its duration is len / sr.
	        n_samples = len(ori_audio) if hasattr(ori_audio, "__len__") else 0
	        audio_len = n_samples / sr if sr else 0.0
	        msg = f"[asr vad error] audio_len: {audio_len:.3f} s, trace: {traceback.format_exc()}"
	        print(msg)
	        return -1, ori_audio, round(time.time() - _st, 4)

	def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
	    """Take in the stream and determine whether a pause happened.

	    Args:
	        audio: All samples accumulated for the current utterance.
	        sampling_rate: Sampling rate of ``audio`` in Hz.
	        state: Session state; ``started_talking`` is set to True here the
	            first time the VAD reports more than 0.5 s of speech.

	    Returns:
	        True when more than one second of the buffered audio is not speech
	        according to the VAD. False on the call that first detects speech
	        and whenever the VAD failed.
	    """
	    dur_vad, _, time_vad = run_vad(audio, sampling_rate)
	    if dur_vad < 0:
	        # run_vad reports failure as -1. Feeding that into the arithmetic
	        # below would make (duration - dur_vad) > 1 true for almost any
	        # buffer and fake a pause, so treat a failed VAD as "no pause".
	        return False
	    duration = len(audio) / sampling_rate
	    if dur_vad > 0.5 and not state.started_talking:
	        print("started talking")
	        state.started_talking = True
	        return False
	    print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
	    return (duration - dur_vad) > 1

	def process_audio(audio:tuple, state:AppState):
	    """Buffer one streamed microphone chunk and check whether the user paused.

	    ``audio`` is indexed as ``(sampling_rate, samples)``. The samples are
	    appended to ``state.stream`` (the rate is stored only with the first
	    chunk), then the whole buffer is checked for a pause.

	    Returns ``(gr.Audio(recording=False), state)`` once a pause is detected
	    after speech has started, otherwise ``(None, state)``.
	    """
	    chunk = audio[1]
	    if state.stream is not None:
	        state.stream = np.concatenate((state.stream, chunk))
	    else:
	        # First chunk of the utterance: start the buffer and remember the rate.
	        state.stream = chunk
	        state.sampling_rate = audio[0]

	    state.pause_detected = determine_pause(state.stream, state.sampling_rate, state)

	    stop_now = state.pause_detected and state.started_talking
	    component_update = gr.Audio(recording=False) if stop_now else None
	    return component_update, state

	def response(state:AppState):
	    """Transcribe the recorded speech, query the LLM and yield the spoken reply.

	    This is a generator used as a Gradio event handler. It always yields
	    exactly one ``(audio, new_state)`` pair:

	    * ``audio`` is the MP3 bytes of the synthesized reply, or ``None`` when
	      there is nothing to play (no speech recorded, nothing recognized, or
	      an empty reply).
	    * ``new_state`` is a fresh ``AppState``. It carries the conversation
	      forward except when no speech was recorded at all, in which case the
	      state is reset completely.

	    Args:
	        state: Session state holding the recorded stream and the history.
	    """
	    nothing_recorded = not state.pause_detected and not state.started_talking
	    if nothing_recorded or state.stream is None:
	        # A ``return value`` inside a generator is discarded, so the reset
	        # has to be yielded to actually reach the outputs.
	        yield None, AppState()
	        return

	    # Wrap the raw samples in a WAV container for speech_recognition.
	    audio_buffer = BytesIO()
	    segment = AudioSegment(
	        state.stream.tobytes(),
	        frame_rate=state.sampling_rate,
	        sample_width=state.stream.dtype.itemsize,
	        channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
	    )
	    segment.export(audio_buffer, format="wav")

	    with sr.AudioFile(audio_buffer) as source:
	        audio_data = r.record(source)
	    try:
	        textin = r.recognize_google(audio_data, language='vi')
	    except Exception as err:
	        # Best effort: unintelligible audio (UnknownValueError) or a failed
	        # API request (RequestError) is treated as "nothing recognized",
	        # but logged instead of being swallowed silently.
	        print(f"[asr error] {err!r}")
	        textin = ""

	    state.conversation.append({"role": "user", "content": "Bạn: " + textin})

	    srr = None
	    if textin != "":
	        print("Đang nghĩ...")
	        reply = ollama.chat(model='llama3.2', messages=[
	            {
	                'role': 'user',
	                'content': textin,
	            },
	        ])
	        textout = reply['message']['content'].replace('*', '')
	        # The reply is stored with the "assistant" role so that the
	        # messages-format chatbot shows it as the assistant's turn.
	        state.conversation.append({"role": "assistant", "content": "Trợ lý: " + textout})
	        if textout != "":
	            print("Đang đọc...")
	            mp3 = gTTS(textout, tld='com.vn', lang='vi', slow=False)
	            with BytesIO() as mp3_fp:
	                mp3.write_to_fp(mp3_fp)
	                srr = mp3_fp.getvalue()

	    # Always hand back a fresh state (history kept) so the audio buffer and
	    # the talking/pause flags do not leak into the next utterance.
	    yield srr, AppState(conversation=state.conversation)

	def start_recording_user(state: AppState):
	    """Return an update that restarts microphone recording unless stopped.

	    Returns ``gr.Audio(recording=True)`` while ``state.stopped`` is false,
	    and ``None`` once the conversation has been stopped.
	    """
	    if state.stopped:
	        return None
	    return gr.Audio(recording=True)

	# NOTE(review): title and description are defined but not used anywhere in
	# the visible code.
	title = "vietnamese by tuphamkts"
	description = "A vietnamese text-to-speech demo."

	with gr.Blocks() as demo:
	    with gr.Row():
	        with gr.Column():
	            input_audio = gr.Audio(label="Nói cho tôi nghe nào", sources="microphone", type="numpy")
	        with gr.Column():
	            chatbot = gr.Chatbot(label="Nội dung trò chuyện", type="messages")
	            output_audio = gr.Audio(label="Trợ lý", autoplay=True)
	    state = gr.State(value=AppState())

	    # Streamed microphone chunks are buffered by process_audio, which also
	    # stops the recording once a pause is detected.
	    stream = input_audio.stream(
	        process_audio,
	        [input_audio, state],
	        [input_audio, state],
	        stream_every=0.50,
	        time_limit=30,
	    )
	    # When recording stops, produce the spoken reply and then refresh the
	    # chat history from the returned state.
	    respond = input_audio.stop_recording(
	        response,
	        [state],
	        [output_audio, state],
	    )
	    respond.then(lambda s: s.conversation, [state], [chatbot])

	    # After the reply audio stops, start listening again unless the
	    # conversation was stopped.
	    restart = output_audio.stop(
	        start_recording_user,
	        [state],
	        [input_audio],
	    )
	    cancel = gr.Button("Stop Conversation", variant="stop")
	    cancel.click(lambda: (AppState(stopped=True), gr.Audio(recording=False)), None,
	                 [state, input_audio], cancels=[respond, restart])
	# debug must be a real boolean: the string 'false' is non-empty and
	# therefore truthy, which silently switched debug mode on.
	demo.launch(server_name="0.0.0.0", server_port=7860, debug=False, share=True)