Spaces:

dhanyabahadur
/

voice-chat-with-llm

Sleeping

App Files Files Community

voice-chat-with-llm / app.py

dhanyabahadur

changes made in app.py

140467c 4 months ago

raw

history blame

7.51 kB

	import os
	import subprocess
	import sys

	# Tell the CUDA-enabled llama-cpp-python build which nvcc binary to use.
	os.environ["CUDACXX"] = "/usr/local/cuda/bin/nvcc"


	def _run_setup_command(args, extra_env=None):
	    """Run one start-up command and report (rather than hide) a failure.

	    Args:
	        args: Command and arguments as a list; executed without a shell.
	        extra_env: Optional mapping of environment variables added on top
	            of the current process environment for this command only.

	    Returns:
	        True when the command exits with status 0, False otherwise.
	    """
	    env = {**os.environ, **(extra_env or {})}
	    result = subprocess.run(args, env=env, check=False)
	    if result.returncode != 0:
	        # Stay best-effort like the original os.system calls, but leave a
	        # trace so a later ImportError can be linked to the failed step.
	        print(f"Setup command failed with exit code {result.returncode}: {' '.join(args)}")
	    return result.returncode == 0


	# sys.executable guarantees the packages land in the interpreter running this app.
	_run_setup_command([sys.executable, "-m", "unidic", "download"])
	_run_setup_command(
	    [sys.executable, "-m", "pip", "install", "llama-cpp-python==0.2.11", "--verbose"],
	    extra_env={"CMAKE_ARGS": "-DLLAMA_CUBLAS=on"},
	)

	# Third-party library imports
	from faster_whisper import WhisperModel
	import gradio as gr
	from huggingface_hub import hf_hub_download
	from llama_cpp import Llama
	from TTS.tts.configs.xtts_config import XttsConfig
	from TTS.tts.models.xtts import Xtts
	from TTS.utils.generic_utils import get_user_data_dir
	from TTS.utils.manage import ModelManager

	# Local imports
	from utils import get_sentence, generate_speech_for_sentence, wave_header_chunk

	# Speech-to-text: faster-whisper "large-v3" on the GPU with float16 compute
	print("Loading Whisper ASR")
	whisper_model = WhisperModel(
	    "large-v3",
	    device="cuda",
	    compute_type="float16",
	)

	# Chat model: download the quantized Mistral GGUF file into the working
	# directory, then load it from that location with llama.cpp
	print("Loading Mistral LLM")
	mistral_model_path = "./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
	hf_hub_download(
	    repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
	    local_dir=".",
	    filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf",
	)
	# NOTE(review): max_new_tokens and context_window are passed through as-is;
	# confirm that this llama-cpp-python version actually honours them.
	mistral_llm = Llama(
	    model_path=mistral_model_path,
	    n_gpu_layers=35,
	    max_new_tokens=256,
	    context_window=4096,
	    n_ctx=4096,
	    n_batch=128,
	    verbose=False,
	)


	# Text-to-speech: download XTTS v2 and load it onto the GPU
	print("Loading XTTS model")
	# Set before the download is requested (presumably acknowledges the Coqui
	# terms of service non-interactively; confirm against the TTS package).
	os.environ["COQUI_TOS_AGREED"] = "1"
	tts_model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
	ModelManager().download_model(tts_model_name)

	# The model directory is the model name with "/" replaced by "--",
	# located under the TTS user data directory.
	tts_model_path = os.path.join(
	    get_user_data_dir("tts"),
	    tts_model_name.replace("/", "--"),
	)
	_xtts_config_file = os.path.join(tts_model_path, "config.json")
	_xtts_checkpoint_file = os.path.join(tts_model_path, "model.pth")
	_xtts_vocab_file = os.path.join(tts_model_path, "vocab.json")

	config = XttsConfig()
	config.load_json(_xtts_config_file)

	xtts_model = Xtts.init_from_config(config)
	xtts_model.load_checkpoint(
	    config,
	    checkpoint_path=_xtts_checkpoint_file,
	    vocab_path=_xtts_vocab_file,
	    eval=True,
	    use_deepspeed=True,
	)
	xtts_model.cuda()

	###### Set up Gradio Interface ######

	with gr.Blocks(title="Voice chat with LLM") as demo:
	    DESCRIPTION = """# Voice chat with LLM"""
	    gr.Markdown(DESCRIPTION)

	    # Chat transcript, pre-seeded with the assistant's greeting
	    chatbot = gr.Chatbot(
	        value=[(None, "Hi friend, I'm Amy, an AI receptionist at dental clinic. How can I help you today?")],
	        elem_id="chatbot",
	        avatar_images=("examples/hf-logo.png", "examples/ai-chat-logo.png"),
	        bubble_full_width=False,
	    )

	    # Voice selector for the spoken replies
	    VOICES = ["female", "male"]
	    with gr.Row():
	        chatbot_voice = gr.Dropdown(
	            label="Voice of the Chatbot",
	            info="How should Chatbot talk like",
	            choices=VOICES,
	            max_choices=1,
	            value=VOICES[0],
	        )

	    # User input: a text box and a microphone recorder side by side
	    with gr.Row():
	        txt_box = gr.Textbox(
	            scale=3,
	            show_label=False,
	            placeholder="Enter text and press enter, or speak to your microphone",
	            container=False,
	            interactive=True,
	        )
	        audio_record = gr.Audio(source="microphone", type="filepath", scale=4)

	    # Output: hidden textbox receiving the current sentence, plus the
	    # streaming audio player for the generated speech
	    with gr.Row():
	        sentence = gr.Textbox(visible=False)
	        audio_playback = gr.Audio(
	            value=None,
	            label="Generated audio response",
	            streaming=True,
	            autoplay=True,
	            interactive=False,
	            show_label=True,
	        )

	    def add_text(chatbot_history, text):
	        """Append the submitted text as a new user turn.

	        Triggered on text submit; the result is then fed to generate_speech.

	        Returns:
	            The extended history and an update that clears the text box and
	            disables it until it is re-enabled after the reply.
	        """
	        chatbot_history = [] if chatbot_history is None else chatbot_history
	        chatbot_history = chatbot_history + [(text, None)]
	        return chatbot_history, gr.update(value="", interactive=False)

	    def add_audio(chatbot_history, audio):
	        """Transcribe the recorded audio and append it as a new user turn.

	        Triggered when recording stops; the result is then fed to
	        generate_speech.

	        Args:
	            chatbot_history: Current list of (user, bot) message pairs, or None.
	            audio: File path of the recording (the recorder uses type="filepath").

	        Returns:
	            The extended history and an update that clears the text box and
	            disables it until it is re-enabled after the reply.
	        """
	        chatbot_history = [] if chatbot_history is None else chatbot_history
	        segments, _ = whisper_model.transcribe(audio)
	        # Concatenate every transcribed segment: taking only the first one
	        # truncates longer utterances and raises IndexError when the
	        # recording produces no segment at all.
	        text = "".join(segment.text for segment in segments).strip()
	        print("Transcribed text:", text)
	        chatbot_history = chatbot_history + [(text, None)]
	        return chatbot_history, gr.update(value="", interactive=False)

	    def generate_speech(chatbot_history, chatbot_voice, initial_greeting=False):
	        """Stream sentences and their synthesized audio to the UI.

	        Args:
	            chatbot_history: Current list of (user, bot) message pairs.
	            chatbot_voice: Voice selected in the dropdown.
	            initial_greeting: When true, only voice the bot messages already
	                present in the history instead of querying the LLM.

	        Yields:
	            (sentence, chatbot_history, audio) tuples matching the
	            [sentence, chatbot, audio_playback] outputs.
	        """
	        # Start by yielding an initial empty audio to set up autoplay
	        yield ("", chatbot_history, wave_header_chunk())

	        def handle_speech_generation(sentence, chatbot_history, chatbot_voice):
	            """Synthesize one sentence and yield it if audio was produced."""
	            if not sentence:
	                # Nothing to speak (empty or missing bot message)
	                return
	            print("Processing sentence")
	            generated_speech = generate_speech_for_sentence(
	                chatbot_history,
	                chatbot_voice,
	                sentence,
	                xtts_model,
	                xtts_supported_languages=config.languages,
	                return_as_byte=True,
	            )
	            if generated_speech is not None:
	                _, audio_dict = generated_speech
	                yield (sentence, chatbot_history, audio_dict["value"])

	        if initial_greeting:
	            # Process only the bot messages already in the history
	            for _, sentence in chatbot_history:
	                yield from handle_speech_generation(sentence, chatbot_history, chatbot_voice)
	        else:
	            # Continuously get and process sentences from the LLM generator
	            for sentence, chatbot_history in get_sentence(chatbot_history, mistral_llm):
	                print("Inserting sentence to queue")
	                yield from handle_speech_generation(sentence, chatbot_history, chatbot_voice)

	    # Text path: record the user turn, stream the reply, then unlock the text box
	    txt_msg = txt_box.submit(
	        fn=add_text,
	        inputs=[chatbot, txt_box],
	        outputs=[chatbot, txt_box],
	        queue=False,
	    ).then(
	        fn=generate_speech,
	        inputs=[chatbot, chatbot_voice],
	        outputs=[sentence, chatbot, audio_playback],
	    )

	    txt_msg.then(fn=lambda: gr.update(interactive=True), inputs=None, outputs=[txt_box], queue=False)

	    # Voice path: transcribe the recording, stream the reply, then unlock
	    # the text box and reset the recorder
	    audio_msg = audio_record.stop_recording(
	        fn=add_audio,
	        inputs=[chatbot, audio_record],
	        outputs=[chatbot, txt_box],
	        queue=False,
	    ).then(
	        fn=generate_speech,
	        inputs=[chatbot, chatbot_voice],
	        outputs=[sentence, chatbot, audio_playback],
	    )

	    audio_msg.then(
	        fn=lambda: (gr.update(interactive=True), gr.update(interactive=True, value=None)),
	        inputs=None,
	        outputs=[txt_box, audio_record],
	        queue=False,
	    )

	    FOOTNOTE = """
	This Space demonstrates how to speak to an llm chatbot, based solely on open accessible models.
	It relies on the following models :
	- Speech to Text Model: [Faster-Whisper-large-v3](https://huggingface.co/Systran/faster-whisper-large-v3) an ASR model, to transcribe recorded audio to text.
	- Large Language Model: [Mistral-7b-instruct-v0.1-quantized](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF) a LLM to generate the chatbot responses.
	- Text to Speech Model: [XTTS-v2](https://huggingface.co/spaces/coqui/xtts) a TTS model, to generate the voice of the chatbot.
	Note:
	- Responses generated by chat model should not be assumed correct or taken serious, as this is a demonstration example only
	- iOS (Iphone/Ipad) devices may not experience voice due to autoplay being disabled on these devices by Vendor"""
	    gr.Markdown(FOOTNOTE)
	    # Voice the initial greeting as soon as the page loads
	    demo.load(
	        fn=generate_speech,
	        inputs=[chatbot, chatbot_voice, gr.State(value=True)],
	        outputs=[sentence, chatbot, audio_playback],
	    )
	demo.queue().launch(debug=True, share=True)