Spaces:
Runtime error
Runtime error
File size: 10,994 Bytes
6fa82d9 94648ab a1b2bae 13c0603 bc0e3c7 6fa82d9 94648ab 6fa82d9 b923173 6fa82d9 94648ab 281ff2d 6ce11f9 94648ab 7e16289 281ff2d 94648ab 4979540 281ff2d 94648ab 6ce11f9 4979540 97c030c 281ff2d 6fa82d9 f60cc77 6fa82d9 bc0e3c7 6fa82d9 38f8478 6fa82d9 94648ab 4979540 94648ab 6fa82d9 38f8478 6fa82d9 38f8478 6fa82d9 38f8478 6fa82d9 38f8478 6fa82d9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
'''
+----------------------+        +-------------------------+        +-------------------------------+        +-------------------------+
| Step 1: Set Up       |        | Step 2: Set Up Gradio   |        | Step 3: Speech-to-Text        |        | Step 4: Text-to-Speech  |
| Environment          |        | Interface               |        | & Language Model Processing   |        | Output                  |
+----------------------+        +-------------------------+        +-------------------------------+        +-------------------------+
|                      |        |                         |        |                               |        |                         |
| - Import Python      |        | - Define interface      |        | - Transcribe audio            |        | - XTTS model generates  |
|   libraries          |        |   components            |        |   to text using               |        |   spoken response from  |
| - Initialize models: |------->| - Configure audio and   |------->|   Faster Whisper ASR          |------->|   LLM's text response   |
|   Whisper, Mistral,  |        |   text interaction      |        | - Transcribed text            |        |                         |
|   XTTS               |        | - Launch interface      |        |   is added to                 |        |                         |
|                      |        |                         |        |   chatbot's history           |        |                         |
|                      |        |                         |        | - Mistral LLM                 |        |                         |
|                      |        |                         |        |   processes chatbot           |        |                         |
|                      |        |                         |        |   history to generate         |        |                         |
|                      |        |                         |        |   response                    |        |                         |
+----------------------+        +-------------------------+        +-------------------------------+        +-------------------------+
'''
###### Set Up Environment ######
import os
# Set CUDA environment variable and install llama-cpp-python
# llama-cpp-python is a python binding for llama.cpp library which enables LLM inference in pure C/C++
# CUDACXX must be exported before the pip build below so the cuBLAS build can locate nvcc.
os.environ["CUDACXX"] = "/usr/local/cuda/bin/nvcc"

# Runtime bootstrap commands, executed in order through the shell:
#   1. download the unidic dictionary data,
#   2. build/install llama-cpp-python 0.2.11 with cuBLAS enabled.
# os.system returns the command's exit status. It used to be discarded, so a
# failed install only showed up later as an ImportError on `llama_cpp`. The
# bootstrap stays best-effort (no exception), but failures are now reported.
for _setup_command in (
    'python -m unidic download',
    'CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11 --verbose',
):
    _exit_status = os.system(_setup_command)
    if _exit_status != 0:
        print(f"WARNING: setup command exited with status {_exit_status}: {_setup_command}")
# Third-party library imports
from faster_whisper import WhisperModel
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from TTS.api import TTS
from TTS.utils.manage import ModelManager
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
#from TTS.utils.manage import ModelManager
# Local imports
from utils import get_sentence, wave_header_chunk, generate_speech_for_sentence
# Load Whisper ASR model
# Speech-to-text engine: the faster-whisper "large-v3" checkpoint, configured to
# run on the CPU with float32 compute. add_audio() uses this object to
# transcribe microphone recordings.
print("Loading Whisper ASR")
whisper_model = WhisperModel("large-v3", device="cpu", compute_type="float32")
# Load Mistral LLM
# Fetch the quantized GGUF weights from the Hugging Face Hub into the current
# directory, then load that file through llama-cpp-python.
print("Loading Mistral LLM")
hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", local_dir=".", filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf")
# Must match the filename downloaded above (local_dir=".").
mistral_model_path="./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
# n_gpu_layers / n_ctx / n_batch are llama.cpp loading options (GPU offload,
# context size, batch size).
# NOTE(review): max_new_tokens and context_window do not look like Llama()
# constructor options; presumably they are accepted and ignored as extra
# keyword arguments. Confirm against the llama-cpp-python API reference.
mistral_llm = Llama(model_path=mistral_model_path,n_gpu_layers=35,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=False)
# Load XTTS Model
# Text-to-speech engine: Coqui XTTS v2 loaded through the high-level TTS API on
# the CPU (gpu=False). generate_speech() uses this object to synthesize audio.
print("Loading XTTS model")
# Switched to xtts_v2 because loading xtts_v1 raised a KeyError; v1 may still be
# selectable with an older release of the library.
#model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
# Presumably pre-accepts the Coqui terms-of-service prompt so the model download
# does not block waiting for input; it has to be set before TTS(...) below.
os.environ["COQUI_TOS_AGREED"] = "1"
#m = ModelManager().download_model(model_name)
##print(m)
#m = model_name
xtts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
#xtts_model = TTS(model_name, gpu=False)
#xtts_model.to("cpu") # no GPU or Amd
#tts.to("cuda") # cuda only
# --- Disabled alternative: load XTTS manually from its config/checkpoint. ---
# Kept for reference only; none of the lines below are executed.
#tts_model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
#ModelManager().download_model(tts_model_name)
#tts_model_path = os.path.join(get_user_data_dir("tts"), tts_model_name.replace("/", "--"))
#config = XttsConfig()
#config.load_json(os.path.join(tts_model_path, "config.json"))
#xtts_model = Xtts.init_from_config(config)
#xtts_model.to("cpu")
#xtts_model.load_checkpoint(
# config,
# checkpoint_path=os.path.join(tts_model_path, "model.pth"),
# vocab_path=os.path.join(tts_model_path, "vocab.json"),
# eval=True,
# use_deepspeed=True,
#)
#xtts_model.cuda()
print("Loaded XTTS model")
###### Set up Gradio Interface ######
# Declare the Gradio UI: page title, chat window, voice selector, text/microphone
# inputs and the audio player for the generated reply.
# NOTE(review): indentation was lost in this copy of the file; the statements
# below presumably sit inside this `with` block (and the gr.Row() blocks) in the
# real source. Confirm nesting before running.
with gr.Blocks(title="Voice chat with LLM") as demo:
DESCRIPTION = """# Voice chat with LLM"""
gr.Markdown(DESCRIPTION)
# Define chatbot component
# History entries are (user_message, bot_message) pairs; the initial entry has
# no user message, only the bot's greeting.
chatbot = gr.Chatbot(
value=[(None, "Hi friend, I'm Amy, an AI coach. How can I help you today?")], # Initial greeting from the chatbot
elem_id="chatbot",
avatar_images=("examples/hf-logo.png", "examples/ai-chat-logo.png"),
bubble_full_width=False,
)
# Define chatbot voice component
# The selected name is interpolated into "examples/{voice}.wav" by
# generate_speech() to pick the speaker reference clip.
VOICES = ["female", "male"]
with gr.Row():
chatbot_voice = gr.Dropdown(
label="Voice of the Chatbot",
info="How should Chatbot talk like",
choices=VOICES,
max_choices=1,
value=VOICES[0],
)
# Define text and audio record input components
with gr.Row():
txt_box = gr.Textbox(
scale=3,
show_label=False,
placeholder="Enter text and press enter, or speak to your microphone",
container=False,
interactive=True,
)
# type="filepath": handlers receive the recording as a path on disk.
audio_record = gr.Audio(sources=["microphone"], type="filepath", scale=4)
# Define generated audio playback component
with gr.Row():
# Hidden textbox: receives the sentence yielded by generate_speech().
sentence = gr.Textbox(visible=False)
# Streaming, autoplaying, read-only player fed by generate_speech().
audio_playback = gr.Audio(
value=None,
label="Generated audio response",
streaming=True,
autoplay=True,interactive=False,
show_label=True,
)
# Will be triggered on text submit (will send to generate_speech)
def add_text(chatbot_history, text):
    """Queue a typed message as a new user turn and lock the textbox.

    Args:
        chatbot_history: Existing list of (user, bot) pairs, or None.
        text: The message the user submitted.

    Returns:
        A 2-tuple: the history extended with ``(text, None)`` (the bot half is
        filled in later) and a Gradio update that clears the textbox and
        disables it while the reply is being generated.
    """
    if chatbot_history is None:
        chatbot_history = []
    pending_turn = (text, None)
    extended_history = chatbot_history + [pending_turn]
    locked_textbox = gr.update(value="", interactive=False)
    return extended_history, locked_textbox
# Will be triggered on voice submit (will transcribe and send to generate_speech)
def add_audio(chatbot_history, audio):
    """Transcribe a finished recording and queue it as a new user turn.

    Args:
        chatbot_history: Existing list of (user, bot) pairs, or None.
        audio: The recording from the audio component, passed unchanged to
            ``whisper_model.transcribe``.

    Returns:
        A 2-tuple: the history extended with ``(transcript, None)`` and a
        Gradio update that clears and disables the textbox while the reply is
        being generated (same contract as ``add_text``).
    """
    chatbot_history = [] if chatbot_history is None else chatbot_history
    # transcribe() hands back an iterable of segments plus an info object.
    segments, _ = whisper_model.transcribe(audio)
    # Use every segment, not just the first: longer utterances are split into
    # several segments, and a silent recording produces none at all (indexing
    # [0] would raise IndexError in that case).
    pieces = [segment.text.strip() for segment in segments]
    text = " ".join(piece for piece in pieces if piece)
    print("Transcribed text:", text)
    chatbot_history = chatbot_history + [(text, None)]
    return chatbot_history, gr.update(value="", interactive=False)
def generate_speech(chatbot_history, chatbot_voice, initial_greeting=False):
    """Stream the chatbot's reply as text plus synthesized audio.

    Generator used as a Gradio event handler. Every item it yields is a
    3-tuple ``(sentence, chatbot_history, audio_bytes)`` matching the
    ``[sentence, chatbot, audio_playback]`` outputs.

    Args:
        chatbot_history: List of (user, bot) pairs shown in the chat window.
        chatbot_voice: Voice name; the speaker reference clip is read from
            ``examples/{chatbot_voice}.wav``.
        initial_greeting: When true, only speak the bot messages already in
            ``chatbot_history`` instead of asking the LLM for a new reply.

    Yields:
        First an empty sentence with just a WAV header chunk, then one tuple
        per spoken sentence.
    """
    import wave  # local import: only needed to read back the synthesized file

    # Start by yielding an initial empty audio to set up autoplay
    yield ("", chatbot_history, wave_header_chunk())

    # Helper function to handle the speech generation and yielding process
    def handle_speech_generation(sentence, chatbot_history, chatbot_voice):
        """Synthesize one sentence and yield it with its audio data."""
        # Nothing to speak for empty sentences (or history rows without a bot
        # message yet).
        if not sentence:
            return
        print("Processing sentence")
        # Clone the selected voice and speak the actual sentence. This used to
        # pass a fixed sample sentence, so every reply sounded the same.
        xtts_model.tts_to_file(
            text=sentence,
            file_path="output.wav",
            speaker_wav=[f"examples/{chatbot_voice}.wav"],
            language="en",
            split_sentences=True,
        )
        # Stream the synthesized audio instead of another empty header chunk.
        # Only the raw frames are sent because the WAV header already went out
        # in the first yield.
        # NOTE(review): assumes wave_header_chunk() announces the same sample
        # rate / sample width / channel count that XTTS writes to output.wav.
        # Confirm in utils.
        with wave.open("output.wav", "rb") as wav_file:
            pcm_frames = wav_file.readframes(wav_file.getnframes())
        yield (sentence, chatbot_history, pcm_frames)

    if initial_greeting:
        # Process only the initial greeting if specified
        for _, sentence in chatbot_history:
            yield from handle_speech_generation(sentence, chatbot_history, chatbot_voice)
    else:
        # Continuously get and process sentences from a generator function;
        # get_sentence yields (sentence, updated_history) pairs.
        for sentence, chatbot_history in get_sentence(chatbot_history, mistral_llm):
            print("Inserting sentence to queue")
            yield from handle_speech_generation(sentence, chatbot_history, chatbot_voice)
# Text path: queue the typed message, stream the spoken reply, then re-enable
# the textbox once the reply has finished.
txt_submitted = txt_box.submit(
    fn=add_text,
    inputs=[chatbot, txt_box],
    outputs=[chatbot, txt_box],
    queue=False,
)
txt_msg = txt_submitted.then(
    fn=generate_speech,
    inputs=[chatbot, chatbot_voice],
    outputs=[sentence, chatbot, audio_playback],
)
txt_msg.then(
    fn=lambda: gr.update(interactive=True),
    inputs=None,
    outputs=[txt_box],
    queue=False,
)

# Voice path: transcribe the recording when it stops, stream the spoken reply,
# then re-enable the textbox and reset the recorder.
audio_recorded = audio_record.stop_recording(
    fn=add_audio,
    inputs=[chatbot, audio_record],
    outputs=[chatbot, txt_box],
    queue=False,
)
audio_msg = audio_recorded.then(
    fn=generate_speech,
    inputs=[chatbot, chatbot_voice],
    outputs=[sentence, chatbot, audio_playback],
)
audio_msg.then(
    fn=lambda: (gr.update(interactive=True), gr.update(interactive=True, value=None)),
    inputs=None,
    outputs=[txt_box, audio_record],
    queue=False,
)
# Markdown shown under the interface: credits the three models and lists
# usage caveats.
FOOTNOTE = """
This Space demonstrates how to speak to an llm chatbot, based solely on open accessible models.
It relies on the following models :
- Speech to Text Model: [Faster-Whisper-large-v3](https://huggingface.co/Systran/faster-whisper-large-v3) an ASR model, to transcribe recorded audio to text.
- Large Language Model: [Mistral-7b-instruct-v0.1-quantized](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF) a LLM to generate the chatbot responses.
- Text to Speech Model: [XTTS-v2](https://huggingface.co/spaces/coqui/xtts) a TTS model, to generate the voice of the chatbot.
Note:
- Responses generated by chat model should not be assumed correct or taken serious, as this is a demonstration example only
- iOS (Iphone/Ipad) devices may not experience voice due to autoplay being disabled on these devices by Vendor"""
gr.Markdown(FOOTNOTE)
# On page load, run generate_speech with initial_greeting=True (the third input,
# supplied through gr.State) so the greeting already in the chat is spoken.
demo.load(fn=generate_speech, inputs=[chatbot,chatbot_voice, gr.State(value=True)], outputs=[sentence, chatbot, audio_playback])
# Enable the request queue and start the app. The stray trailing "|" that
# followed this call was not valid Python and has been removed.
demo.queue().launch(debug=True, share=True)