import os
import torch
import argparse
import gradio as gr
import openai
from zipfile import ZipFile
import requests
import se_extractor
from api import BaseSpeakerTTS, ToneColorConverter
import langid
import traceback
from dotenv import load_dotenv

# Load environment variables (the OpenAI client reads OPENAI_API_KEY from the environment)
load_dotenv()

# Global variables for preloaded resources
en_base_speaker_tts = None
zh_base_speaker_tts = None
tone_color_converter = None
en_source_default_se = None
zh_source_se = None
target_se = None

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Directory for intermediate and final audio files
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)


# Function to download and extract checkpoints
def download_and_extract_checkpoints():
    zip_url = "https://huggingface.co/camenduru/OpenVoice/resolve/main/checkpoints_1226.zip"
    zip_path = "checkpoints.zip"
    if not os.path.exists("checkpoints"):
        print("Downloading checkpoints...")
        response = requests.get(zip_url, stream=True)
        with open(zip_path, "wb") as zip_file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    zip_file.write(chunk)
        print("Extracting checkpoints...")
        with ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(".")
        os.remove(zip_path)
        print("Checkpoints are ready.")


# Initialize models and resources
def initialize_resources():
    global en_base_speaker_tts, zh_base_speaker_tts, tone_color_converter
    global en_source_default_se, zh_source_se, target_se
    print("Initializing resources...")

    # Download and extract checkpoints
    download_and_extract_checkpoints()

    # Define paths to checkpoints
    en_ckpt_base = 'checkpoints/base_speakers/EN'
    zh_ckpt_base = 'checkpoints/base_speakers/ZH'
    ckpt_converter = 'checkpoints/converter'

    # Load TTS models
    en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
    en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
    zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
    zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')

    # Load tone color converter
    tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
    tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

    # Load base speaker embeddings (used as the conversion source)
    en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
    zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)

    # Extract speaker embedding from the default Mickey Mouse audio
    default_speaker_audio = "resources/output.wav"
    try:
        target_se, _ = se_extractor.get_se(
            default_speaker_audio,
            tone_color_converter,
            target_dir='processed',
            vad=True
        )
        print("Speaker embedding extracted successfully.")
    except Exception as e:
        raise RuntimeError(f"Failed to extract speaker embedding from {default_speaker_audio}: {str(e)}")


initialize_resources()

# Supported languages
supported_languages = ['zh', 'en']


# Predict function
def predict(audio_file_pth, agree):
    text_hint = ''
    synthesized_audio_path = None

    # Agree with the terms
    if not agree:
        text_hint += '[ERROR] Please accept the Terms & Conditions!\n'
        return (text_hint, None)

    # Check if an audio file is provided
    if audio_file_pth is not None:
        speaker_wav = audio_file_pth
    else:
        text_hint += "[ERROR] Please provide an audio file.\n"
        return (text_hint, None)

    # Transcribe audio to text using OpenAI Whisper
    try:
        with open(speaker_wav, 'rb') as audio_file:
            transcription_response = openai.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                response_format='text'
            )
        input_text = transcription_response.strip()
        print(f"Transcribed Text: {input_text}")
    except Exception as e:
        text_hint += f"[ERROR] Transcription failed: {str(e)}\n"
        return (text_hint, None)

    if len(input_text) == 0:
        text_hint += "[ERROR] No speech detected in the audio.\n"
        return (text_hint, None)

    # Detect language
    language_predicted = langid.classify(input_text)[0].strip()
    print(f"Detected language: {language_predicted}")
    if language_predicted not in supported_languages:
        text_hint += f"[ERROR] Unsupported language: {language_predicted}\n"
        return (text_hint, None)

    # Select TTS model and base speaker embedding for the detected language
    tts_model = zh_base_speaker_tts if language_predicted == "zh" else en_base_speaker_tts
    source_se = zh_source_se if language_predicted == "zh" else en_source_default_se
    language = 'Chinese' if language_predicted == "zh" else 'English'

    # Generate a reply with the OpenAI chat completions API
    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are Mickey Mouse, a cheerful character who responds to children's queries."},
                {"role": "user", "content": input_text}
            ]
        )
        reply_text = response.choices[0].message.content.strip()
        print(f"Assistant Reply: {reply_text}")
    except Exception as e:
        text_hint += f"[ERROR] Chat completion failed: {str(e)}\n"
        return (text_hint, None)

    # Synthesize the reply text, then convert it to the target voice
    try:
        src_path = os.path.join(output_dir, 'tmp_reply.wav')
        tts_model.tts(reply_text, src_path, speaker='default', language=language)

        save_path = os.path.join(output_dir, 'output_reply.wav')
        tone_color_converter.convert(
            audio_src_path=src_path,
            src_se=source_se,
            tgt_se=target_se,
            output_path=save_path
        )
        text_hint += "Response generated successfully.\n"
        synthesized_audio_path = save_path
    except Exception as e:
        text_hint += f"[ERROR] Synthesis failed: {str(e)}\n"
        traceback.print_exc()
        return (text_hint, None)

    return (text_hint, synthesized_audio_path)


# Gradio UI
with gr.Blocks(analytics_enabled=False) as demo:
    gr.Markdown("# Mickey Mouse Voice Assistant")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(source="microphone", type="filepath", label="Record Your Voice")
            tos_checkbox = gr.Checkbox(label="Agree to Terms & Conditions", value=False)
            submit_button = gr.Button("Send")
        with gr.Column():
            info_output = gr.Textbox(label="Info", interactive=False, lines=4)
            audio_output = gr.Audio(label="Mickey's Response", interactive=False, autoplay=True)

    submit_button.click(
        predict,
        inputs=[audio_input, tos_checkbox],
        outputs=[info_output, audio_output]
    )

demo.queue()
demo.launch(
    server_name="0.0.0.0",
    server_port=int(os.environ.get("PORT", 7860)),
    debug=True,
    show_api=True,
    share=False
)