import gradio as gr
import numpy as np
import os
import time
import torch
from scipy.io import wavfile
import soundfile as sf
import datasets

# Bark imports
from bark import generate_audio, SAMPLE_RATE
from bark.generation import preload_models, generate_text_semantic

# Hugging Face Transformers
from transformers import (
    SpeechT5HifiGan,
    SpeechT5ForTextToSpeech,
    SpeechT5Processor
)


class VoiceSynthesizer:
    """Text-to-speech front end wrapping Suno Bark and Microsoft SpeechT5.

    Generated WAV files are written into a ``working_files`` directory next
    to this script.  A reference recording may be captured and stored, though
    note that Bark's ``history_prompt`` does not accept raw WAV paths (see
    ``_generate_bark_speech``).
    """

    def __init__(self):
        # Working directory for generated/processed audio files.
        self.base_dir = os.path.dirname(os.path.abspath(__file__))
        self.working_dir = os.path.join(self.base_dir, "working_files")
        os.makedirs(self.working_dir, exist_ok=True)

        # Path to the processed reference voice WAV (None until captured).
        self.reference_voice = None

        # Lazy initializers keyed by model name.
        self.models = {
            "bark": self._initialize_bark,
            "speecht5": self._initialize_speecht5,
        }

        # Cache for loaded SpeechT5 assets so checkpoints are loaded once.
        # (Previously every generation re-downloaded model + dataset.)
        self._speecht5_cache = None

        # Default model
        self.current_model = "bark"

        # Bark preloads its checkpoints into module-level globals.
        try:
            print("Attempting to load Bark models...")
            preload_models()
            print("Bark models loaded successfully.")
        except Exception as e:
            print(f"Bark model loading error: {e}")

    def _initialize_bark(self):
        """Bark model initialization (already done in __init__)."""
        return None

    def _initialize_speecht5(self):
        """Load and cache the SpeechT5 model, processor, vocoder and speaker embedding.

        Returns:
            dict with keys ``model``, ``processor``, ``vocoder`` and
            ``speaker_embeddings``, or ``None`` if loading failed.
        """
        if self._speecht5_cache is not None:
            return self._speecht5_cache
        try:
            model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
            processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

            # Speaker x-vector from the CMU Arctic embeddings dataset.
            embeddings_dataset = datasets.load_dataset(
                "Matthijs/cmu-arctic-xvectors", split="validation"
            )
            speaker_embeddings = torch.tensor(
                embeddings_dataset[0]["xvector"]
            ).unsqueeze(0)

            self._speecht5_cache = {
                "model": model,
                "processor": processor,
                "vocoder": vocoder,
                "speaker_embeddings": speaker_embeddings,
            }
            return self._speecht5_cache
        except Exception as e:
            print(f"SpeechT5 model loading error: {e}")
            return None

    def process_reference_audio(self, reference_audio):
        """Normalize, trim, resample and store a reference recording.

        Args:
            reference_audio: Gradio audio value — either ``(sample_rate,
                np.ndarray)`` or a bare ``np.ndarray`` (assumed to be at
                Bark's SAMPLE_RATE).

        Returns:
            A status message string.
        """
        try:
            # Gradio can pass audio in different formats.
            if reference_audio is None:
                return "No audio provided"

            if isinstance(reference_audio, tuple):
                # Gradio typically returns (sample_rate, audio_array).
                if len(reference_audio) == 2:
                    sample_rate, audio_data = reference_audio
                else:
                    audio_data = reference_audio[0]
                    sample_rate = SAMPLE_RATE  # Default to Bark sample rate
            elif isinstance(reference_audio, np.ndarray):
                audio_data = reference_audio
                sample_rate = SAMPLE_RATE
            else:
                return "Invalid audio format"

            # Ensure audio is a numpy array.
            audio_data = np.asarray(audio_data)

            # Downmix multi-channel audio to mono.
            if audio_data.ndim > 1:
                audio_data = audio_data.mean(axis=1)

            # Convert integer PCM (Gradio mic input is typically int16) to
            # normalized float32 in [-1, 1].  Without this, resampling raw
            # int16 samples and writing them as float WAV data clips badly.
            if np.issubdtype(audio_data.dtype, np.integer):
                audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
            else:
                audio_data = audio_data.astype(np.float32)

            # Trim to a standard maximum length.
            max_duration = 10  # seconds
            max_samples = int(max_duration * sample_rate)
            if len(audio_data) > max_samples:
                audio_data = audio_data[:max_samples]

            # Resample to Bark's sample rate if necessary.
            if sample_rate != SAMPLE_RATE:
                from scipy.signal import resample
                audio_data = resample(
                    audio_data, int(len(audio_data) * SAMPLE_RATE / sample_rate)
                )

            # Persist the processed reference voice.
            ref_filename = os.path.join(self.working_dir, "reference_voice.wav")
            sf.write(ref_filename, audio_data, SAMPLE_RATE)

            self.reference_voice = ref_filename
            return "Reference voice processed successfully"

        except Exception as e:
            print(f"Reference audio processing error: {e}")
            import traceback
            traceback.print_exc()
            return f"Error processing reference audio: {str(e)}"

    def _generate_bark_speech(self, text, voice_preset=None):
        """Generate speech with Bark and write it to a WAV file.

        Args:
            text: Text to synthesize.
            voice_preset: Optional Bark preset name (e.g. "v2/en_speaker_6").

        Returns:
            (filepath, None) on success, (None, error_message) on failure.
        """
        # Default Bark voice presets.
        voice_presets = [
            "v2/en_speaker_6",  # Female
            "v2/en_speaker_3",  # Male
            "v2/en_speaker_9",  # Neutral
        ]

        history_prompt = None

        # NOTE(review): Bark's history_prompt expects a preset name or an
        # .npz voice file, not a raw WAV path — passing the reference WAV
        # here will raise inside generate_audio and fall through to the
        # preset-free fallback below.  True voice cloning needs a separate
        # pipeline to build the .npz prompt; confirm before relying on this.
        if self.reference_voice is not None:
            history_prompt = self.reference_voice

        # If no reference voice, use a preset (validated against the list).
        if history_prompt is None and voice_preset:
            if isinstance(voice_preset, str):
                # Drop any additional text after the preset value.
                preset_value = voice_preset.split(' ')[0]
                history_prompt = preset_value if preset_value in voice_presets else voice_presets[0]
            else:
                history_prompt = voice_presets[0]

        try:
            if history_prompt:
                try:
                    audio_array = generate_audio(
                        text,
                        history_prompt=history_prompt
                    )
                except Exception as preset_error:
                    print(f"Error with specific history prompt: {preset_error}")
                    # Fallback to default generation.
                    audio_array = generate_audio(text)
            else:
                # No prompt available: default generation.
                audio_array = generate_audio(text)

            # Save generated audio.
            filename = f"bark_speech_{int(time.time())}.wav"
            filepath = os.path.join(self.working_dir, filename)
            wavfile.write(filepath, SAMPLE_RATE, audio_array)

            return filepath, None

        except Exception as e:
            print(f"Bark speech generation error: {e}")
            import traceback
            traceback.print_exc()
            return None, f"Error in Bark speech generation: {str(e)}"

    def generate_speech(self, text, model_name=None, voice_preset=None):
        """Generate speech using the selected model.

        Args:
            text: Text to synthesize.
            model_name: "bark" or "speecht5"; defaults to self.current_model.
            voice_preset: Model-specific voice/preset selector.

        Returns:
            (filepath, None) on success, (None, error_message) on failure.
        """
        if not text or not text.strip():
            return None, "Please enter some text to speak"

        current_model = model_name or self.current_model

        try:
            if current_model == "bark":
                return self._generate_bark_speech(text, voice_preset)
            elif current_model == "speecht5":
                return self._generate_speecht5_speech(text, voice_preset)
            else:
                raise ValueError(f"Unsupported model: {current_model}")
        except Exception as e:
            print(f"Speech generation error: {e}")
            import traceback
            traceback.print_exc()
            return None, f"Error generating speech: {str(e)}"

    def _generate_speecht5_speech(self, text, speaker_id=None):
        """Generate speech with SpeechT5 and write it to a 16 kHz WAV file.

        Args:
            text: Text to synthesize.
            speaker_id: Unused for now (single cached speaker embedding).

        Returns:
            (filepath, None) on success, (None, error_message) on failure.
        """
        speecht5_models = self.models["speecht5"]()
        if not speecht5_models:
            return None, "SpeechT5 model not loaded"

        model = speecht5_models["model"]
        processor = speecht5_models["processor"]
        vocoder = speecht5_models["vocoder"]
        speaker_embeddings = speecht5_models["speaker_embeddings"]

        inputs = processor(text=text, return_tensors="pt")

        # Pass the vocoder so generate_speech returns a waveform.  Without
        # it the API returns a mel spectrogram, which the original code
        # wrote to disk as if it were audio.
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings,
                vocoder=vocoder
            )

        audio_array = speech.numpy()

        # SpeechT5's HiFi-GAN vocoder outputs 16 kHz audio.
        filename = f"speecht5_speech_{int(time.time())}.wav"
        filepath = os.path.join(self.working_dir, filename)
        wavfile.write(filepath, 16000, audio_array)

        return filepath, None


def create_interface():
    """Build and return the Gradio Blocks UI for the synthesizer."""
    synthesizer = VoiceSynthesizer()

    with gr.Blocks() as interface:
        gr.Markdown("# 🎙️ Advanced Voice Synthesis")

        with gr.Row():
            with gr.Column():
                gr.Markdown("## 1. Capture Reference Voice")
                reference_audio = gr.Audio(sources=["microphone", "upload"], type="numpy")
                process_ref_btn = gr.Button("Process Reference Voice")
                process_ref_output = gr.Textbox(label="Reference Voice Processing")

            with gr.Column():
                gr.Markdown("## 2. Generate Speech")
                text_input = gr.Textbox(label="Enter Text to Speak")

                # Model Selection
                model_dropdown = gr.Dropdown(
                    choices=[
                        "bark (Suno AI)",
                        "speecht5 (Microsoft)"
                    ],
                    label="Select TTS Model",
                    value="bark (Suno AI)"
                )

                # Voice Preset Dropdowns
                with gr.Row():
                    bark_preset = gr.Dropdown(
                        choices=[
                            "v2/en_speaker_6 (Female Voice)",
                            "v2/en_speaker_3 (Male Voice)",
                            "v2/en_speaker_9 (Neutral Voice)"
                        ],
                        label="Bark Voice Preset",
                        value="v2/en_speaker_6 (Female Voice)",
                        visible=True
                    )

                    speecht5_preset = gr.Dropdown(
                        choices=[
                            "Default Speaker"
                        ],
                        label="SpeechT5 Speaker",
                        visible=False
                    )

                generate_btn = gr.Button("Generate Speech")
                audio_output = gr.Audio(label="Generated Speech")
                error_output = gr.Textbox(label="Errors", visible=True)

        # Process reference audio
        process_ref_btn.click(
            fn=synthesizer.process_reference_audio,
            inputs=reference_audio,
            outputs=process_ref_output
        )

        # Show the preset dropdown matching the selected model.
        def update_model_visibility(model):
            if "bark" in model.lower():
                return {
                    bark_preset: gr.update(visible=True),
                    speecht5_preset: gr.update(visible=False)
                }
            else:
                return {
                    bark_preset: gr.update(visible=False),
                    speecht5_preset: gr.update(visible=True)
                }

        model_dropdown.change(
            fn=update_model_visibility,
            inputs=model_dropdown,
            outputs=[bark_preset, speecht5_preset]
        )

        # Speech generation logic
        def generate_speech_wrapper(text, model, bark_preset, speecht5_preset):
            # Map UI label to internal model name.
            model_map = {
                "bark (Suno AI)": "bark",
                "speecht5 (Microsoft)": "speecht5"
            }

            # Select the preset belonging to the chosen model.
            preset = bark_preset if "bark" in model else speecht5_preset

            # Strip the descriptive suffix, e.g. "v2/en_speaker_6 (Female Voice)".
            if isinstance(preset, str):
                preset = preset.split(' ')[0]

            return synthesizer.generate_speech(
                text,
                model_name=model_map[model],
                voice_preset=preset
            )

        generate_btn.click(
            fn=generate_speech_wrapper,
            inputs=[text_input, model_dropdown, bark_preset, speecht5_preset],
            outputs=[audio_output, error_output]
        )

    return interface


if __name__ == "__main__":
    interface = create_interface()
    interface.launch(
        share=False,
        debug=True,
        show_error=True,
        server_name='0.0.0.0',
        server_port=7860
    )