Spaces:

Chillarmo
/

Voice_Cloning_with_OuteTTS

Running

App Files Files Community

Voice_Cloning_with_OuteTTS / app.py

Chillarmo

Update app.py

776e91e verified 2 months ago

raw

history blame

7.94 kB

	import gradio as gr
	import torch
	import os
	from outetts.v0_1.interface import InterfaceHF
	import soundfile as sf
	import tempfile
	from faster_whisper import WhisperModel
	from pathlib import Path

	# This process only runs inference, so autograd is switched off globally and
	# PyTorch's CPU thread count is capped at 4.
	torch.set_grad_enabled(False)
	torch.set_num_threads(4)

	class OptimizedTTSInterface:
	    """Wrapper around ``InterfaceHF`` that runs its calls in inference mode.

	    Attributes set by ``__init__``:
	        interface: the wrapped ``InterfaceHF`` instance.
	        tokenizer: shortcut to ``interface.model.tokenizer``.
	    """

	    def __init__(self, model_name="OuteAI/OuteTTS-0.1-350M"):
	        """Load the OuteTTS model identified by ``model_name``."""
	        self.interface = InterfaceHF(model_name)
	        # NOTE(review): half() followed by float() presumably leaves the
	        # weights in FP32, so this is not an FP16 speed-up -- it only rounds
	        # the weights through FP16 once. Kept so current output does not
	        # change; confirm whether the round-trip can simply be dropped.
	        self.interface.model = self.interface.model.half().float()
	        # Keep a direct handle on the tokenizer for convenience.
	        self.tokenizer = self.interface.model.tokenizer

	    def create_speaker(self, *args, **kwargs):
	        """Forward all arguments to ``interface.create_speaker``.

	        The call runs under ``torch.inference_mode()``; the wrapped method's
	        return value is passed through unchanged.
	        """
	        with torch.inference_mode():
	            return self.interface.create_speaker(*args, **kwargs)

	    def generate(self, *args, **kwargs):
	        """Forward all arguments to ``interface.generate``.

	        Positional and keyword arguments are both accepted, so callers may
	        use ``generate(text=..., speaker=..., ...)``. The call runs under
	        ``torch.inference_mode()`` and the result is passed through unchanged.
	        """
	        with torch.inference_mode():
	            return self.interface.generate(*args, **kwargs)

	def initialize_models():
	    """Create the TTS wrapper and the Faster-Whisper ASR model.

	    Side effects: creates the ``model_cache`` directory if it is missing and
	    sets ``OMP_NUM_THREADS`` and ``MKL_NUM_THREADS`` to ``'4'``.

	    Returns:
	        tuple: ``(tts_interface, asr_model)``.
	    """
	    # Whisper weights are downloaded into this directory.
	    model_cache = Path("model_cache")
	    model_cache.mkdir(exist_ok=True)

	    # Thread-count hints for the native math libraries.
	    for thread_var in ("OMP_NUM_THREADS", "MKL_NUM_THREADS"):
	        os.environ[thread_var] = "4"

	    print("Loading ASR model...")
	    whisper_options = {
	        "device": "cpu",
	        "compute_type": "int8",
	        "num_workers": 1,
	        "cpu_threads": 2,
	        "download_root": str(model_cache),
	    }
	    asr = WhisperModel("tiny", **whisper_options)

	    print("Loading TTS model...")
	    tts = OptimizedTTSInterface()

	    return tts, asr

	def transcribe_audio(audio_path):
	    """Return the transcript of ``audio_path`` from the global ``ASR_MODEL``.

	    The text of every segment is joined with single spaces and stripped.
	    Any ``Exception`` is converted into the return value
	    ``"Error transcribing audio: <message>"`` instead of being raised.
	    """
	    # NOTE(review): temperature=1.0 together with beam_size=1/best_of=1 looks
	    # like single-sample decoding -- confirm this is intended.
	    decode_options = {
	        "beam_size": 1,
	        "best_of": 1,
	        "temperature": 1.0,
	        "condition_on_previous_text": False,
	        "compression_ratio_threshold": 2.4,
	        "log_prob_threshold": -1.0,
	        "no_speech_threshold": 0.6,
	    }
	    try:
	        segments, _info = ASR_MODEL.transcribe(audio_path, **decode_options)
	        transcript = " ".join(seg.text for seg in segments)
	        return transcript.strip()
	    except Exception as err:
	        return f"Error transcribing audio: {err!s}"

	def preprocess_audio(audio_path):
	    """Write a mono, 16 kHz, peak-normalised WAV copy of ``audio_path``.

	    Steps: load the file, down-mix to mono, resample to 16 kHz when the
	    sample rate differs, scale so the peak magnitude is 1.0, and write the
	    result to a new temporary ``.wav`` file.

	    Args:
	        audio_path: path of the audio file to preprocess.

	    Returns:
	        Path of the new temporary file (the caller is responsible for
	        deleting it), or ``audio_path`` unchanged when any step fails --
	        preprocessing is best-effort and never raises for ordinary errors.
	    """
	    target_sr = 16000
	    out_path = None
	    try:
	        data, sr = sf.read(audio_path)
	        if data.size == 0:
	            raise ValueError("audio file contains no samples")

	        # Down-mix before resampling: soundfile returns multi-channel audio as
	        # (frames, channels) while resampy works on the last axis by default,
	        # so resampling first would run across the channel axis.
	        if data.ndim > 1:
	            data = data.mean(axis=1)

	        if sr != target_sr:
	            import resampy
	            data = resampy.resample(data, sr, target_sr)
	            sr = target_sr

	        # Peak-normalise; leave all-zero (silent) audio untouched so the
	        # division cannot produce NaN samples.
	        peak = abs(data).max()
	        if peak > 0:
	            data = data / peak

	        # Reserve a file name and close the handle straight away so no file
	        # descriptor is leaked; soundfile then writes to the path itself.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
	            out_path = tmp.name
	        sf.write(out_path, data, sr)
	        return out_path
	    except Exception as exc:
	        # Remove a reserved/partly written temp file before falling back.
	        if out_path is not None:
	            try:
	                os.unlink(out_path)
	            except OSError:
	                pass
	        print(f"Audio preprocessing skipped, using original file: {exc}")
	        return audio_path  # Return original if preprocessing fails

	def process_audio_file(
	    audio_path,
	    reference_text,
	    text_to_speak,
	    temperature=0.1,
	    repetition_penalty=1.1,
	):
	    """Clone the voice in ``audio_path`` and speak ``text_to_speak`` with it.

	    Args:
	        audio_path: path of the reference recording.
	        reference_text: transcript of the reference recording; when blank the
	            recording is transcribed with ``transcribe_audio``.
	        text_to_speak: text to synthesise in the cloned voice.
	        temperature: sampling temperature forwarded to the TTS model.
	        repetition_penalty: repetition penalty forwarded to the TTS model.

	    Returns:
	        ``(output_wav_path, status_message)``. On failure the path is ``None``
	        and the message starts with ``"Error"``; this function does not raise
	        for ordinary errors.
	    """
	    # Reject unusable input before any model work is started.
	    if not audio_path:
	        return None, "Error: Please provide a reference audio file."
	    if not (text_to_speak or "").strip():
	        return None, "Error: Please enter the text to speak."
	    reference_text = reference_text or ""

	    processed_audio = audio_path  # replaced below if preprocessing succeeds
	    output_path = None
	    try:
	        processed_audio = preprocess_audio(audio_path)

	        # If no reference text provided, transcribe the audio
	        if not reference_text.strip():
	            reference_text = transcribe_audio(processed_audio)
	            # Match the exact failure prefix produced by transcribe_audio so
	            # a genuine transcript that begins with "Error" is not rejected.
	            if reference_text.startswith("Error transcribing audio:"):
	                return None, reference_text

	        # Create speaker from reference audio
	        with torch.inference_mode():
	            speaker = TTS_INTERFACE.create_speaker(
	                processed_audio,
	                reference_text,
	            )

	        # Generate speech with cloned voice.
	        # NOTE(review): "max_lenght" is kept exactly as spelled; it presumably
	        # matches the keyword name in the OuteTTS v0.1 API -- verify against
	        # the library before "correcting" it.
	        output = TTS_INTERFACE.generate(
	            text=text_to_speak,
	            speaker=speaker,
	            temperature=temperature,
	            repetition_penalty=repetition_penalty,
	            max_lenght=4096,
	        )

	        # Reserve the output name and close the handle immediately so no file
	        # descriptor is leaked; the TTS output then writes to the path.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as out_file:
	            output_path = out_file.name
	        output.save(output_path)
	        return output_path, f"Voice cloning successful!\nReference text used: {reference_text}"

	    except Exception as e:
	        # Do not leave a reserved but unusable output file behind.
	        _remove_file_quietly(output_path)
	        return None, f"Error: {str(e)}"
	    finally:
	        # Runs on every exit path (success, early return, error): drop the
	        # preprocessed copy, but never the caller's original file.
	        if processed_audio != audio_path:
	            _remove_file_quietly(processed_audio)


	def _remove_file_quietly(path):
	    """Delete ``path`` if one is given, ignoring filesystem errors."""
	    if not path:
	        return
	    try:
	        os.unlink(path)
	    except OSError:
	        pass

	print("Starting initialization...")
	# Load both models once at import time and keep them as module-level globals:
	# transcribe_audio() reads ASR_MODEL and process_audio_file() reads TTS_INTERFACE.
	TTS_INTERFACE, ASR_MODEL = initialize_models()
	print("Models initialized successfully!")

	# Gradio UI: one row holding an input column and an output column, followed
	# by the button wiring and a static notes section.
	with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
	    gr.Markdown("# 🎙️ Optimized Voice Cloning with OuteTTS")
	    gr.Markdown("""
	This app uses optimized versions of OuteTTS and Whisper for efficient voice cloning on CPU.
	Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
	and enter the new text you want to be spoken in the cloned voice.

	Note: First run may take longer while models are being cached.
	""")

	    with gr.Row():
	        # Input column: reference clip, its transcript, target text, controls.
	        with gr.Column():
	            # NOTE(review): the label says "Upload" while source="microphone";
	            # confirm which input mode is intended for the installed Gradio version.
	            audio_input = gr.Audio(
	                source="microphone",
	                type="filepath",
	                label="Upload Reference Audio",
	            )
	            reference_text = gr.Textbox(
	                placeholder="Leave empty to auto-transcribe or enter the exact text from the reference audio",
	                label="Reference Text (leave blank for auto-transcription)",
	            )
	            text_to_speak = gr.Textbox(
	                placeholder="Enter the text you want the cloned voice to speak",
	                label="Text to Speak",
	            )

	            # Sampling controls; the initial values equal the defaults of
	            # process_audio_file.
	            with gr.Row():
	                temperature = gr.Slider(
	                    label="Temperature", minimum=0.1, maximum=1.0, step=0.1, value=0.1
	                )
	                repetition_penalty = gr.Slider(
	                    label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.1, value=1.1
	                )

	            submit_btn = gr.Button("Generate Voice", variant="primary")

	        # Output column: synthesised audio plus a status line.
	        with gr.Column():
	            output_audio = gr.Audio(label="Generated Speech")
	            output_message = gr.Textbox(label="Status", max_lines=3)

	    # The input order must match the positional parameters of
	    # process_audio_file; its (path, message) result fills the two outputs.
	    submit_btn.click(
	        fn=process_audio_file,
	        inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
	        outputs=[output_audio, output_message],
	    )

	    gr.Markdown("""
	### Optimization Notes:
	- Optimized for CPU performance
	- Model caching enabled
	- Memory-efficient inference
	- Automatic audio preprocessing

	### Tips for best results:
	1. Use clear, high-quality reference audio
	2. Keep reference audio short (5-10 seconds)
	3. Verify auto-transcription accuracy
	4. For best quality, manually input exact reference text
	5. Keep generated text concise
	""")

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()