import os
import tempfile
from pathlib import Path

import gradio as gr
import soundfile as sf
import torch
import torch.nn as nn
from faster_whisper import WhisperModel
from outetts.v0_1.interface import InterfaceHF

torch.set_num_threads(4)
torch.set_grad_enabled(False)


class OptimizedTTSInterface:
    def __init__(self, model_name="OuteAI/OuteTTS-0.1-350M"):
        self.interface = InterfaceHF(model_name)

        # Dynamically quantize the Linear layers to INT8 for faster CPU inference.
        self.interface.model = torch.quantization.quantize_dynamic(
            self.interface.model, {nn.Linear}, dtype=torch.qint8
        )

        self.interface.model.cpu()
        self.interface.model.eval()

    def create_speaker(self, *args, **kwargs):
        with torch.inference_mode():
            return self.interface.create_speaker(*args, **kwargs)

    def generate(self, *args, **kwargs):
        with torch.inference_mode():
            return self.interface.generate(*args, **kwargs)

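
# A minimal direct-usage sketch (the audio path and transcript below are
# hypothetical; the calls mirror the OuteTTS v0.1 interface as used in
# process_audio_file further down):
#
#   tts = OptimizedTTSInterface()
#   speaker = tts.create_speaker("sample.wav", "transcript of sample.wav")
#   output = tts.generate(text="Hello!", speaker=speaker, temperature=0.1,
#                         repetition_penalty=1.1, max_lenght=4096)
#   output.save("cloned.wav")
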

def initialize_models():
    """Initialize the OptimizedTTSInterface and Faster-Whisper models."""
    cache_dir = Path("model_cache")
    cache_dir.mkdir(exist_ok=True)

    tts_interface = OptimizedTTSInterface()

    # The tiny Whisper model with INT8 weights keeps transcription cheap on CPU.
    asr_model = WhisperModel("tiny",
                             device="cpu",
                             compute_type="int8",
                             num_workers=1,
                             cpu_threads=2,
                             download_root=str(cache_dir))
    return tts_interface, asr_model


def transcribe_audio(audio_path):
    """Transcribe audio using the Faster-Whisper tiny model."""
    try:
        # Single-candidate, single-pass decoding settings keep CPU transcription fast.
        segments, _ = ASR_MODEL.transcribe(audio_path,
                                           beam_size=1,
                                           best_of=1,
                                           temperature=1.0,
                                           condition_on_previous_text=False,
                                           compression_ratio_threshold=2.4,
                                           log_prob_threshold=-1.0,
                                           no_speech_threshold=0.6)

        text = " ".join(segment.text for segment in segments).strip()
        return text
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"

def preprocess_audio(audio_path):
    """Resample to 16 kHz mono to reduce memory usage downstream."""
    try:
        data, sr = sf.read(audio_path)
        if sr != 16000:
            import resampy  # lazy import: only needed when resampling
            data = resampy.resample(data, sr, 16000)
            sr = 16000

        # Downmix multi-channel audio to mono.
        if data.ndim > 1:
            data = data.mean(axis=1)

        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        sf.write(temp_file.name, data, sr)
        return temp_file.name
    except Exception:
        # Fail soft: hand the original file to the models unchanged.
        return audio_path
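
# Note: Whisper operates on 16 kHz mono internally, and faster-whisper will
# resample on its own if given something else; converting up front mainly keeps
# the temp file small and (an assumption here) matches what the OuteTTS
# speaker encoder expects.
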

def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
    """Process the audio file and generate speech with the cloned voice."""
    processed_audio = audio_path  # keeps the cleanup below safe if preprocessing fails early
    try:
        processed_audio = preprocess_audio(audio_path)

        # Fall back to automatic transcription when no reference text is given.
        if not reference_text.strip():
            reference_text = transcribe_audio(processed_audio)
            if reference_text.startswith("Error"):
                return None, reference_text

        speaker = TTS_INTERFACE.create_speaker(
            processed_audio,
            reference_text
        )

        output = TTS_INTERFACE.generate(
            text=text_to_speak,
            speaker=speaker,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_lenght=4096  # sic: the OuteTTS v0.1 API uses this spelling
        )

        # Remove the intermediate file if preprocessing created one.
        if processed_audio != audio_path:
            try:
                os.unlink(processed_audio)
            except OSError:
                pass

        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        output.save(temp_file.name)
        return temp_file.name, f"Voice cloning successful!\nReference text used: {reference_text}"

    except Exception as e:
        if processed_audio != audio_path:
            try:
                os.unlink(processed_audio)
            except OSError:
                pass
        return None, f"Error: {str(e)}"
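
# Driving the pipeline without the UI (paths and text are hypothetical):
#
#   out_path, status = process_audio_file(
#       "reference.wav",                 # short, clean recording of the voice
#       "",                              # empty -> auto-transcribe with Whisper
#       "Hello from the cloned voice!",  # text to synthesize
#   )
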

print("Initializing models...")
TTS_INTERFACE, ASR_MODEL = initialize_models()
print("Models initialized!")


with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
    gr.Markdown("# 🎙️ Optimized Voice Cloning with OuteTTS")
    gr.Markdown("""
    This app uses optimized versions of OuteTTS and Whisper for efficient voice cloning on CPU.
    Upload a reference audio file, provide the text spoken in that audio (or leave it blank for automatic transcription),
    and enter the new text you want spoken in the cloned voice.

    Note: For best results, use clear audio with minimal background noise.
    """)

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="Upload Reference Audio",
                type="filepath"
            )
            reference_text = gr.Textbox(
                label="Reference Text (leave blank for auto-transcription)",
                placeholder="Leave empty to auto-transcribe or enter the exact text from the reference audio"
            )
            text_to_speak = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter the text you want the cloned voice to speak"
            )

            with gr.Row():
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.1,
                    step=0.1,
                    label="Temperature"
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.1,
                    step=0.1,
                    label="Repetition Penalty"
                )

            submit_btn = gr.Button("Generate Voice", variant="primary")

        with gr.Column():
            output_audio = gr.Audio(label="Generated Speech")
            output_message = gr.Textbox(label="Status", max_lines=3)

    submit_btn.click(
        fn=process_audio_file,
        inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
        outputs=[output_audio, output_message]
    )

    gr.Markdown("""
    ### Optimization Notes:
    - INT8 dynamic quantization for efficient CPU usage
    - Lightweight audio preprocessing (16 kHz mono)
    - Cached model downloads (model_cache/)
    - Memory-efficient inference (autograd disabled)

    ### Tips for best results:
    1. Use clear, high-quality reference audio
    2. Keep reference audio short (5-10 seconds)
    3. Verify auto-transcription accuracy
    4. For best quality, enter the exact reference text manually
    5. Keep generated text concise
    """)

if __name__ == "__main__":
    demo.launch()