|
import gradio as gr |
|
import torch |
|
from outetts.v0_1.interface import InterfaceHF |
|
import soundfile as sf |
|
import tempfile |
|
import os |
|
from faster_whisper import WhisperModel |
|
|
|
def initialize_models():
    """Build the text-to-speech interface and the speech-recognition model.

    Returns:
        A ``(tts_interface, asr_model)`` tuple: an ``InterfaceHF`` created
        for the ``OuteAI/OuteTTS-0.1-350M`` checkpoint and a ``WhisperModel``
        created for the ``tiny`` checkpoint.
    """
    # Recognizer settings: CPU device, int8 compute type, one worker and
    # one CPU thread.
    whisper_options = {
        "device": "cpu",
        "compute_type": "int8",
        "num_workers": 1,
        "cpu_threads": 1,
    }

    synthesizer = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
    recognizer = WhisperModel("tiny", **whisper_options)
    return synthesizer, recognizer
|
|
|
|
|
# Module-level model handles, created once when this module is loaded and
# shared by the functions below.
(TTS_INTERFACE, ASR_MODEL) = initialize_models()
|
|
|
def transcribe_audio(audio_path):
    """Transcribe an audio file to text with the Faster-Whisper model.

    Args:
        audio_path: Path of the audio file passed to ``ASR_MODEL.transcribe``.

    Returns:
        The transcript as one string, with segment texts separated by a
        single space. On any failure the function does not raise; it returns
        a message starting with ``"Error transcribing audio: "`` instead.
    """
    try:
        segments, _ = ASR_MODEL.transcribe(
            audio_path,
            beam_size=1,
            best_of=1,
            # Greedy decoding. A temperature above zero makes Faster-Whisper
            # sample tokens randomly, which gives non-deterministic and less
            # accurate transcripts; the reference text used for cloning
            # should match the audio as closely as possible.
            temperature=0.0,
            condition_on_previous_text=False,
            compression_ratio_threshold=2.4,
            log_prob_threshold=-1.0,
            no_speech_threshold=0.6,
        )

        # The segments are consumed inside the ``try`` so that a failure
        # while iterating them is reported like any other error.
        # Each segment text is stripped (they normally start with a space)
        # and empty ones are dropped, so the join never doubles a space.
        pieces = (segment.text.strip() for segment in segments)
        return " ".join(piece for piece in pieces if piece)
    except Exception as e:
        return f"Error transcribing audio: {e}"
|
|
|
def process_audio_file(
    audio_path,
    reference_text,
    text_to_speak,
    temperature=0.1,
    repetition_penalty=1.1,
):
    """Clone the voice of a reference recording and speak new text with it.

    Args:
        audio_path: Path of the reference recording.
        reference_text: What is said in the recording. When blank or
            ``None`` it is obtained from ``transcribe_audio``.
        text_to_speak: Text to synthesize with the cloned voice.
        temperature: Passed through to ``TTS_INTERFACE.generate``.
        repetition_penalty: Passed through to ``TTS_INTERFACE.generate``.

    Returns:
        ``(wav_path, message)`` on success, where ``wav_path`` names a newly
        created temporary ``.wav`` file, or ``(None, message)`` on failure.
        Exceptions raised while cloning are reported through ``message``
        rather than propagated.
    """
    # Reject unusable input up front with a readable message instead of
    # letting it surface later as an opaque exception text.
    if not audio_path:
        return None, "Error: Please upload a reference audio file."
    if not (text_to_speak or "").strip():
        return None, "Error: Please enter the text to speak."

    try:
        if not (reference_text or "").strip():
            reference_text = transcribe_audio(audio_path)
            # Match the exact failure prefix rather than any "Error...":
            # a genuine transcript may itself begin with the word "Error".
            if reference_text.startswith("Error transcribing audio:"):
                return None, reference_text
            if not reference_text.strip():
                return None, (
                    "Error: No speech could be transcribed from the reference "
                    "audio. Please enter the reference text manually."
                )

        speaker = TTS_INTERFACE.create_speaker(audio_path, reference_text)

        output = TTS_INTERFACE.generate(
            text=text_to_speak,
            speaker=speaker,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            # NOTE(review): ``max_lenght`` (sic) appears to be the keyword's
            # actual spelling in the outetts v0.1 interface -- confirm
            # against the installed version before "correcting" it.
            max_lenght=4096,
        )

        # Reserve a unique .wav path, then close the handle before saving:
        # leaving it open leaks a file descriptor and prevents writing to
        # the same name on Windows. ``delete=False`` keeps the file on disk
        # so the caller can read it after this function returns.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            output_path = temp_file.name
        output.save(output_path)

        return output_path, f"Voice cloning successful!\nReference text used: {reference_text}"

    except Exception as e:
        return None, f"Error: {e}"
|
|
|
|
|
# Gradio UI: inputs on the left, generated audio and status on the right.
with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
    # The heading emoji was mis-encoded (UTF-8 bytes shown as "ποΈ");
    # restored to the intended microphone glyph.
    gr.Markdown("# 🎙️ Voice Cloning with OuteTTS")
    gr.Markdown("""
    This app uses OuteTTS to clone voices. Upload a reference audio file, provide the text being spoken in that audio (or leave blank for automatic transcription),
    and enter the new text you want to be spoken in the cloned voice.

    Note: For best results, use clear audio with minimal background noise.
    """)

    with gr.Row():
        with gr.Column():
            # ``type="filepath"`` so the callback receives a path on disk.
            audio_input = gr.Audio(label="Upload Reference Audio", type="filepath")
            reference_text = gr.Textbox(
                label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
                placeholder="Leave empty to auto-transcribe or enter the exact text from the reference audio",
            )
            text_to_speak = gr.Textbox(
                label="Text to Speak (what you want the cloned voice to say)",
                placeholder="Enter the text you want the cloned voice to speak",
            )

            # Generation settings; the slider defaults match the defaults
            # of ``process_audio_file``.
            with gr.Row():
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.1,
                    step=0.1,
                    label="Temperature (higher = more variation)",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.1,
                    step=0.1,
                    label="Repetition Penalty",
                )

            submit_btn = gr.Button("Generate Voice", variant="primary")

        with gr.Column():
            output_audio = gr.Audio(label="Generated Speech")
            output_message = gr.Textbox(label="Status", max_lines=3)

    # The callback returns (audio path or None, status message), in the
    # same order as ``outputs``.
    submit_btn.click(
        fn=process_audio_file,
        inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
        outputs=[output_audio, output_message],
    )

    gr.Markdown("""
    ### Tips for best results:
    1. Use high-quality reference audio (clear speech, minimal background noise)
    2. If providing reference text manually, ensure it matches the audio exactly
    3. If using auto-transcription, verify the transcribed text in the status message
    4. Keep generated text relatively short for better quality
    5. Adjust temperature and repetition penalty if needed:
       - Lower temperature (0.1-0.3) for more consistent output
       - Higher repetition penalty (1.1-1.3) to avoid repetition
    """)
|
|
|
# Launch the demo only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()