|
import gradio as gr |
|
import torch |
|
from outetts.v0_1.interface import InterfaceGGUF |
|
import soundfile as sf |
|
import tempfile |
|
import os |
|
from faster_whisper import WhisperModel |
|
import huggingface_hub |
|
|
|
def download_model():
    """Fetch the OuteTTS GGUF weights file from the Hugging Face Hub.

    Returns:
        Whatever ``huggingface_hub.hf_hub_download`` returns for the requested
        file; the caller uses it as the model path.
    """
    repo_id = "OuteAI/OuteTTS-0.1-350M-GGUF"
    weights_file = "outetts-0.1-350m.gguf"
    return huggingface_hub.hf_hub_download(repo_id=repo_id, filename=weights_file)
|
|
|
def initialize_models():
    """Build the OuteTTS interface and the Faster-Whisper transcriber.

    Returns:
        tuple: ``(tts_interface, asr_model)`` -- an ``InterfaceGGUF`` created
        from the downloaded GGUF file, and a ``WhisperModel("tiny")``
        configured with the CPU settings below.
    """
    whisper_options = {
        "device": "cpu",
        "compute_type": "int8",
        "num_workers": 1,
        "cpu_threads": 1,
    }

    # Same construction order as before: download, TTS interface, then ASR.
    tts_interface = InterfaceGGUF(download_model())
    asr_model = WhisperModel("tiny", **whisper_options)
    return tts_interface, asr_model
|
|
|
|
|
# Both models are created once, at import time; transcribe_audio() and
# process_audio_file() read these module-level names on every call.
_LOADED_MODELS = initialize_models()
TTS_INTERFACE, ASR_MODEL = _LOADED_MODELS
|
|
|
def transcribe_audio(audio_path):
    """Return the text that the Faster-Whisper "tiny" model hears in a file.

    Args:
        audio_path: Audio input, forwarded unchanged to
            ``ASR_MODEL.transcribe``.

    Returns:
        str: The segment texts joined with single spaces and stripped. If any
        ``Exception`` is raised, a message of the form
        ``"Error transcribing audio: <details>"`` is returned instead, so
        failures are reported in-band rather than raised.
    """
    decode_options = dict(
        beam_size=1,
        best_of=1,
        temperature=1.0,
        condition_on_previous_text=False,
        compression_ratio_threshold=2.4,
        log_prob_threshold=-1.0,
        no_speech_threshold=0.6,
    )

    try:
        segments, _info = ASR_MODEL.transcribe(audio_path, **decode_options)
        # The segments are consumed inside the ``try`` so that anything raised
        # while iterating them is also turned into the error string.
        pieces = []
        for segment in segments:
            pieces.append(segment.text)
        return " ".join(pieces).strip()
    except Exception as exc:
        return f"Error transcribing audio: {str(exc)}"
|
|
|
def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
    """Clone the voice in ``audio_path`` and speak ``text_to_speak`` with it.

    Args:
        audio_path: Reference recording; passed to ``transcribe_audio`` (when
            needed) and to ``TTS_INTERFACE.create_speaker``.
        reference_text: Transcript of the reference recording. When blank or
            ``None`` the recording is transcribed with ``transcribe_audio``.
            Only the first 4000 characters are passed on.
        text_to_speak: Text to synthesize; only the first 500 characters are
            used.
        temperature: Forwarded to ``TTS_INTERFACE.generate``.
        repetition_penalty: Forwarded to ``TTS_INTERFACE.generate``.

    Returns:
        tuple: ``(wav_path, status_message)`` on success, or
        ``(None, message)`` when input is missing, transcription fails, or any
        ``Exception`` is raised while generating.
    """
    # Must match the prefix produced by transcribe_audio() on failure. Testing
    # for the bare word "Error" would reject genuine transcripts that merely
    # begin with that word.
    transcription_error_prefix = "Error transcribing audio:"

    # Validate the inputs up front so the user gets a clear message instead of
    # an opaque exception text from deep inside the models.
    if not audio_path:
        return None, "Please upload audio first."
    if not (text_to_speak or "").strip():
        return None, "Please enter the text to speak."

    try:
        reference_text = reference_text or ""
        if not reference_text.strip():
            gr.Info("Transcribing audio...")
            reference_text = transcribe_audio(audio_path)
            if reference_text.startswith(transcription_error_prefix):
                return None, reference_text

        gr.Info(f"Using reference text: {reference_text}")

        speaker = TTS_INTERFACE.create_speaker(audio_path, reference_text[:4000])

        output = TTS_INTERFACE.generate(
            text=text_to_speak[:500],
            speaker=speaker,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            # NOTE(review): "max_lenght" is kept exactly as the original call
            # spelled it; presumably this is the keyword the OuteTTS v0.1
            # interface expects -- confirm before renaming.
            max_lenght=2048,
        )

        # Only the unique path is needed. Close the handle immediately so a
        # file descriptor is not leaked on every request; delete=False keeps
        # the file on disk for the caller.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            output_path = temp_file.name
        output.save(output_path)

        status = (
            "Processing complete!\n"
            f"Reference text: {reference_text[:500]}...\n"
            "(Showing first 500 characters of reference text)"
        )
        return output_path, status

    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"Error: {str(e)}"
|
|
|
|
|
# Gradio UI: inputs in the left column (reference audio, its transcript, the
# text to speak, sampling sliders), results in the right column.
# NOTE(review): the leading characters of the heading and of both button labels
# look like mis-encoded emoji. They are reproduced as found; restore the
# intended glyphs from the original file.
with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
    gr.Markdown("# ποΈ Voice Cloning with OuteTTS (GGUF)")
    gr.Markdown("""
    This app uses the GGUF version of OuteTTS for optimized CPU performance. Upload a reference audio file,
    provide the text being spoken in that audio (or leave blank for automatic transcription),
    and enter the new text you want to be spoken in the cloned voice.

    Note:
    - For best results, use clear audio with minimal background noise
    - Reference text is limited to 4000 characters
    - Output text is limited to 500 characters
    """)

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Upload Reference Audio", type="filepath")
            with gr.Row():
                transcribe_btn = gr.Button("π Transcribe Audio", variant="secondary")

            reference_text = gr.Textbox(
                label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
                placeholder="Click 'Transcribe Audio' or enter the exact text from the reference audio",
                lines=3,
            )
            text_to_speak = gr.Textbox(
                label="Text to Speak (what you want the cloned voice to say, max 500 characters)",
                placeholder="Enter the text you want the cloned voice to speak",
                lines=3,
                max_lines=5,
            )

            with gr.Row():
                temperature = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.1, step=0.1,
                    label="Temperature (higher = more variation)",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.1,
                    label="Repetition Penalty",
                )

            submit_btn = gr.Button("ποΈ Generate Voice", variant="primary")

        with gr.Column():
            output_audio = gr.Audio(label="Generated Speech")
            output_message = gr.Textbox(label="Status", lines=4)

    def transcribe_button(audio):
        """Transcribe the uploaded audio for the reference-text box.

        Problems are raised as ``gr.Error`` rather than returned. The return
        value is written into the reference-text input, so a returned message
        would be used as the transcript on the next "Generate" click.
        """
        if not audio:
            raise gr.Error("Please upload audio first.")
        transcript = transcribe_audio(audio)
        # transcribe_audio() reports failures in-band with this exact prefix.
        if transcript.startswith("Error transcribing audio:"):
            raise gr.Error(transcript)
        return transcript

    transcribe_btn.click(
        fn=transcribe_button,
        inputs=[audio_input],
        outputs=[reference_text],
    )

    submit_btn.click(
        fn=process_audio_file,
        inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
        outputs=[output_audio, output_message],
    )

    gr.Markdown("""
    ### Tips for best results:
    1. Use high-quality reference audio (clear speech, minimal background noise)
    2. Try to keep reference audio under 30 seconds
    3. If auto-transcription isn't accurate, you can manually correct the text
    4. Keep generated text short for better quality
    5. Adjust temperature and repetition penalty if needed:
       - Lower temperature (0.1-0.3) for more consistent output
       - Higher repetition penalty (1.1-1.3) to avoid repetition
    """)
|
|
|
# Start the Gradio app only when this file is executed as a script; importing
# the module builds ``demo`` but does not launch it.
if __name__ == "__main__":
    demo.launch()