|
import gradio as gr |
|
import torch |
|
from outetts.v0_1.interface import InterfaceHF |
|
import soundfile as sf |
|
import tempfile |
|
|
|
def initialize_model():
    """Return the OuteTTS interface, constructing it only on the first call.

    The ``InterfaceHF`` instance for ``"OuteAI/OuteTTS-0.1-350M"`` is stored
    on the function object after it is first built, so later calls reuse it
    instead of constructing a new interface every time (this file calls the
    function once per generation request).

    Returns:
        The shared ``InterfaceHF`` instance.
    """
    interface = getattr(initialize_model, "_interface", None)
    if interface is None:
        interface = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
        # Cache only after successful construction so a failed load is
        # retried on the next call rather than remembered.
        initialize_model._interface = interface
    return interface
|
|
|
def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
    """Generate speech for ``text_to_speak`` using a voice cloned from a reference clip.

    Args:
        audio_path: Path to the reference audio file.
        reference_text: Text describing what is said in the reference audio.
        text_to_speak: Text the cloned voice should say.
        temperature: Forwarded unchanged to ``interface.generate``.
        repetition_penalty: Forwarded unchanged to ``interface.generate``.

    Returns:
        A ``(wav_path, status_message)`` tuple. On success ``wav_path`` is the
        path of a newly written ``.wav`` temporary file; on any failure it is
        ``None`` and ``status_message`` starts with ``"Error: "``.
    """
    # Reject missing inputs up front: this avoids loading the model just to
    # fail, and gives the user an actionable message instead of a low-level one.
    if not audio_path:
        return None, "Error: Please upload a reference audio file."
    if not isinstance(reference_text, str) or not reference_text.strip():
        return None, "Error: Please provide the reference text spoken in the audio."
    if not isinstance(text_to_speak, str) or not text_to_speak.strip():
        return None, "Error: Please enter the text to speak."

    try:
        interface = initialize_model()

        speaker = interface.create_speaker(audio_path, reference_text)

        output = interface.generate(
            text=text_to_speak,
            speaker=speaker,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            # NOTE(review): the "max_lenght" spelling is kept on purpose; it
            # presumably matches the keyword in the OuteTTS v0.1 generate()
            # signature -- confirm against the library before renaming.
            max_lenght=4096,
        )

        # Reserve a unique path, then close the handle before writing to it:
        # leaving it open leaks a file descriptor, and an already-open
        # temporary file cannot be reopened by name on every platform.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as handle:
            output_path = handle.name
        output.save(output_path)

        return output_path, "Voice cloning successful!"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"Error: {e}"
|
|
|
|
|
def create_interface():
    """Build the Gradio Blocks UI for voice cloning and return it.

    The layout is a single row with two columns: the left column holds the
    reference audio upload, the two text boxes, the temperature and
    repetition-penalty sliders and the submit button; the right column holds
    the generated audio and a status box. Clicking the button calls
    ``process_audio_file`` with the five input components and routes its two
    return values to the two output components.

    Returns:
        The ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(title="Voice Cloning with OuteTTS") as app:
        # The heading emoji was mis-encoded in the source (it showed up as
        # mojibake). It is written with escapes so it survives re-encoding.
        # NOTE(review): restored as U+1F399 + U+FE0F (studio microphone),
        # which is the presumed original glyph -- confirm.
        gr.Markdown("# \U0001F399\uFE0F Voice Cloning with OuteTTS")
        gr.Markdown("""
        This app uses OuteTTS to clone voices. Upload a reference audio file, provide the text being spoken in that audio,
        and enter the new text you want to be spoken in the cloned voice.

        Note: For best results, use clear audio with minimal background noise.
        """)

        with gr.Row():
            with gr.Column():
                # type="filepath" hands the handler a path string.
                audio_input = gr.Audio(label="Upload Reference Audio", type="filepath")
                reference_text = gr.Textbox(label="Reference Text (what is being said in the audio)")
                text_to_speak = gr.Textbox(label="Text to Speak (what you want the cloned voice to say)")

                with gr.Row():
                    # Slider defaults mirror process_audio_file's defaults (0.1 / 1.1).
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.1,
                        step=0.1,
                        label="Temperature (higher = more variation)",
                    )
                    repetition_penalty = gr.Slider(
                        minimum=1.0,
                        maximum=2.0,
                        value=1.1,
                        step=0.1,
                        label="Repetition Penalty",
                    )

                submit_btn = gr.Button("Generate Voice", variant="primary")

            with gr.Column():
                output_audio = gr.Audio(label="Generated Speech")
                output_message = gr.Textbox(label="Status")

        # Input order must match process_audio_file's positional parameters.
        submit_btn.click(
            fn=process_audio_file,
            inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
            outputs=[output_audio, output_message],
        )

        gr.Markdown("""
        ### Tips for best results:
        1. Use high-quality reference audio (clear speech, minimal background noise)
        2. Ensure reference text matches the audio exactly
        3. Keep generated text relatively short for better quality
        4. Adjust temperature and repetition penalty if needed:
           - Lower temperature (0.1-0.3) for more consistent output
           - Higher repetition penalty (1.1-1.3) to avoid repetition
        """)

    return app
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the UI and launch it with share=True.
    create_interface().launch(share=True)
|
|