|
import gradio as gr |
|
import torch |
|
from TTS.api import TTS |
|
import os |
|
import spaces |
|
import tempfile |
|
|
|
# Must be set before the model is loaded further down the module.
# NOTE(review): presumably pre-accepts the Coqui terms-of-service prompt so the
# model download does not wait for interactive input — confirm against TTS docs.
os.environ.update({"COQUI_TOS_AGREED": "1"})

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
|
|
|
|
|
def load_tts_model():
    """Create the XTTS v2 multilingual model and move it onto ``device``.

    Returns:
        The ``TTS`` object returned by ``.to(device)``.
    """
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    model = TTS(model_name)
    return model.to(device)


# Loaded once at import time and shared by every request handler below.
tts = load_tts_model()
|
|
|
|
|
# Maps the name offered in the "Celebrity Voice" dropdown to the reference
# audio clip that tts_generate() passes to the model as ``speaker_wav``.
# NOTE(review): the two "path/to/..." entries look like unfilled placeholders —
# confirm the real sample locations before exposing those voices.
celebrity_voices = {
    "morgan": "./voices/morgan.mp3",
    "Scarlett Johansson": "path/to/scarlett_johansson_sample.wav",
    "David Attenborough": "path/to/david_attenborough_sample.wav",
}
|
# NOTE(review): ``duration=120`` is presumably the GPU time budget in seconds
# requested per call — confirm against the `spaces` documentation.
@spaces.GPU(duration=120)
def tts_generate(text, voice, language):
    """Synthesize ``text`` in one of the preset voices and return the WAV path.

    Args:
        text: The sentence(s) to speak.
        voice: A key of ``celebrity_voices`` (the dropdown selection). May be
            ``None`` when the user has not picked anything yet.
        language: Language code forwarded to the model (e.g. ``"en"``).

    Returns:
        Path of a newly created temporary ``.wav`` file holding the audio.

    Raises:
        gr.Error: If the text is empty, no known voice is selected, or the
            reference clip for the voice does not exist on disk.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to speak.")

    # The dropdown has no default value, so `voice` is None until the user
    # chooses one; a plain dict lookup would surface as an opaque KeyError.
    speaker_wav = celebrity_voices.get(voice)
    if speaker_wav is None:
        raise gr.Error("Please select a voice.")
    if not os.path.isfile(speaker_wav):
        raise gr.Error(f"Reference audio for '{voice}' was not found: {speaker_wav}")

    # delete=False: the file must outlive this function so Gradio can serve it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio_path = temp_audio.name

    try:
        tts.tts_to_file(
            text=text,
            speaker_wav=speaker_wav,
            language=language,
            file_path=temp_audio_path,
        )
    except Exception:
        # Synthesis failed: nobody will ever read this file, so do not leak it.
        try:
            os.remove(temp_audio_path)
        except OSError:
            pass
        raise

    return temp_audio_path
|
|
|
# NOTE(review): unlike tts_generate, no explicit ``duration`` is requested
# here — confirm the default GPU time budget is sufficient for cloning.
@spaces.GPU(enable_queue=True)
def clone_voice(text, audio_file, language):
    """Synthesize ``text`` in the voice of an uploaded reference recording.

    Args:
        text: The sentence(s) to speak.
        audio_file: Filesystem path of the reference recording (the Audio
            input uses ``type="filepath"``). ``None`` when nothing was uploaded.
        language: Language code forwarded to the model (e.g. ``"en"``).

    Returns:
        Path of a newly created temporary ``.wav`` file holding the audio.

    Raises:
        gr.Error: If the text is empty or no reference audio was provided.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to speak.")

    # Without an upload the Audio component hands us None, which would
    # otherwise be passed straight through to the model as speaker_wav.
    if not audio_file:
        raise gr.Error("Please provide a voice reference audio file.")

    # delete=False: the file must outlive this function so Gradio can serve it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio_path = temp_audio.name

    try:
        tts.tts_to_file(
            text=text,
            speaker_wav=audio_file,
            language=language,
            file_path=temp_audio_path,
        )
    except Exception:
        # Synthesis failed: nobody will ever read this file, so do not leak it.
        try:
            os.remove(temp_audio_path)
        except OSError:
            pass
        raise

    return temp_audio_path
|
|
|
|
|
def talking_image_placeholder():
    """Return the notice shown while the Talking Image feature is unimplemented."""
    notice = "Talking Image functionality not implemented yet."
    return notice
|
|
|
|
|
# Language codes offered by both the "TTS" and "Clone Voice" tabs.
_LANGUAGE_CODES = ("en", "es", "fr", "de", "it", "ar")

# NOTE(review): the source's indentation was lost; the nesting below (three
# inputs per Row, button and output beneath it) is reconstructed — confirm.
with gr.Blocks() as demo:
    gr.Markdown("# Advanced Voice Synthesis")

    with gr.Tabs():
        # --- Tab 1: speak text in one of the preset voices -----------------
        with gr.TabItem("TTS"):
            with gr.Row():
                tts_text = gr.Textbox(label="Text to speak")
                tts_voice = gr.Dropdown(
                    choices=list(celebrity_voices.keys()),
                    label="Celebrity Voice",
                )
                tts_language = gr.Dropdown(
                    choices=list(_LANGUAGE_CODES),
                    label="Language",
                    value="en",
                )
            tts_generate_btn = gr.Button("Generate")
            tts_output = gr.Audio(label="Generated Audio")

            tts_generate_btn.click(
                fn=tts_generate,
                inputs=[tts_text, tts_voice, tts_language],
                outputs=tts_output,
            )

        # --- Tab 2: not implemented yet, shows a notice only ---------------
        with gr.TabItem("Talking Image"):
            gr.Markdown("Talking Image functionality coming soon!")

        # --- Tab 3: speak text in the voice of an uploaded recording -------
        with gr.TabItem("Clone Voice"):
            with gr.Row():
                clone_text = gr.Textbox(label="Text to speak")
                clone_audio = gr.Audio(
                    label="Voice reference audio file",
                    type="filepath",
                )
                clone_language = gr.Dropdown(
                    choices=list(_LANGUAGE_CODES),
                    label="Language",
                    value="en",
                )
            clone_generate_btn = gr.Button("Generate")
            clone_output = gr.Audio(label="Generated Audio")

            clone_generate_btn.click(
                fn=clone_voice,
                inputs=[clone_text, clone_audio, clone_language],
                outputs=clone_output,
            )
|
|
|
# JavaScript snippet that reloads the page with ``?__theme=dark`` unless that
# query parameter is already set to 'dark'.
# NOTE(review): nothing in the visible code references ``js_func`` (it is not
# passed to gr.Blocks or demo.launch), so it currently has no effect — confirm
# whether it was meant to be wired in or can be removed.
js_func = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""
|
|
|
|
|
|
|
# Start the web UI; the cleanup in ``finally`` runs once launch() returns or
# raises, so generated audio is removed on shutdown either way.
try:
    demo.launch()
finally:
    # tts_generate()/clone_voice() call NamedTemporaryFile() without ``dir=``,
    # so their output lands in the system temp directory — not the current
    # working directory, which is what the previous cleanup scanned.
    temp_dir = tempfile.gettempdir()
    # NOTE(review): this matches every "tmp*.wav" in the shared temp directory,
    # including files created by other processes — confirm that is acceptable.
    for file_name in os.listdir(temp_dir):
        if file_name.startswith("tmp") and file_name.endswith(".wav"):
            try:
                os.remove(os.path.join(temp_dir, file_name))
            except OSError:
                # Best-effort cleanup: a file that vanished or is locked must
                # not stop the remaining files from being removed.
                continue