# styletts2 / app.py
import os
import gradio as gr
import torch
from styletts2importable import compute_style, device, inference
from txtsplit import txtsplit
import numpy as np
import phonemizer
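
# Gradio demo for StyleTTS 2 multi-voice text-to-speech. A handful of preset
# speaker styles are loaded (or computed and cached) at startup, and a single
# "Multi-Voice" tab lets the user pick a voice and synthesize arbitrary text.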

theme = gr.themes.Base(
    font=[
        gr.themes.GoogleFont("Libre Franklin"),
        gr.themes.GoogleFont("Public Sans"),
        "system-ui",
        "sans-serif",
    ],
)
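
# Preset voice IDs; each one maps to a reference recording at voices/<id>.wav
# that is turned into a style vector in the loop below.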
voicelist = [
    "f-us-1",
    "f-us-2",
    "f-us-3",
    "f-us-4",
    "m-us-1",
    "m-us-2",
    "m-us-3",
    "m-us-4",
]
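
# Style-vector table keyed by voice ID, plus an eSpeak grapheme-to-phoneme
# backend for US English (punctuation and stress preserved).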
voices = {}
global_phonemizer = phonemizer.backend.EspeakBackend(
    language="en-us", preserve_punctuation=True, with_stress=True
)

# Load each voice's cached style embedding if present; otherwise compute it
# from the reference WAV and cache it as a .npy file.
for v in voicelist:
    cache_path = f"voices/{v}.wav.npy"
    if os.path.exists(cache_path):
        voices[v] = torch.from_numpy(np.load(cache_path)).to(device)
    else:
        style = compute_style(f"voices/{v}.wav")
        voices[v] = style
        np.save(cache_path, style.cpu().numpy())


def synthesize(text, voice, lngsteps):
    if text.strip() == "":
        raise gr.Error("You must enter some text")
    if len(text) > 50000:
        raise gr.Error("Text must be <50k characters")
    print("*** saying ***")
    print(text)
    print("*** end ***")
    # Split long input into chunks the model handles well, then synthesize each.
    texts = txtsplit(text)
    v = voice.lower()
    audios = []
    for t in texts:
        audios.append(
            inference(
                t,
                voices[v],
                alpha=0.3,
                beta=0.7,
                diffusion_steps=lngsteps,
                embedding_scale=1,
            )
        )
    # StyleTTS 2 outputs 24 kHz audio; gr.Audio expects a (sample_rate, waveform) tuple.
    return (24000, np.concatenate(audios))
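
# Example usage outside the UI (a sketch, not executed here; assumes the
# optional `soundfile` package is available for writing WAV files):
#     import soundfile as sf
#     sr, wav = synthesize("Hello from StyleTTS 2.", "m-us-2", 5)
#     sf.write("hello.wav", wav, sr)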


with gr.Blocks() as vctk:
    with gr.Row():
        with gr.Column(scale=1):
            inp = gr.Textbox(
                label="Text",
                info="What would you like StyleTTS 2 to read? It works better on full sentences.",
                interactive=True,
            )
            voice = gr.Dropdown(
                voicelist,
                label="Voice",
                info="Select a default voice.",
                value="m-us-2",
                interactive=True,
            )
            multispeakersteps = gr.Slider(
                minimum=3,
                maximum=15,
                value=3,
                step=1,
                label="Diffusion Steps",
                info="In theory, more steps should give higher quality at the cost of speed, but the difference is hard to hear. Start with fewer steps; it is faster.",
                interactive=True,
            )
            # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
        with gr.Column(scale=1):
            btn = gr.Button("Synthesize", variant="primary")
            audio = gr.Audio(
                interactive=False,
                label="Synthesized Audio",
                waveform_options={"waveform_progress_color": "#3C82F6"},
            )
    btn.click(
        synthesize,
        inputs=[inp, voice, multispeakersteps],
        outputs=[audio],
        concurrency_limit=4,
    )

with gr.Blocks(
    title="StyleTTS 2", css="footer{display:none !important}", theme=theme
) as demo:
    # Only the multi-voice interface is included in this build; the voice
    # cloning, LJSpeech, and long-text tabs are not mounted here.
    gr.TabbedInterface([vctk], ["Multi-Voice"])


if __name__ == "__main__":
    # demo.queue(api_open=False, max_size=15).launch(show_api=False)
    print("Launching")
    # start_time = time.time()
    # synthesize(
    #     "defines how the endpoint appears in the API docs. Can be a string, None, or False. If set to a string, the endpoint will be exposed in the API docs with the given name. If None (default), the name of the function will be used as the API endpoint. If False, the endpoint will not be exposed in the API docs and downstream apps (including those that gr.load this app) will not be able to use this event.",
    #     "m-us-2",
    #     3,
    # )
    # print(f"Launched in {time.time() - start_time} seconds")
    # second_start_time = time.time()
    # synthesize(
    #     "defines how the endpoint appears in the API docs. Can be a string, None, or False. If set to a string, the endpoint will be exposed in the API docs with the given name. If None (default), the name of the function will be used as the API endpoint. If False, the endpoint will not be exposed in the API docs and downstream apps (including those that gr.load this app) will not be able to use this event.",
    #     "m-us-2",
    #     3,
    # )
    # print(f"Launched in {time.time() - second_start_time} seconds")
    demo.queue(api_open=True, max_size=None).launch(show_api=False)
    print("Launched")