|
import os |
|
|
|
import gradio as gr |
|
import torch |
|
from styletts2importable import compute_style, device, inference |
|
from txtsplit import txtsplit |
|
import numpy as np |
|
import phonemizer |
|
|
|
|
|
# Gradio theme for the whole app: Libre Franklin as the primary face,
# with Public Sans and generic system fonts as fallbacks.
_font_stack = [
    gr.themes.GoogleFont("Libre Franklin"),
    gr.themes.GoogleFont("Public Sans"),
    "system-ui",
    "sans-serif",
]
theme = gr.themes.Base(font=_font_stack)
|
# Preset reference voices shipped with the demo: four female ("f") and four
# male ("m") US-English speakers, each backed by a voices/<name>.wav file.
voicelist = [f"{sex}-us-{idx}" for sex in ("f", "m") for idx in range(1, 5)]

# Mapping of voice name -> precomputed style embedding, populated at startup.
voices = {}
|
|
|
# Shared eSpeak grapheme-to-phoneme backend for US English, keeping
# punctuation and stress markers in the phonemized output.
_espeak_options = {
    "language": "en-us",
    "preserve_punctuation": True,
    "with_stress": True,
}
global_phonemizer = phonemizer.backend.EspeakBackend(**_espeak_options)
|
|
|
# Load (or lazily compute and cache) the style embedding for every preset
# voice. Embeddings are persisted next to the source wav as <name>.wav.npy
# so later startups skip the expensive compute_style call.
for name in voicelist:
    wav_path = f"voices/{name}.wav"
    npy_path = wav_path + ".npy"
    if os.path.exists(npy_path):
        # Cache hit: restore the saved embedding onto the inference device.
        voices[name] = torch.from_numpy(np.load(npy_path)).to(device)
        continue
    # Cache miss: compute the embedding once and persist it for next time.
    embedding = compute_style(wav_path)
    voices[name] = embedding
    np.save(npy_path, embedding.cpu().numpy())
|
|
|
|
|
def synthesize(text, voice, lngsteps):
    """Synthesize speech for `text` using a preset voice.

    Parameters
    ----------
    text : str
        Text to read aloud; must be non-empty and under 50k characters.
    voice : str
        Name of a preset voice from `voicelist` (matched case-insensitively).
    lngsteps : int
        Number of diffusion steps forwarded to `inference`.

    Returns
    -------
    tuple[int, np.ndarray]
        Sample rate (24000 Hz) and the concatenated waveform.

    Raises
    ------
    gr.Error
        If the text is empty or exceeds the length limit.
    """
    if not text.strip():
        raise gr.Error("You must enter some text")
    if len(text) > 50000:
        raise gr.Error("Text must be <50k characters")
    print("*** saying ***")
    print(text)
    print("*** end ***")
    # Hoist the style-embedding lookup out of the per-segment loop.
    style = voices[voice.lower()]
    # Split long input into chunks the model can handle, render each, then
    # stitch the waveforms back together.
    rendered = [
        inference(
            segment,
            style,
            alpha=0.3,
            beta=0.7,
            diffusion_steps=lngsteps,
            embedding_scale=1,
        )
        for segment in txtsplit(text)
    ]
    return (24000, np.concatenate(rendered))
|
|
|
|
|
# "Multi-Voice" tab layout: text box, preset-voice dropdown and
# diffusion-steps slider on the left; synthesize button and audio player
# on the right. The button wires the inputs into synthesize().
with gr.Blocks() as vctk:
    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Text",
                info="What would you like StyleTTS 2 to read? It works better on full sentences.",
                interactive=True,
            )
            voice_choice = gr.Dropdown(
                voicelist,
                label="Voice",
                info="Select a default voice.",
                value="m-us-2",
                interactive=True,
            )
            steps_slider = gr.Slider(
                minimum=3,
                maximum=15,
                value=3,
                step=1,
                label="Diffusion Steps",
                info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster",
                interactive=True,
            )

        with gr.Column(scale=1):
            synth_button = gr.Button("Synthesize", variant="primary")
            audio_player = gr.Audio(
                interactive=False,
                label="Synthesized Audio",
                waveform_options={"waveform_progress_color": "#3C82F6"},
            )
            synth_button.click(
                synthesize,
                inputs=[text_input, voice_choice, steps_slider],
                outputs=[audio_player],
                concurrency_limit=4,
            )
|
|
|
# Top-level app shell wrapping the multi-voice demo in a tabbed interface.
# NOTE: the original passed four tab titles ("Multi-Voice", "Voice Cloning",
# "LJSpeech", "Long Text [Beta]") for a single interface; TabbedInterface
# pairs interfaces and titles positionally, so the extra three were dead
# config. Only the title for the one real tab is kept.
with gr.Blocks(
    title="StyleTTS 2", css="footer{display:none !important}", theme=theme
) as demo:
    gr.TabbedInterface([vctk], ["Multi-Voice"])
|
if __name__ == "__main__":
    print("Launching")
    # Enable the request queue (unbounded, API open) and start the server;
    # the Gradio API docs page itself is hidden from the UI.
    app = demo.queue(api_open=True, max_size=None)
    app.launch(show_api=False)
    print("Launched")
|
|