tts-silero / app.py
NeuroSenko's picture
formatting fix
5d68da9
raw
history blame
4.22 kB
import os
from datetime import datetime
from inspect import signature
import gradio as gr
import torch
from omegaconf import OmegaConf
# Download the current Silero model catalogue; it drives the language and
# model dropdowns in the UI.
torch.hub.download_url_to_file(
    "https://raw.githubusercontent.com/snakers4/silero-models/master/models.yml",
    "latest_silero_models.yml",
    progress=False,
)
all_models = OmegaConf.load("latest_silero_models.yml")

# Defaults used for the model loaded at start-up.
language = "ru"
model_id = "v3_1_ru"
device = torch.device("cpu")

# Load the default model exactly once. The previous code issued the same
# torch.hub.load call twice and kept the second result, which had never been
# moved to `device`; a hard-coded example_text literal in between was
# overwritten by that second load, so example_text still ends up holding the
# value returned by the hub.
model, example_text = torch.hub.load(
    repo_or_dir="snakers4/silero-models",
    model="silero_tts",
    language=language,
    speaker=model_id,
)
model.to(device)  # gpu or cpu

# Synthesis settings read by generate_audio_by_text.
sample_rate = 48000
speaker = "aidar"
put_accent = True
put_yo = True

# Model ids offered for the default language.
models = list(all_models.tts_models.get(language).keys())
def change_language(language):
    """Return a model-dropdown update listing the models of *language*.

    Looks the language up in the catalogue loaded into ``all_models`` and
    passes the ids found there as the new choices of ``model_input``.
    """
    language_models = all_models.tts_models.get(language)
    return model_input.update(choices=list(language_models.keys()))
def change_model(language, model_name):
    """Load the selected Silero TTS model and refresh the speaker dropdown.

    Args:
        language: Language key passed to ``torch.hub.load`` as ``language``.
        model_name: Model id passed to ``torch.hub.load`` as ``speaker``.

    Returns:
        The result of ``speaker_input.update`` with the new model's
        ``speakers`` as choices.

    NOTE(review): the hub result is unpacked as a 2-tuple; presumably some
    older catalogue entries return a different shape -- confirm against the
    silero-models hub entry point.
    """
    # Rebind the module-level model: generate_audio_by_text reads that global,
    # so assigning only a local here would leave synthesis on the old model.
    global model
    loaded_model, _example_text = torch.hub.load(
        repo_or_dir='snakers4/silero-models',
        model='silero_tts',
        language=language,
        speaker=model_name
    )
    # Same device placement as the start-up load.
    loaded_model.to(device)
    # Swap the global only after loading succeeded, so a failed load keeps
    # the previously working model in place.
    model = loaded_model
    return speaker_input.update(choices=model.speakers)
def generate_audio_by_text(text, text_type, speaker):
    """Synthesize *text* with the current model and save it as a WAV file.

    Args:
        text: Plain text or SSML markup to synthesize.
        text_type: ``'SSML'`` sends *text* as ``ssml_text``; any other value
            sends it as ``text``.
        speaker: Speaker name forwarded to ``model.save_wav``.

    Returns:
        Whatever ``model.save_wav`` returns (presumably the path of the
        written file, since the result feeds the audio output -- TODO confirm).
    """
    output_dir = "out_audio"
    # The target directory is not guaranteed to exist on a fresh checkout;
    # create it so writing the file cannot fail on a missing folder.
    os.makedirs(output_dir, exist_ok=True)

    # ':' is not allowed in file names on some platforms, so replace it.
    output_file_name = "{datetime}.wav".format(datetime=datetime.now().isoformat().replace(':', '-'))
    output = os.path.join(output_dir, output_file_name)

    # The two input modes differ only in which keyword carries the text.
    text_keyword = "ssml_text" if text_type == 'SSML' else "text"
    return model.save_wav(
        audio_path=output,
        speaker=speaker,
        sample_rate=sample_rate,
        put_accent=put_accent,
        put_yo=put_yo,
        **{text_keyword: text},
    )
# Build the UI: inputs in the left column, generated audio in the right one,
# and project links underneath.
with gr.Blocks() as demo:
    with gr.Row():
        # Left column: everything the user can configure.
        with gr.Column():
            language_input = gr.Dropdown(
                choices=list(all_models.tts_models.keys()),
                value="ru",
                label="Language",
                interactive=True,
            )
            model_input = gr.Dropdown(
                choices=models,
                value="v3_1_ru",
                label="Model (based on selected language)",
                interactive=True,
            )
            speaker_input = gr.Dropdown(
                choices=model.speakers,
                value="kseniya",
                label="Speaker (based on selected model)",
                interactive=True,
            )
            text_input = gr.Textbox(
                value="В недрах тундры выдры в г+етрах т+ырят в вёдра +ядра к+едров.",
                label="Text for generating",
                lines=5,
                interactive=True,
            )
            text_type_input = gr.Radio(
                choices=["Common", "SSML"],
                value="Common",
                label="Text type",
                interactive=True,
            )

            # Cascade: language -> model list, model -> speaker list.
            language_input.change(
                fn=change_language,
                inputs=language_input,
                outputs=model_input,
            )
            model_input.change(
                fn=change_model,
                inputs=[language_input, model_input],
                outputs=speaker_input,
            )

        # Right column: result player and the trigger button.
        with gr.Column():
            audio_output = gr.Audio(label="Output audio")
            generate_btn = gr.Button(value="Generate", variant="primary")
            generate_btn.click(
                fn=generate_audio_by_text,
                inputs=[text_input, text_type_input, speaker_input],
                outputs=audio_output,
            )

    gr.Markdown(
        value="This is a simple frontend for [silero](https://github.com/snakers4/silero-models) project (Text-To-Speech part only)."
    )
    gr.Markdown(
        value="You can check [official docs](https://github.com/snakers4/silero-models/wiki/SSML) to find information about SSML syntax."
    )

demo.launch()