"""Minimal Gradio front end for the Silero text-to-speech models.

On start-up the script downloads the Silero model catalogue, loads a default
Russian model and builds a small UI in which the user picks a language, a
model and a speaker, enters plain or SSML text and gets a generated WAV file.
"""

import os
from datetime import datetime
from inspect import signature

import gradio as gr
import torch
from omegaconf import OmegaConf

# Fetch the model catalogue so the language/model dropdowns reflect what the
# upstream repository currently publishes.
torch.hub.download_url_to_file(
    "https://raw.githubusercontent.com/snakers4/silero-models/master/models.yml",
    "latest_silero_models.yml",
    progress=False,
)
all_models = OmegaConf.load("latest_silero_models.yml")

language = "ru"
model_id = "v3_1_ru"
device = torch.device("cpu")

sample_rate = 48000
speaker = "aidar"
put_accent = True
put_yo = True

# Directory that receives the generated WAV files.
output_dir = "out_audio"

models = list(all_models.tts_models.get(language).keys())


def _load_model(language, model_name):
    """Load a Silero TTS model from torch.hub and move it to ``device``.

    Args:
        language: Language key passed to the hub entry point.
        model_name: Model identifier passed as the ``speaker`` argument of the
            hub entry point.

    Returns:
        The ``(model, example_text)`` pair returned by ``torch.hub.load``.
    """
    loaded_model, sample_text = torch.hub.load(
        repo_or_dir="snakers4/silero-models",
        model="silero_tts",
        language=language,
        speaker=model_name,
    )
    loaded_model.to(device)  # gpu or cpu
    return loaded_model, sample_text


# The model is loaded exactly once at start-up; this is the instance that
# `generate_audio_by_text` uses until the user selects another model.
model, example_text = _load_model(language, model_id)


def change_language(language):
    """Refresh the model dropdown after the language selection changed.

    Args:
        language: Language key selected in the UI.

    Returns:
        A Gradio update that replaces the choices of ``model_input`` with the
        models listed for ``language`` in the downloaded catalogue.
    """
    language_models = list(all_models.tts_models.get(language).keys())
    return model_input.update(choices=language_models)


def change_model(language, model_name):
    """Load the selected model and refresh the speaker dropdown.

    The freshly loaded model replaces the module-level ``model`` so that
    subsequent calls to ``generate_audio_by_text`` synthesise with it.

    Args:
        language: Language key selected in the UI.
        model_name: Model identifier selected in the UI.

    Returns:
        A Gradio update for ``speaker_input`` carrying the new model's
        speakers; the selected value is reset to the first speaker (or cleared
        when the model lists none) because the previous selection may not
        exist in the new model.
    """
    # Without `global` the new model would stay local to this function and the
    # generator would keep using the model loaded at start-up.
    global model
    model, _ = _load_model(language, model_name)
    speakers = model.speakers
    return speaker_input.update(
        choices=speakers,
        value=speakers[0] if speakers else None,
    )


def generate_audio_by_text(text, text_type, speaker):
    """Synthesise ``text`` with the current model and save it as a WAV file.

    Args:
        text: Text to synthesise, either plain text or SSML markup.
        text_type: ``"SSML"`` to pass ``text`` as ``ssml_text``; any other
            value passes it as plain ``text``.
        speaker: Speaker name forwarded to the model.

    Returns:
        The result of ``model.save_wav``; it is wired to the audio output
        component (presumably the path of the written file -- confirm against
        the silero documentation).
    """
    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    # ':' is not allowed in file names on some platforms, hence the replace.
    timestamp = datetime.now().isoformat().replace(":", "-")
    output = os.path.join(output_dir, f"{timestamp}.wav")

    # Both call variants differ only in the keyword that carries the text.
    text_argument = "ssml_text" if text_type == "SSML" else "text"
    return model.save_wav(
        audio_path=output,
        speaker=speaker,
        sample_rate=sample_rate,
        put_accent=put_accent,
        put_yo=put_yo,
        **{text_argument: text},
    )


with gr.Blocks() as demo:
    with gr.Row():
        # Left column: all inputs.
        with gr.Column():
            language_input = gr.Dropdown(
                label="Language",
                choices=list(all_models.tts_models.keys()),
                value="ru",
                interactive=True,
            )
            model_input = gr.Dropdown(
                label="Model (based on selected language)",
                value="v3_1_ru",
                choices=models,
                interactive=True,
            )
            speaker_input = gr.Dropdown(
                label="Speaker (based on selected model)",
                value="kseniya",
                choices=model.speakers,
                interactive=True,
            )
            text_input = gr.Textbox(
                label="Text for generating",
                value="В недрах тундры выдры в г+етрах т+ырят в вёдра +ядра к+едров.",
                lines=5,
                interactive=True,
            )
            text_type_input = gr.Radio(
                label="Text type",
                choices=["Common", "SSML"],
                value="Common",
                interactive=True,
            )

            # Cascade: language -> available models -> available speakers.
            language_input.change(
                change_language,
                inputs=language_input,
                outputs=model_input,
            )
            model_input.change(
                change_model,
                inputs=[language_input, model_input],
                outputs=speaker_input,
            )

        # Right column: generated audio and the trigger button.
        with gr.Column():
            audio_output = gr.Audio(label="Output audio")
            generate_btn = gr.Button(value="Generate", variant="primary")
            generate_btn.click(
                generate_audio_by_text,
                inputs=[text_input, text_type_input, speaker_input],
                outputs=audio_output,
            )

    gr.Markdown(
        "This is a simple frontend for [silero](https://github.com/snakers4/silero-models) project (Text-To-Speech part only)."
    )
    gr.Markdown(
        "You can check [official docs](https://github.com/snakers4/silero-models/wiki/SSML) to find information about SSML syntax."
    )

demo.launch()