"""Gradio demo: Gronings text-to-speech using the ``bartelds/gos_tts`` model.

Running this module loads the pretrained ESPnet model on CPU and launches a
Gradio interface that synthesizes the entered text, one input line at a time,
in the selected Gronings variant.
"""

import os
import time
import urllib.request
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

# Loaded once at import time and shared by every request.
gos_text2speech = Text2Speech.from_pretrained(
    model_tag="bartelds/gos_tts",
    device="cpu",
    speed_control_alpha=1.0,
    noise_scale=1.0,
    noise_scale_dur=1.0,
)

# Variant name shown in the UI -> value passed to the model as ``sids``
# (presumably the speaker id the model was trained with for that variant).
VARIANT_SPEAKER_IDS = {
    "Hoogelaandsters": 1,
    "Oldambsters": 2,
    "Westerkertaaiers": 3,
}

# NOTE(review): a single fixed path is shared by all requests; this is kept
# from the original behaviour so the returned/downloaded file name is stable.
OUTPUT_WAV_PATH = "out.wav"


def inference(text, lang):
    """Synthesize ``text`` in the given Gronings variant and write a WAV file.

    The text is split into lines; each non-blank line is lower-cased and
    synthesized separately, and the resulting waveforms are concatenated in
    order and written to ``out.wav`` at the model's sampling rate.

    Args:
        text: Input text; may span several lines.
        lang: Variant name, one of the keys of ``VARIANT_SPEAKER_IDS``.

    Returns:
        A pair ``(path, path)`` with the written WAV path twice: once for the
        audio-player output and once for the file-download output.

    Raises:
        ValueError: If ``lang`` is not a known variant, or if ``text``
            contains no non-blank line.
    """
    # Fail loudly on an unknown variant instead of hitting an unbound (or
    # stale, from a previous line) ``wav`` variable.
    try:
        speaker_id = VARIANT_SPEAKER_IDS[lang]
    except KeyError:
        raise ValueError(
            f"Unknown variant {lang!r}; expected one of "
            f"{sorted(VARIANT_SPEAKER_IDS)}."
        ) from None

    sids = np.array([speaker_id])
    outputs = []
    with torch.no_grad():
        for line in (text or "").splitlines():
            # Blank lines carry nothing to synthesize; skip them rather than
            # feeding an empty string to the model.
            if not line.strip():
                continue
            wav = gos_text2speech(line.lower(), sids=sids)["wav"]
            outputs.append(wav.view(-1).cpu().numpy())

    if not outputs:
        # np.concatenate([]) would fail with an opaque message.
        raise ValueError(
            "Input text is empty; enter at least one non-blank line."
        )

    concatenated_wav = np.concatenate(outputs)
    sf.write(OUTPUT_WAV_PATH, concatenated_wav, gos_text2speech.fs)
    return OUTPUT_WAV_PATH, OUTPUT_WAV_PATH


title = "Gronings text-to-speech"
examples = [
    ['Mamme mos even noar winkel om n bosschop.', 'Hoogelaandsters']
]

gr.Interface(
    inference,
    [
        gr.inputs.Textbox(label="Input text", lines=3),
        gr.inputs.Radio(
            # Same names, same order as the original hard-coded list.
            choices=list(VARIANT_SPEAKER_IDS),
            type="value",
            default="Hoogelaandsters",
            label="Variant",
        ),
    ],
    [
        gr.outputs.Audio(type="file", label="Output"),
        gr.outputs.File(),
    ],
    title=title,
    examples=examples,
).launch(enable_queue=True)