File size: 3,404 Bytes
7ae4987 7f0cc16 7514dcc 41b7aed 7514dcc b8920a0 b79ce49 51e62ea b79ce49 b8920a0 7514dcc aa00ca9 7514dcc b8920a0 7514dcc b79ce49 7f0cc16 2c3e79f 7f0cc16 f47653c 7f0cc16 f47653c 7f0cc16 f47653c 7f0cc16 2c3e79f 7f0cc16 f47653c 2c3e79f f47653c 7f0cc16 4e5413c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
from engine import Piper
import tempfile
from typing import Optional
from TTS.config import load_config
import gradio as gr
import numpy as np
import os
import json
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
MAX_TXT_LEN = 100
SPEAKERS = ['f_cen_05', 'f_cen_81', 'f_occ_31', 'f_occ_de', 'f_sep_31', 'm_cen_08', 'm_occ_44', 'm_val_89']
def carrega_bsc():
model_path = os.getcwd() + "/models/bsc/best_model.pth"
config_path = os.getcwd() + "/models/bsc/config.json"
speakers_file_path = os.getcwd() + "/models/bsc/speakers.pth"
vocoder_path = None
vocoder_config_path = None
synthesizer = Synthesizer(
model_path, config_path, speakers_file_path, None, vocoder_path, vocoder_config_path,
)
return synthesizer
def carrega_collectivat():
model_path = os.getcwd() + "/models/collectivat/fast-speech_best_model.pth"
config_path = os.getcwd() + "/models/collectivat/fast-speech_config.json"
vocoder_path = os.getcwd() + "/models/collectivat/ljspeech--hifigan_v2_model_file.pth"
vocoder_config_path = os.getcwd() + "/models/collectivat/ljspeech--hifigan_v2_config.json"
synthesizer = Synthesizer(
model_path, config_path, None, None, vocoder_path, vocoder_config_path
)
return synthesizer
def carrega_piper():
return Piper(os.getcwd() + "/models/piper/ca-upc_ona-x-low.onnx")
model_bsc = carrega_bsc()
SPEAKERS = model_bsc.tts_model.speaker_manager.speaker_names
model_collectivat = carrega_collectivat()
model_piper = carrega_piper()
def tts(text, speaker_idx):
if len(text) > MAX_TXT_LEN:
text = text[:MAX_TXT_LEN]
print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
print(text)
# synthesize
wav_bsc = model_bsc.tts(text, speaker_idx)
wav_coll = model_collectivat.tts(text)
wav_piper = model_piper.synthesize(text)
#return (model_bsc.tts_config.audio["sample_rate"], wav_bsc), (22000, wav_coll), (16000, wav_piper)
# return output
fp_bsc = ""
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
model_bsc.save_wav(wav_bsc, fp)
fp_bsc = fp.name
fp_coll = ""
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
model_collectivat.save_wav(wav_coll, fp)
fp_coll = fp.name
fp_piper = ""
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
fp.write(wav_piper)
fp_piper = fp.name
return fp_bsc, fp_coll, fp_piper
description="""
1️⃣ Introdueix el text a sintetitzar.
2️⃣ Selecciona una veu en el desplegable.
3️⃣ Gaudeix!
"""
article= ""
iface = gr.Interface(
fn=tts,
inputs=[
gr.Textbox(
label="Text",
default="L'Èlia i l'Alí a l'aula. L'oli i l'ou. Lulú olorava la lila.",
),
gr.Dropdown(label="Selecciona un parlant", choices=SPEAKERS, default="ona")
],
outputs=[
gr.Audio(label="BSC VITS",type="filepath"),
gr.Audio(label="Collectivat Fastspeech",type="filepath"),
gr.Audio(label="Piper VITS",type="filepath")
],
title="🗣️ TTS Català Multi Parlant - VITS 🗣️",
description=description,
article=article,
allow_flagging="never",
layout="vertical",
live=False
)
iface.launch(server_name="0.0.0.0", server_port=7860)
|