|
from engine import Piper |
|
import tempfile |
|
from typing import Optional |
|
from TTS.config import load_config |
|
import gradio as gr |
|
import numpy as np |
|
import os |
|
import json |
|
from TTS.utils.manage import ModelManager |
|
from TTS.utils.synthesizer import Synthesizer |
|
from espeak_phonemizer import Phonemizer |
|
|
|
# Upper bound on the number of input characters that get synthesized.
MAX_TXT_LEN = 325

# Initial speaker list. NOTE(review): this name is rebound further down from
# the loaded BSC model's speaker manager, so these values look like a
# placeholder only -- confirm before relying on them.
SPEAKERS = [
    'f_cen_05',
    'f_cen_81',
    'f_occ_31',
    'f_occ_de',
    'f_sep_31',
    'm_cen_08',
    'm_occ_44',
    'm_val_89',
]

# Shared phonemizer instance, constructed with the "ca" voice.
fonemitzador = Phonemizer("ca")
|
|
|
def carrega_bsc():
    """Build the ``Synthesizer`` for the model stored under ``models/bsc``.

    All paths are resolved against the current working directory. The
    checkpoint, its config and the speakers file are passed positionally;
    the fourth positional argument and both vocoder arguments are ``None``.

    Returns:
        The constructed ``Synthesizer`` instance.
    """
    cwd = os.getcwd()
    checkpoint = cwd + "/models/bsc/best_model.pth"
    config = cwd + "/models/bsc/config.json"
    speakers_file = cwd + "/models/bsc/speakers.pth"

    # No separate vocoder checkpoint/config is supplied for this model.
    return Synthesizer(checkpoint, config, speakers_file, None, None, None)
|
|
|
def carrega_collectivat():
    """Build the ``Synthesizer`` for the model under ``models/collectivat``.

    All paths are resolved against the current working directory. The
    acoustic model checkpoint/config and the HiFi-GAN vocoder
    checkpoint/config are passed positionally; the third and fourth
    positional arguments are ``None``.

    Returns:
        The constructed ``Synthesizer`` instance.
    """
    cwd = os.getcwd()
    checkpoint = cwd + "/models/collectivat/fast-speech_best_model.pth"
    config = cwd + "/models/collectivat/fast-speech_config.json"
    vocoder_checkpoint = cwd + "/models/collectivat/ljspeech--hifigan_v2_model_file.pth"
    vocoder_config = cwd + "/models/collectivat/ljspeech--hifigan_v2_config.json"

    return Synthesizer(
        checkpoint, config, None, None, vocoder_checkpoint, vocoder_config
    )
|
|
|
def carrega_piper():
    """Create the ``Piper`` engine from the ONNX file under ``models/piper``.

    The model path is resolved against the current working directory.

    Returns:
        The constructed ``Piper`` instance.
    """
    onnx_path = os.getcwd() + "/models/piper/ca-upc_ona-x-low.onnx"
    return Piper(onnx_path)
|
|
|
|
|
# Instantiate every engine once at import time so that each request handled
# by ``tts`` reuses the already-loaded models.
model_bsc = carrega_bsc()
# Replace the speaker list with the names reported by the loaded BSC model.
SPEAKERS = model_bsc.tts_model.speaker_manager.speaker_names

model_collectivat = carrega_collectivat()

model_piper = carrega_piper()
|
|
|
def tts(text, speaker_idx):
    """Synthesize ``text`` with the three loaded engines and phonemize it.

    Input longer than ``MAX_TXT_LEN`` characters is truncated (a notice is
    printed), and the text that is actually used is printed as well.

    Args:
        text: Text to synthesize.
        speaker_idx: Speaker passed to the BSC model only.

    Returns:
        A 4-tuple ``(fonemes, bsc_path, collectivat_path, piper_path)``:
        the phonemizer output followed by the paths of the three ``.wav``
        temporary files that were written.
    """
    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
    print(text)

    # Run all three engines first; the files are written afterwards.
    audio_bsc = model_bsc.tts(text, speaker_idx)
    audio_coll = model_collectivat.tts(text)
    audio_piper = model_piper.synthesize(text)

    # delete=False keeps each file on disk after the handle is closed, so the
    # returned paths stay valid for the caller.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_bsc:
        model_bsc.save_wav(audio_bsc, out_bsc)

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_coll:
        model_collectivat.save_wav(audio_coll, out_coll)

    # The Piper result is written to the file as-is.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_piper:
        out_piper.write(audio_piper)

    fonemes = fonemitzador.phonemize(text, keep_clause_breakers=True)

    return fonemes, out_bsc.name, out_coll.name, out_piper.name
|
|
|
|
|
# Text shown in the web interface (passed as ``description`` / ``article``
# to the Gradio interface below). User-facing text: kept in Catalan.
description = """
Amb aquesta aplicació podeu sintetitzar text a veu amb els últims models neuronals lliures pel català.

1. Model multi-parlant VITS entrenat pel BSC (Projecte Aina)
https://huggingface.co/projecte-aina/tts-ca-coqui-vits-multispeaker

2. Model Fastspeech entrenat per Col·lectivat
https://github.com/CollectivaT-dev/TTS-API

3. Model VITS entrenat per Piper/Home Assistant
https://github.com/rhasspy/piper

Els dós últims models han estat entrenats amb la veu d'Ona de FestCAT. El primer model ha estat entrenat amb totes les veus de FestCAT, els talls de Common Voice 8 i un altre corpus pel que conté moltes veus de qualitat variable. La veu d'Ona està seleccionada per defecte per la comparativa però podeu provar les altres.

Aquesta aplicació fa servir l'últim estat de l'espeak millorat per Carme Armentano del BSC
https://github.com/projecte-aina/espeak-ng

NOTA: El model de col·lectivat treballa amb grafemes pel que no fa servir espeak com a fonemitzador.
"""
article = ""
|
|
|
# Input widgets: the text to synthesize and the speaker for the BSC model.
_inputs = [
    gr.Textbox(
        label="Text",
        value="L'Èlia i l'Alí a l'aula. L'oli i l'ou. Lulú olorava la lila.",
    ),
    gr.Dropdown(
        label="Selecciona un parlant pel model VITS multi-parlant del BSC",
        choices=SPEAKERS,
        value="ona",
    ),
]

# Output widgets, in the same order as the tuple returned by ``tts``.
_outputs = [
    gr.Markdown(label="Fonemes"),
    gr.Audio(label="BSC VITS", type="filepath"),
    gr.Audio(label="Collectivat Fastspeech", type="filepath"),
    gr.Audio(label="Piper VITS", type="filepath"),
]

iface = gr.Interface(
    fn=tts,
    inputs=_inputs,
    outputs=_outputs,
    title="Comparativa de síntesi lliure en català️",
    description=description,
    article=article,
    allow_flagging="never",
    layout="vertical",
    live=False,
)

# Listen on all interfaces, port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860)
|
|