Spaces:

projecte-aina
/

transcripcio-fonetica-catala

Running

File size: 3,404 Bytes

7ae4987
7f0cc16
 
 
 
 
 
 
 
 
 
 
 
 
 
7514dcc
 
 
 
 
 
 
 
41b7aed
7514dcc
 
 
 
b8920a0
 
 
 
 
 
 
 
 
 
 
b79ce49
51e62ea
b79ce49
b8920a0
7514dcc
aa00ca9
7514dcc
b8920a0
7514dcc
b79ce49
 
7f0cc16
 
 
 
 
 
2c3e79f
 
 
 
7f0cc16
f47653c
7f0cc16
 
f47653c
7f0cc16
f47653c
 
 
 
 
 
 
 
 
 
 
 
 
 
7f0cc16
 
 
 
 
 
 
 
 
 
 
 
 
 
2c3e79f
7f0cc16
 
 
f47653c
2c3e79f
 
f47653c
 
 
7f0cc16
 
 
 
 
 
 
 
4e5413c

from engine import Piper
import tempfile
from typing import Optional
from TTS.config import load_config
import gradio as gr
import numpy as np
import os
import json
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer

# Upper bound, in characters, on the text accepted for synthesis; longer
# input is truncated to this length by ``tts``.
MAX_TXT_LEN = 100

# Initial speaker identifiers. NOTE: this name is rebound further down the
# module from the loaded BSC model, so this literal only serves as a
# placeholder until then.
SPEAKERS = [
    'f_cen_05',
    'f_cen_81',
    'f_occ_31',
    'f_occ_de',
    'f_sep_31',
    'm_cen_08',
    'm_occ_44',
    'm_val_89',
]

def carrega_bsc():
    """Build and return the TTS ``Synthesizer`` for the BSC model.

    The checkpoint, config and speakers file are looked up under
    ``models/bsc`` relative to the current working directory. No vocoder
    checkpoint or vocoder config is supplied.
    """
    base_dir = os.getcwd() + "/models/bsc"
    checkpoint = base_dir + "/best_model.pth"
    config = base_dir + "/config.json"
    speakers_file = base_dir + "/speakers.pth"

    # Arguments stay positional to match the installed Synthesizer signature.
    # The fourth slot is left as None (presumably the languages file; confirm
    # against the installed TTS version), as are both vocoder slots.
    return Synthesizer(checkpoint, config, speakers_file, None, None, None)

def carrega_collectivat():
    """Build and return the TTS ``Synthesizer`` for the Collectivat model.

    The acoustic model checkpoint/config and the HiFi-GAN vocoder
    checkpoint/config are looked up under ``models/collectivat`` relative to
    the current working directory. No speakers file is supplied.
    """
    base_dir = os.getcwd() + "/models/collectivat"
    checkpoint = base_dir + "/fast-speech_best_model.pth"
    config = base_dir + "/fast-speech_config.json"
    vocoder_checkpoint = base_dir + "/ljspeech--hifigan_v2_model_file.pth"
    vocoder_config = base_dir + "/ljspeech--hifigan_v2_config.json"

    # Third and fourth positional slots are intentionally None (no speakers
    # file; fourth is presumably the languages file; confirm against the
    # installed TTS version).
    return Synthesizer(
        checkpoint, config, None, None, vocoder_checkpoint, vocoder_config
    )

def carrega_piper():
    """Build and return the ``Piper`` engine for the bundled ONNX voice.

    The model file is looked up under ``models/piper`` relative to the
    current working directory.
    """
    onnx_path = f"{os.getcwd()}/models/piper/ca-upc_ona-x-low.onnx"
    return Piper(onnx_path)


# All three engines are instantiated once, at import time, and then shared by
# every call to ``tts`` through these module-level names.
model_bsc = carrega_bsc()
# Rebind SPEAKERS to the speaker names reported by the loaded BSC model,
# replacing the hard-coded list defined near the top of the module.
SPEAKERS = model_bsc.tts_model.speaker_manager.speaker_names

model_collectivat = carrega_collectivat()

model_piper = carrega_piper()

def tts(text, speaker_idx):
    """Synthesize ``text`` with all three engines and return the audio paths.

    Args:
        text: Text to synthesize. Anything beyond ``MAX_TXT_LEN`` characters
            is dropped, and a notice is printed when that happens.
        speaker_idx: Speaker selector forwarded to the BSC model only; the
            other two engines are called without a speaker.

    Returns:
        A 3-tuple of file paths ``(bsc, collectivat, piper)``, each pointing
        to a temporary ``.wav`` file that is NOT deleted automatically.
    """
    exceeds_limit = len(text) > MAX_TXT_LEN
    if exceeds_limit:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
    print(text)

    # Run every engine first; files are only created once all of them have
    # produced audio.
    audio_bsc = model_bsc.tts(text, speaker_idx)
    audio_collectivat = model_collectivat.tts(text)
    audio_piper = model_piper.synthesize(text)

    # delete=False keeps each file on disk after the handle is closed so the
    # returned path is still readable by the caller.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as bsc_file:
        model_bsc.save_wav(audio_bsc, bsc_file)
        path_bsc = bsc_file.name

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as collectivat_file:
        model_collectivat.save_wav(audio_collectivat, collectivat_file)
        path_collectivat = collectivat_file.name

    # The Piper result is written to the file as-is (presumably already
    # WAV-encoded bytes; confirm against engine.Piper.synthesize).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as piper_file:
        piper_file.write(audio_piper)
        path_piper = piper_file.name

    return path_bsc, path_collectivat, path_piper


# Usage instructions displayed in the interface (kept in Catalan; this is
# user-facing text).
description = """
1️⃣ Introdueix el text a sintetitzar.

2️⃣ Selecciona una veu en el desplegable.

3️⃣ Gaudeix!
"""
article = ""

# The dropdown used to preselect "ona" unconditionally. That value is only a
# valid choice if the loaded BSC model actually exposes a speaker with that
# name; otherwise submitting without touching the dropdown would forward an
# unknown speaker to ``tts``. Keep "ona" when it is offered, else fall back to
# the first available speaker (or None when there are no speakers at all).
_default_speaker = "ona" if "ona" in SPEAKERS else next(iter(SPEAKERS), None)

# One text input plus a speaker selector; one audio player per engine, each
# fed with the file path returned by ``tts``.
iface = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(
            label="Text",
            default="L'Èlia i l'Alí a l'aula.  L'oli i l'ou.  Lulú olorava la lila.",
        ),
        gr.Dropdown(
            label="Selecciona un parlant",
            choices=SPEAKERS,
            default=_default_speaker,
        ),
    ],
    outputs=[
        gr.Audio(label="BSC VITS", type="filepath"),
        gr.Audio(label="Collectivat Fastspeech", type="filepath"),
        gr.Audio(label="Piper VITS", type="filepath"),
    ],
    title="🗣️ TTS Català Multi Parlant - VITS 🗣️",
    description=description,
    article=article,
    allow_flagging="never",
    layout="vertical",
    live=False,
)
# Listen on all interfaces at port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860)