File size: 2,759 Bytes
7ae4987
7f0cc16
 
 
 
 
 
 
 
 
 
 
 
 
 
7514dcc
 
 
 
 
 
 
 
41b7aed
7514dcc
 
 
 
b8920a0
 
 
 
 
 
 
 
 
 
 
 
7514dcc
 
 
b8920a0
7514dcc
7f0cc16
 
 
 
 
 
 
 
 
 
 
 
7514dcc
7f0cc16
 
7514dcc
7f0cc16
 
7514dcc
7f0cc16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from engine import Piper
import tempfile
from typing import Optional
from TTS.config import load_config
import gradio as gr
import numpy as np
import os
import json
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer

# Upper bound on the number of characters synthesized per request;
# tts() truncates any input longer than this.
MAX_TXT_LEN = 100

# Initial speaker identifiers. This name is rebound to model_bsc.speakers
# further down, once the BSC model has been loaded.
SPEAKERS = [
    'f_cen_05',
    'f_cen_81',
    'f_occ_31',
    'f_occ_de',
    'f_sep_31',
    'm_cen_08',
    'm_occ_44',
    'm_val_89',
]

def carrega_bsc():
    """Build the Synthesizer for the model stored under ``<cwd>/models/bsc``.

    The checkpoint, its config and the speakers file are all resolved
    against the current working directory. No vocoder files are supplied.

    Returns:
        The ``Synthesizer`` instance created from those files.
    """
    cwd = os.getcwd()
    checkpoint = cwd + "/models/bsc/best_model.pth"
    config = cwd + "/models/bsc/config.json"
    speakers_file = cwd + "/models/bsc/speakers.pth"

    # Positional arguments: checkpoint, config, speakers file, None, and
    # finally the vocoder checkpoint/config pair, both left as None here.
    return Synthesizer(checkpoint, config, speakers_file, None, None, None)

def carrega_collectivat():
    """Build the Synthesizer for the model under ``<cwd>/models/collectivat``.

    Uses the ``fast-speech_*`` checkpoint/config together with the
    ``ljspeech--hifigan_v2_*`` vocoder checkpoint/config, all resolved
    against the current working directory. No speakers file is supplied.

    Returns:
        The ``Synthesizer`` instance created from those files.
    """
    cwd = os.getcwd()
    tts_checkpoint = cwd + "/models/collectivat/fast-speech_best_model.pth"
    tts_config = cwd + "/models/collectivat/fast-speech_config.json"
    vocoder_checkpoint = cwd + "/models/collectivat/ljspeech--hifigan_v2_model_file.pth"
    vocoder_config = cwd + "/models/collectivat/ljspeech--hifigan_v2_config.json"

    # Third and fourth positional arguments are deliberately None.
    return Synthesizer(
        tts_checkpoint, tts_config, None, None, vocoder_checkpoint, vocoder_config
    )


# Load the BSC model once, at import time; tts() synthesizes with it.
model_bsc = carrega_bsc()
# Rebind SPEAKERS to the speaker list exposed by the loaded model, replacing
# the hard-coded list defined near the top of the file.
SPEAKERS = model_bsc.speakers

# NOTE(review): model_collectivat is loaded here but is not referenced anywhere
# else in the visible file — confirm it is still needed before paying its
# load cost at startup.
model_collectivat = carrega_collectivat()

def tts(text, speaker_idx):
    """Synthesize ``text`` with the BSC model and return the WAV file path.

    Args:
        text: Text to synthesize. Anything beyond ``MAX_TXT_LEN`` characters
            is dropped (a notice is printed when that happens).
        speaker_idx: Speaker identifier, passed unchanged to
            ``model_bsc.tts``.

    Returns:
        str: Path of a temporary ``.wav`` file containing the audio. The file
        is created with ``delete=False``, so it is not removed automatically.
    """
    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
    print(text)

    # The speaker alias map (speaker_map.json) is intentionally not loaded:
    # speaker_idx is used as-is, so parsing that file on every request only
    # added disk I/O and a FileNotFoundError when the file was absent.

    # Synthesize the waveform for the requested speaker.
    wavs = model_bsc.tts(text, speaker_idx)

    # Persist the audio to a temp file that outlives this call so the caller
    # can read it back by path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        model_bsc.save_wav(wavs, fp)
        return fp.name


# Usage instructions passed to the interface as its description
# (user-facing text, in Catalan).
description = """
1️⃣ Introdueix el text a sintetitzar.

2️⃣ Selecciona una veu en el desplegable.

3️⃣ Gaudeix!
"""
article = ""

# Input widgets: a free-text box and a speaker selector fed from SPEAKERS.
text_input = gr.inputs.Textbox(
    label="Text",
    default="L'Èlia i l'Alí a l'aula.  L'oli i l'ou.  Lulú olorava la lila.",
)
speaker_input = gr.inputs.Dropdown(
    label="Selecciona un parlant",
    choices=SPEAKERS,
    default=None,
)

# tts() returns the path of the generated WAV file, hence type="filepath".
audio_output = gr.outputs.Audio(label="Output", type="filepath")

iface = gr.Interface(
    fn=tts,
    inputs=[text_input, speaker_input],
    outputs=audio_output,
    title="🗣️ TTS Català Multi Parlant - VITS 🗣️",
    description=description,
    article=article,
    allow_flagging="never",
    layout="vertical",
    live=False,
)
iface.launch(share=False)