Spaces:
Runtime error
Runtime error
Upload 24 files
Browse files- .gitattributes +0 -1
- .gitignore +2 -0
- Dockerfile +52 -0
- README.md +5 -7
- app.py +147 -0
- engine.py +144 -0
- festival.py +65 -0
- mms.py +84 -0
- models/bsc/best_model.pth +3 -0
- models/bsc/config.json +262 -0
- models/bsc/speaker_map.json +10 -0
- models/bsc/speakers.pth +3 -0
- models/collectivat/catotron-ona-TTS-API-entry.json +10 -0
- models/collectivat/fast-speech_best_model.pth +3 -0
- models/collectivat/fast-speech_config.json +213 -0
- models/collectivat/ljspeech--hifigan_v2_config.json +158 -0
- models/collectivat/ljspeech--hifigan_v2_model_file.pth +3 -0
- models/mms/G_100000.pth +3 -0
- models/mms/config.json +87 -0
- models/mms/vocab.txt +39 -0
- models/piper/MODEL_CARD +15 -0
- models/piper/ca-upc_ona-x-low.onnx +3 -0
- models/piper/ca-upc_ona-x-low.onnx.json +409 -0
- requirements.txt +4 -0
.gitattributes
CHANGED
@@ -25,7 +25,6 @@
|
|
25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
*.wasm filter=lfs diff=lfs merge=lfs -text
|
|
|
25 |
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
|
|
28 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
*.wasm filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
.venv
|
2 |
+
__pycache__
|
Dockerfile
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Image for the Catalan TTS comparison app: Festival (FestCAT voices),
# a patched espeak-ng, the VITS reference code and the Python application.
FROM python:3.9

# Festival and the Catalan voices come from the zeehio/festcat PPA; the
# remaining packages are the toolchain used to build espeak-ng below.
RUN apt-get update && apt-get install -y gnupg && \
    apt-key adv --recv-keys --keyserver hkp://keyserver.ubuntu.com:80 A3A48C4A && \
    echo "deb http://ppa.launchpad.net/zeehio/festcat/ubuntu bionic main" >> /etc/apt/sources.list && \
    echo "deb-src http://ppa.launchpad.net/zeehio/festcat/ubuntu bionic main" >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get -y install festival festvox-ca-ona-hts festvox-ca-pau-hts lame git make autoconf automake libtool pkg-config gcc libsonic-dev ronn kramdown libpcaudio-dev libatlas-base-dev gfortran

# Build and install the projecte-aina fork of espeak-ng (branch ca-to-pr).
RUN git clone -b ca-to-pr https://github.com/projecte-aina/espeak-ng

RUN cd espeak-ng && \
    ./autogen.sh && \
    ./configure --prefix=/usr && \
    make && \
    make install

# Everything below runs as an unprivileged user with uid 1000.
RUN useradd -m -u 1000 user

USER user

ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory to the app directory inside the user's home
WORKDIR $HOME/app

COPY --chown=user requirements.txt .
COPY --chown=user models models

RUN pip install -r requirements.txt

# NOTE(review): "sed ... requirements.txt > requirements.txt" truncates the
# file before sed reads it, so the following "pip install -r" installs
# nothing from the vits requirements. Left unchanged on purpose: switching to
# "sed -i" would start installing the pinned vits dependencies, which has not
# been validated against this base image -- confirm before changing.
RUN git clone https://github.com/jaywalnut310/vits.git && \
    cd vits && sed s/torch==1.6.0/torch==1.7.0/ requirements.txt > requirements.txt && pip install -r requirements.txt && cd monotonic_align && \
    python setup.py build_ext --inplace

# Make the cloned vits modules importable by mms.py.
ENV PYTHONPATH=$PYTHONPATH:/home/user/app/vits

COPY --chown=user engine.py .
COPY --chown=user mms.py .
COPY --chown=user festival.py .
COPY --chown=user app.py .

# Create the cache directory at the exact path the environment variables
# below point to (a relative "cache" would land in /home/user/app/cache).
RUN mkdir -p /home/user/cache && chmod 777 /home/user/cache

ENV NUMBA_CACHE_DIR=/home/user/cache
ENV MPLCONFIGDIR=/home/user/cache

EXPOSE 7860

CMD ["python", "app.py"]
|
README.md
CHANGED
@@ -1,11 +1,9 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
-
sdk:
|
7 |
-
sdk_version: 3.35.2
|
8 |
-
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
11 |
|
|
|
1 |
---
|
2 |
+
title: Síntesi en català
|
3 |
+
emoji: 👁
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: red
|
6 |
+
sdk: docker
|
|
|
|
|
7 |
pinned: false
|
8 |
---
|
9 |
|
app.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import tempfile
|
2 |
+
import gradio as gr
|
3 |
+
import os
|
4 |
+
from TTS.utils.synthesizer import Synthesizer
|
5 |
+
from espeak_phonemizer import Phonemizer
|
6 |
+
from engine import Piper
|
7 |
+
from festival import festival_synthesize
|
8 |
+
from mms import MMS
|
9 |
+
|
10 |
+
# Inputs longer than this many characters are truncated by tts() before synthesis.
MAX_TXT_LEN = 325

# Shared Phonemizer instance created with the "ca" argument; tts() uses it to
# produce the phoneme string returned alongside the audio files.
fonemitzador = Phonemizer("ca")
|
13 |
+
|
14 |
+
def carrega_bsc():
    """Create the ``Synthesizer`` for the model stored in ``<cwd>/models/bsc``.

    The checkpoint, its config and the speakers file are passed in; the
    remaining constructor arguments (including both vocoder paths) are None.

    Returns:
        The constructed ``Synthesizer`` instance.
    """
    bsc_dir = os.getcwd() + "/models/bsc"
    checkpoint = bsc_dir + "/best_model.pth"
    config = bsc_dir + "/config.json"
    speakers = bsc_dir + "/speakers.pth"

    return Synthesizer(checkpoint, config, speakers, None, None, None)
|
26 |
+
|
27 |
+
def carrega_collectivat():
    """Create the ``Synthesizer`` for the files in ``<cwd>/models/collectivat``.

    Both the fast-speech checkpoint/config pair and the hifigan vocoder
    checkpoint/config pair are passed; no speakers file is used.

    Returns:
        The constructed ``Synthesizer`` instance.
    """
    coll_dir = os.getcwd() + "/models/collectivat"
    tts_checkpoint = coll_dir + "/fast-speech_best_model.pth"
    tts_config = coll_dir + "/fast-speech_config.json"
    vocoder_checkpoint = coll_dir + "/ljspeech--hifigan_v2_model_file.pth"
    vocoder_config = coll_dir + "/ljspeech--hifigan_v2_config.json"

    return Synthesizer(
        tts_checkpoint, tts_config, None, None, vocoder_checkpoint, vocoder_config
    )
|
37 |
+
|
38 |
+
def carrega_piper():
    """Return a ``Piper`` engine for the ONNX voice under ``<cwd>/models/piper``."""
    onnx_path = os.getcwd() + "/models/piper/ca-upc_ona-x-low.onnx"
    return Piper(onnx_path)
|
40 |
+
|
41 |
+
def carrega_mms():
    """Return an ``MMS`` engine loaded from the ``<cwd>/models/mms`` directory."""
    mms_dir = os.getcwd() + "/models/mms"
    return MMS(mms_dir)
|
43 |
+
|
44 |
+
|
45 |
+
# All engines are instantiated once, at import time, and reused by tts().
model_bsc = carrega_bsc()
# Speaker names reported by the BSC model; used as the choices of the
# speaker dropdown in the interface.
SPEAKERS = model_bsc.tts_model.speaker_manager.speaker_names

model_collectivat = carrega_collectivat()

model_piper = carrega_piper()

model_mms = carrega_mms()

# Running count of tts() calls; incremented and printed on every request.
request_count = 0
|
55 |
+
|
56 |
+
def tts(text, festival_voice, speaker_idx):
    """Synthesize *text* with every engine and return the results for the UI.

    The text is cut to ``MAX_TXT_LEN`` characters first. Each engine's audio
    is written to its own temporary ``.wav`` file created with
    ``delete=False``, so the files remain on disk after this call.

    Args:
        text: Text to synthesize.
        festival_voice: Voice name passed to ``festival_synthesize``.
        speaker_idx: Speaker passed to the BSC model's ``tts`` call.

    Returns:
        A tuple ``(phonemes, festival_path, bsc_path, collectivat_path,
        piper_path, mms_path)``.
    """
    global request_count

    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
    print(text)

    # Run the three in-process engines before touching the filesystem.
    audio_bsc = model_bsc.tts(text, speaker_idx)
    audio_coll = model_collectivat.tts(text)
    audio_piper = model_piper.synthesize(text)

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_bsc:
        model_bsc.save_wav(audio_bsc, out_bsc)

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_coll:
        model_collectivat.save_wav(audio_coll, out_coll)

    # Piper already returns a complete WAV byte string.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_piper:
        out_piper.write(audio_piper)

    # MMS writes to the given path itself.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_mms:
        model_mms.synthesize(out_mms.name, text)

    phonemes = fonemitzador.phonemize(text, keep_clause_breakers=True)

    festival_path = festival_synthesize(text, festival_voice)

    request_count += 1
    print(f"Requests: {request_count}")
    return (
        phonemes,
        festival_path,
        out_bsc.name,
        out_coll.name,
        out_piper.name,
        out_mms.name,
    )
|
95 |
+
|
96 |
+
|
97 |
+
# Markdown shown under the interface title. The four models are numbered 1-4
# so the references to "models 2 i 3" and "model 4" further down match the list.
description="""
Amb aquesta aplicació podeu sintetitzar text a veu amb els últims models neuronals lliures pel català i amb el motor Festival.

1. Model multi-parlant VITS entrenat pel BSC (Projecte Aina) [enllaç](https://huggingface.co/projecte-aina/tts-ca-coqui-vits-multispeaker)
2. Model Fastspeech entrenat per Col·lectivat [enllaç](https://github.com/CollectivaT-dev/TTS-API)
3. Model VITS entrenat per Piper/Home Assistant [enllaç](https://github.com/rhasspy/piper)
4. Model VITS entrenat per Meta (llicència CC-BY-NC) [enllaç](https://github.com/facebookresearch/fairseq/tree/main/examples/mms)

El primer model ha estat entrenat amb totes les veus de FestCAT, els talls de Common Voice 8 i un altre corpus pel que conté moltes veus de qualitat variable. La veu d'Ona està seleccionada per defecte per la comparativa però podeu provar les altres.
Els models 2 i 3 han estat entrenats amb la veu d'Ona de FestCAT.
El model 4, anomenat MMS, de Meta (Facebook) ha estat entrenat a partir de dades d'un [audiollibre](http://live.bible.is/bible/CATBSS/LUK/1) de la Bíblia

Aquesta aplicació fa servir l'últim estat de l'espeak millorat per Carme Armentano del BSC
https://github.com/projecte-aina/espeak-ng

NOTA: El model de col·lectivat treballa amb grafemes pel que no fa servir espeak com a fonemitzador. Festival conté les seves pròpies normes fonètiques.
"""
# No extra article text is shown.
article= ""
|
115 |
+
|
116 |
+
# Gradio interface wired to tts(). The three inputs map, in order, to the
# parameters (text, festival_voice, speaker_idx); the six outputs map, in
# order, to the tuple tts() returns (phonemes first, then one audio file
# path per engine).
iface = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(
            label="Text",
            value="L'Èlia i l'Alí a l'aula. L'oli i l'ou. Lulú olorava la lila.",
        ),
        gr.Dropdown(label="Parlant del motor Festival", choices=["ona", "pau"], value="ona"),
        gr.Dropdown(label="Parlant del model VITS multi-parlant del BSC", choices=SPEAKERS, value="ona")
    ],
    outputs=[
        gr.Markdown(label="Fonemes"),
        gr.Audio(label="Festival",type="filepath"),
        gr.Audio(label="BSC VITS",type="filepath"),
        gr.Audio(label="Collectivat Fastspeech",type="filepath"),
        gr.Audio(label="Piper VITS",type="filepath"),
        gr.Audio(label="Meta MMS VITS",type="filepath")
    ],
    title="Comparativa de síntesi lliure en català️",
    description=description,
    article=article,
    allow_flagging="never",
    layout="vertical",
    live=False,
    # Each example row supplies (text, Festival voice, BSC speaker).
    examples=[
        ["Duc pà sec al sac, m'assec on sóc i el suco amb suc", "ona", "ona"],
        ["Un plat pla blanc, ple de pebre negre n’era. Un plat blanc pla, ple de pebre negre està", "ona", "ona"],
        ["Visc al bosc i busco vesc i visc del vesc que busco al bosc", "ona", "ona"],
        ["Una polla xica, pica, pellarica, camatorta i becarica va tenir sis polls xics, pics, pellarics, camacurts i becarics. Si la polla no hagués sigut xica, pica, pellarica, camatorta i becarica, els sis polls no haurien sigut xics, pics, pellarics, camacurts i becarics.", "ona", "ona"]
    ]
)
# Listen on all interfaces on port 7860, the port the Dockerfile exposes.
iface.launch(server_name="0.0.0.0", server_port=7860)
|
engine.py
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
import wave
|
5 |
+
from dataclasses import dataclass
|
6 |
+
from pathlib import Path
|
7 |
+
from typing import List, Mapping, Optional, Sequence, Union
|
8 |
+
|
9 |
+
import numpy as np
|
10 |
+
import onnxruntime
|
11 |
+
from espeak_phonemizer import Phonemizer
|
12 |
+
|
13 |
+
# Special symbols looked up in the voice's phoneme_id_map when
# Piper.synthesize builds the phoneme id sequence.
_BOS = "^"  # placed before the first phoneme
_EOS = "$"  # its ids are appended after the last phoneme
_PAD = "_"  # its ids are inserted after every phoneme
|
16 |
+
|
17 |
+
|
18 |
+
@dataclass
class PiperConfig:
    """Voice settings that load_config() reads from a Piper JSON config file."""

    # Top-level "num_symbols" entry of the config
    num_symbols: int
    # Top-level "num_speakers" entry; values above 1 enable speaker selection
    num_speakers: int
    # "audio" -> "sample_rate"; used as the frame rate of the generated WAV
    sample_rate: int
    # "espeak" -> "voice"; passed to the Phonemizer constructor
    espeak_voice: str
    # "inference" -> "length_scale" (load_config falls back to 1.0)
    length_scale: float
    # "inference" -> "noise_scale" (load_config falls back to 0.667)
    noise_scale: float
    # "inference" -> "noise_w" (load_config falls back to 0.8)
    noise_w: float
    # Maps each phoneme symbol to the sequence of ids fed to the model
    phoneme_id_map: Mapping[str, Sequence[int]]
|
28 |
+
|
29 |
+
|
30 |
+
class Piper:
    """Text-to-speech engine that runs a Piper voice through ONNX Runtime."""

    def __init__(
        self,
        model_path: Union[str, Path],
        config_path: Optional[Union[str, Path]] = None,
        use_cuda: bool = False,
    ):
        """Load the voice config, the phonemizer and the ONNX session.

        Args:
            model_path: Path to the ``.onnx`` model file.
            config_path: Path to the voice's JSON config. Defaults to
                ``<model_path>.json``.
            use_cuda: Use ``CUDAExecutionProvider`` instead of
                ``CPUExecutionProvider``.
        """
        if config_path is None:
            config_path = f"{model_path}.json"

        self.config = load_config(config_path)
        self.phonemizer = Phonemizer(self.config.espeak_voice)
        self.onnx_options = onnxruntime.SessionOptions()
        # Leave one core free, but never request fewer than one thread.
        # os.cpu_count() may return None when the count cannot be determined.
        self.onnx_options.intra_op_num_threads = max(1, (os.cpu_count() or 1) - 1)
        provider = "CUDAExecutionProvider" if use_cuda else "CPUExecutionProvider"
        self.model = onnxruntime.InferenceSession(
            str(model_path),
            sess_options=self.onnx_options,
            providers=[provider],
        )

    def synthesize(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        length_scale: Optional[float] = None,
        noise_scale: Optional[float] = None,
        noise_w: Optional[float] = None,
    ) -> bytes:
        """Synthesize WAV audio from text.

        Args:
            text: Text to speak.
            speaker_id: Speaker to use. For a multi-speaker voice, ``None``
                selects speaker 0.
            length_scale: Overrides the config's ``length_scale`` when given.
            noise_scale: Overrides the config's ``noise_scale`` when given.
            noise_w: Overrides the config's ``noise_w`` when given.

        Returns:
            The bytes of a mono, 16-bit WAV file at the configured sample rate.

        Raises:
            KeyError: If the phonemizer emits a symbol that is missing from
                the voice's ``phoneme_id_map``.
        """
        if length_scale is None:
            length_scale = self.config.length_scale

        if noise_scale is None:
            noise_scale = self.config.noise_scale

        if noise_w is None:
            noise_w = self.config.noise_w

        phonemes_str = self.phonemizer.phonemize(text, keep_clause_breakers=True)
        phonemes = [_BOS] + list(phonemes_str)
        phoneme_ids: List[int] = []

        # Every phoneme (including BOS) is followed by the pad ids.
        for phoneme in phonemes:
            phoneme_ids.extend(self.config.phoneme_id_map[phoneme])
            phoneme_ids.extend(self.config.phoneme_id_map[_PAD])

        phoneme_ids.extend(self.config.phoneme_id_map[_EOS])

        phoneme_ids_array = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        phoneme_ids_lengths = np.array([phoneme_ids_array.shape[1]], dtype=np.int64)
        scales = np.array(
            [noise_scale, length_scale, noise_w],
            dtype=np.float32,
        )

        # Fall back to the default speaker only when the caller did not pick
        # one; an explicitly requested speaker must be kept.
        if (self.config.num_speakers > 1) and (speaker_id is None):
            speaker_id = 0

        sid = None

        if speaker_id is not None:
            sid = np.array([speaker_id], dtype=np.int64)

        # Synthesize through Onnx
        audio = self.model.run(
            None,
            {
                "input": phoneme_ids_array,
                "input_lengths": phoneme_ids_lengths,
                "scales": scales,
                "sid": sid,
            },
        )[0].squeeze((0, 1))
        audio = audio_float_to_int16(audio.squeeze())

        # Convert to WAV
        with io.BytesIO() as wav_io:
            wav_file: wave.Wave_write = wave.open(wav_io, "wb")
            with wav_file:
                wav_file.setframerate(self.config.sample_rate)
                wav_file.setsampwidth(2)
                wav_file.setnchannels(1)
                wav_file.writeframes(audio.tobytes())

            return wav_io.getvalue()
|
118 |
+
|
119 |
+
|
120 |
+
def load_config(config_path: Union[str, Path]) -> PiperConfig:
    """Read a voice's JSON config file and build a ``PiperConfig`` from it.

    The three inference scales are optional in the file; missing ones fall
    back to 0.667 (noise_scale), 1.0 (length_scale) and 0.8 (noise_w).

    Args:
        config_path: Path to the UTF-8 encoded JSON config.

    Returns:
        The populated ``PiperConfig``.
    """
    with open(config_path, "r", encoding="utf-8") as handle:
        raw = json.load(handle)

    infer_opts = raw.get("inference", {})
    settings = {
        "num_symbols": raw["num_symbols"],
        "num_speakers": raw["num_speakers"],
        "sample_rate": raw["audio"]["sample_rate"],
        "espeak_voice": raw["espeak"]["voice"],
        "noise_scale": infer_opts.get("noise_scale", 0.667),
        "length_scale": infer_opts.get("length_scale", 1.0),
        "noise_w": infer_opts.get("noise_w", 0.8),
        "phoneme_id_map": raw["phoneme_id_map"],
    }
    return PiperConfig(**settings)
|
135 |
+
|
136 |
+
|
137 |
+
def audio_float_to_int16(
    audio: np.ndarray, max_wav_value: float = 32767.0
) -> np.ndarray:
    """Peak-normalize *audio* to ``max_wav_value`` and cast it to int16.

    The peak used for scaling is floored at 0.01, which bounds the gain
    applied to near-silent input.

    Args:
        audio: Float samples.
        max_wav_value: Magnitude the loudest sample is scaled to; also the
            clipping bound.

    Returns:
        The scaled samples as an int16 array.
    """
    peak = np.max(np.abs(audio))
    gain = max_wav_value / max(0.01, peak)
    scaled = np.clip(audio * gain, -max_wav_value, max_wav_value)
    return scaled.astype("int16")
|
festival.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# -*- encoding: utf-8 -*-
|
3 |
+
#
|
4 |
+
# Copyright (c) 2016 Jordi Mas i Hernandez <jmas@softcatala.org>
|
5 |
+
#
|
6 |
+
# This program is free software; you can redistribute it and/or
|
7 |
+
# modify it under the terms of the GNU Lesser General Public
|
8 |
+
# License as published by the Free Software Foundation; either
|
9 |
+
# version 2.1 of the License, or (at your option) any later version.
|
10 |
+
#
|
11 |
+
# This program is distributed in the hope that it will be useful,
|
12 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
14 |
+
# Lesser General Public License for more details.
|
15 |
+
#
|
16 |
+
# You should have received a copy of the GNU Lesser General Public
|
17 |
+
# License along with this program; if not, write to the
|
18 |
+
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
19 |
+
# Boston, MA 02111-1307, USA.
|
20 |
+
|
21 |
+
import subprocess
|
22 |
+
import tempfile
|
23 |
+
|
24 |
+
# Maps the short voice names accepted by festival_synthesize() to the
# Festival voice expression evaluated by text2wave.
festival_voices = {
    "ona": "voice_upc_ca_ona_hts",
    "pau": "voice_upc_ca_pau_hts"
}
|
28 |
+
|
29 |
+
def _normalize(result):
|
30 |
+
mapping = {
|
31 |
+
'’' : '\'',
|
32 |
+
'à' : 'à',
|
33 |
+
'í' : 'í',
|
34 |
+
'ó' : 'ó',
|
35 |
+
'è' : 'è',
|
36 |
+
'ò' : 'ò',
|
37 |
+
'ú' : 'ú',
|
38 |
+
}
|
39 |
+
|
40 |
+
for char in mapping.keys():
|
41 |
+
result = result.replace(char, mapping[char])
|
42 |
+
|
43 |
+
return result
|
44 |
+
|
45 |
+
|
46 |
+
def festival_synthesize(text, voice):
    """Synthesize *text* with Festival's ``text2wave`` and return the WAV path.

    The normalized text is written, encoded as ISO-8859-15 (characters that
    cannot be encoded are dropped), to a temporary input file that is removed
    when the call finishes. The exit status of ``text2wave`` is not checked.

    Args:
        text: Text to speak.
        voice: A key of ``festival_voices`` ("ona" or "pau").

    Returns:
        Path of the temporary ``.wav`` output file. The file is created with
        ``delete=False``, so it is not removed automatically.

    Raises:
        ValueError: If *voice* is not a known Festival voice.
    """
    if voice not in festival_voices:
        raise ValueError("Unknown Festival voice: {0!r}".format(voice))

    txt2wave = '/usr/bin/text2wave'

    with tempfile.NamedTemporaryFile() as encoded_file,\
            tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wave_file:

        text = _normalize(text)
        encoded_file.write(text.encode('ISO-8859-15', 'ignore'))
        # Make sure the bytes are on disk before text2wave opens the file.
        encoded_file.flush()

        # Argument list with no shell: nothing is re-parsed by /bin/sh.
        cmd = [
            txt2wave,
            '-o', wave_file.name,
            encoded_file.name,
            '-eval', '({0})'.format(festival_voices[voice]),
        ]
        # Output is discarded; DEVNULL avoids blocking on an unread pipe.
        subprocess.run(cmd, stdout=subprocess.DEVNULL, check=False)

    return wave_file.name
|
mms.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import os
|
7 |
+
import torch
|
8 |
+
import commons
|
9 |
+
import utils
|
10 |
+
from models import SynthesizerTrn
|
11 |
+
from scipy.io.wavfile import write
|
12 |
+
from pathlib import Path
|
13 |
+
from typing import Union
|
14 |
+
|
15 |
+
class TextMapper(object):
    """Converts text to symbol ids using a one-symbol-per-line vocabulary file."""

    def __init__(self, vocab_file):
        """Load the vocabulary.

        Args:
            vocab_file: Path to a UTF-8 text file with one symbol per line;
                a symbol's id is its zero-based line index. The file must
                contain a line holding a single space.

        Raises:
            ValueError: If the vocabulary has no space symbol.
        """
        # Read as UTF-8 explicitly so accented symbols load the same way
        # regardless of the process locale, and close the file promptly.
        with open(vocab_file, encoding="utf-8") as vocab:
            self.symbols = [line.replace("\n", "") for line in vocab]
        self.SPACE_ID = self.symbols.index(" ")
        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

    def text_to_sequence(self, text, cleaner_names):
        '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
        Args:
        text: string to convert to a sequence (leading/trailing whitespace is stripped)
        cleaner_names: accepted for interface compatibility; not used here
        Returns:
        List of integers corresponding to the symbols in the text
        Raises:
        KeyError: if the text contains a symbol that is not in the vocabulary
        '''
        clean_text = text.strip()
        return [self._symbol_to_id[symbol] for symbol in clean_text]

    def get_text(self, text, hps):
        """Return *text* as a ``torch.LongTensor`` of symbol ids.

        When ``hps.data.add_blank`` is set, id 0 is interspersed between the
        symbol ids via ``commons.intersperse``.
        """
        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
        if hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def filter_oov(self, text):
        """Return *text* with every character missing from the vocabulary removed."""
        val_chars = self._symbol_to_id
        txt_filt = "".join(ch for ch in text if ch in val_chars)
        print(f"text after filtering OOV: {txt_filt}")
        return txt_filt
|
49 |
+
|
50 |
+
class MMS():
    """Wrapper around a VITS ``SynthesizerTrn`` checkpoint stored in a directory."""

    def __init__(self, model_path: Union[str, Path]):
        """Load hyper-parameters, vocabulary and generator weights.

        Args:
            model_path: Directory containing ``config.json``, ``vocab.txt``
                and ``G_100000.pth``.

        Raises:
            FileNotFoundError: If ``config.json`` is missing from the directory.
        """
        ckpt_dir = model_path
        vocab_file = f"{ckpt_dir}/vocab.txt"
        config_file = f"{ckpt_dir}/config.json"
        # Raise explicitly: an assert would be stripped under ``python -O``.
        if not os.path.isfile(config_file):
            raise FileNotFoundError(f"{config_file} doesn't exist")
        self.hps = utils.get_hparams_from_file(config_file)
        self.text_mapper = TextMapper(vocab_file)
        self.net_g = SynthesizerTrn(
            len(self.text_mapper.symbols),
            self.hps.data.filter_length // 2 + 1,
            self.hps.train.segment_size // self.hps.data.hop_length,
            **self.hps.model)
        g_pth = f"{ckpt_dir}/G_100000.pth"
        print(f"load {g_pth}")

        _ = utils.load_checkpoint(g_pth, self.net_g, None)
        # Switch to inference mode so training-only layers such as dropout
        # are disabled when synthesizing.
        self.net_g.eval()

    def synthesize(self, wav_path: str, txt):
        """Synthesize *txt* and write the audio to *wav_path*.

        The text is lower-cased and characters missing from the vocabulary
        are dropped before inference.

        Args:
            wav_path: Destination WAV file; its parent directory is created
                when the path has one.
            txt: Text to speak.
        """
        print(f"text: {txt}")
        txt = txt.lower()
        txt = self.text_mapper.filter_oov(txt)
        stn_tst = self.text_mapper.get_text(txt, self.hps)
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
            hyp = self.net_g.infer(
                x_tst, x_tst_lengths, noise_scale=.667,
                noise_scale_w=0.8, length_scale=1.0
            )[0][0,0].cpu().float().numpy()

        # A bare file name has an empty dirname, which os.makedirs rejects.
        out_dir = os.path.dirname(wav_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        print(f"wav: {wav_path}")
        write(wav_path, self.hps.data.sampling_rate, hyp)
        return
|
models/bsc/best_model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b15fa7d2052bada1cf421e49d2d03b00e95b49fcd0e42b7af1d92da2880cdecc
|
3 |
+
size 1038659133
|
models/bsc/config.json
ADDED
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"output_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/experiments_from_previous",
|
3 |
+
"logger_uri": null,
|
4 |
+
"run_name": "multispeaker_vits_ca_1e4_1e4_32",
|
5 |
+
"project_name": null,
|
6 |
+
"run_description": "\ud83d\udc38Coqui trainer run.",
|
7 |
+
"print_step": 25,
|
8 |
+
"plot_step": 100,
|
9 |
+
"model_param_stats": false,
|
10 |
+
"wandb_entity": null,
|
11 |
+
"dashboard_logger": "tensorboard",
|
12 |
+
"log_model_step": 1000,
|
13 |
+
"save_step": 1000,
|
14 |
+
"save_n_checkpoints": 5,
|
15 |
+
"save_checkpoints": true,
|
16 |
+
"save_all_best": true,
|
17 |
+
"save_best_after": 10000,
|
18 |
+
"target_loss": null,
|
19 |
+
"print_eval": true,
|
20 |
+
"test_delay_epochs": -1,
|
21 |
+
"run_eval": true,
|
22 |
+
"run_eval_steps": null,
|
23 |
+
"distributed_backend": "nccl",
|
24 |
+
"distributed_url": "tcp://localhost:54321",
|
25 |
+
"mixed_precision": false,
|
26 |
+
"epochs": 1000,
|
27 |
+
"batch_size": 16,
|
28 |
+
"eval_batch_size": 8,
|
29 |
+
"grad_clip": [
|
30 |
+
1000.0,
|
31 |
+
1000.0
|
32 |
+
],
|
33 |
+
"scheduler_after_epoch": true,
|
34 |
+
"lr": 0.001,
|
35 |
+
"optimizer": "AdamW",
|
36 |
+
"optimizer_params": {
|
37 |
+
"betas": [
|
38 |
+
0.8,
|
39 |
+
0.99
|
40 |
+
],
|
41 |
+
"eps": 1e-09,
|
42 |
+
"weight_decay": 0.01
|
43 |
+
},
|
44 |
+
"lr_scheduler": "",
|
45 |
+
"lr_scheduler_params": null,
|
46 |
+
"use_grad_scaler": false,
|
47 |
+
"cudnn_enable": true,
|
48 |
+
"cudnn_deterministic": false,
|
49 |
+
"cudnn_benchmark": false,
|
50 |
+
"training_seed": 54321,
|
51 |
+
"model": "vits",
|
52 |
+
"num_loader_workers": 4,
|
53 |
+
"num_eval_loader_workers": 4,
|
54 |
+
"use_noise_augment": false,
|
55 |
+
"audio": {
|
56 |
+
"fft_size": 1024,
|
57 |
+
"sample_rate": 22050,
|
58 |
+
"win_length": 1024,
|
59 |
+
"hop_length": 256,
|
60 |
+
"num_mels": 80,
|
61 |
+
"mel_fmin": 0,
|
62 |
+
"mel_fmax": null
|
63 |
+
},
|
64 |
+
"use_phonemes": true,
|
65 |
+
"phonemizer": "espeak",
|
66 |
+
"phoneme_language": "ca",
|
67 |
+
"compute_input_seq_cache": true,
|
68 |
+
"text_cleaner": "multilingual_cleaners",
|
69 |
+
"enable_eos_bos_chars": false,
|
70 |
+
"test_sentences_file": "",
|
71 |
+
"phoneme_cache_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/phoneme_cache",
|
72 |
+
"characters": {
|
73 |
+
"characters_class": "TTS.tts.utils.text.characters.IPAPhonemes",
|
74 |
+
"vocab_dict": null,
|
75 |
+
"pad": "<PAD>",
|
76 |
+
"eos": "<EOS>",
|
77 |
+
"bos": "<BOS>",
|
78 |
+
"blank": "<BLNK>",
|
79 |
+
"characters": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b",
|
80 |
+
"punctuations": "!'(),-.:;? ",
|
81 |
+
"phonemes": null,
|
82 |
+
"is_unique": false,
|
83 |
+
"is_sorted": true
|
84 |
+
},
|
85 |
+
"add_blank": true,
|
86 |
+
"batch_group_size": 5,
|
87 |
+
"loss_masking": null,
|
88 |
+
"min_audio_len": 1,
|
89 |
+
"max_audio_len": Infinity,
|
90 |
+
"min_text_len": 1,
|
91 |
+
"max_text_len": 325,
|
92 |
+
"compute_f0": false,
|
93 |
+
"compute_linear_spec": true,
|
94 |
+
"precompute_num_workers": 0,
|
95 |
+
"start_by_longest": false,
|
96 |
+
"datasets": [
|
97 |
+
{
|
98 |
+
"formatter": "vctk_old",
|
99 |
+
"dataset_name": "vctk_old",
|
100 |
+
"path": "/gpfs/scratch/bsc88/bsc88474/data/multispeaker_ca",
|
101 |
+
"meta_file_train": "",
|
102 |
+
"ignored_speakers": [
|
103 |
+
"uri",
|
104 |
+
"09796",
|
105 |
+
"05450"
|
106 |
+
],
|
107 |
+
"language": "ca",
|
108 |
+
"meta_file_val": "",
|
109 |
+
"meta_file_attn_mask": ""
|
110 |
+
}
|
111 |
+
],
|
112 |
+
"test_sentences": [
|
113 |
+
[
|
114 |
+
"Per exemple, dels nostres bancs que inverteixen en armament de les nostres empreses."
|
115 |
+
],
|
116 |
+
[
|
117 |
+
"Preguntin-se si aix\u00f2 era necessari."
|
118 |
+
],
|
119 |
+
[
|
120 |
+
"La suposada ocultaci\u00f3 dels informes que advertien de risc s\u00edsmic."
|
121 |
+
],
|
122 |
+
[
|
123 |
+
"\u00c9s de 633 milions d'euros quan es far\u00e0 la publicaci\u00f3 detallada."
|
124 |
+
]
|
125 |
+
],
|
126 |
+
"eval_split_max_size": null,
|
127 |
+
"eval_split_size": 0.01,
|
128 |
+
"use_speaker_weighted_sampler": false,
|
129 |
+
"speaker_weighted_sampler_alpha": 1.0,
|
130 |
+
"use_language_weighted_sampler": false,
|
131 |
+
"language_weighted_sampler_alpha": 1.0,
|
132 |
+
"use_length_weighted_sampler": false,
|
133 |
+
"length_weighted_sampler_alpha": 1.0,
|
134 |
+
"model_args": {
|
135 |
+
"num_chars": 131,
|
136 |
+
"out_channels": 513,
|
137 |
+
"spec_segment_size": 32,
|
138 |
+
"hidden_channels": 192,
|
139 |
+
"hidden_channels_ffn_text_encoder": 768,
|
140 |
+
"num_heads_text_encoder": 2,
|
141 |
+
"num_layers_text_encoder": 6,
|
142 |
+
"kernel_size_text_encoder": 3,
|
143 |
+
"dropout_p_text_encoder": 0.1,
|
144 |
+
"dropout_p_duration_predictor": 0.5,
|
145 |
+
"kernel_size_posterior_encoder": 5,
|
146 |
+
"dilation_rate_posterior_encoder": 1,
|
147 |
+
"num_layers_posterior_encoder": 16,
|
148 |
+
"kernel_size_flow": 5,
|
149 |
+
"dilation_rate_flow": 1,
|
150 |
+
"num_layers_flow": 4,
|
151 |
+
"resblock_type_decoder": "1",
|
152 |
+
"resblock_kernel_sizes_decoder": [
|
153 |
+
3,
|
154 |
+
7,
|
155 |
+
11
|
156 |
+
],
|
157 |
+
"resblock_dilation_sizes_decoder": [
|
158 |
+
[
|
159 |
+
1,
|
160 |
+
3,
|
161 |
+
5
|
162 |
+
],
|
163 |
+
[
|
164 |
+
1,
|
165 |
+
3,
|
166 |
+
5
|
167 |
+
],
|
168 |
+
[
|
169 |
+
1,
|
170 |
+
3,
|
171 |
+
5
|
172 |
+
]
|
173 |
+
],
|
174 |
+
"upsample_rates_decoder": [
|
175 |
+
8,
|
176 |
+
8,
|
177 |
+
2,
|
178 |
+
2
|
179 |
+
],
|
180 |
+
"upsample_initial_channel_decoder": 512,
|
181 |
+
"upsample_kernel_sizes_decoder": [
|
182 |
+
16,
|
183 |
+
16,
|
184 |
+
4,
|
185 |
+
4
|
186 |
+
],
|
187 |
+
"periods_multi_period_discriminator": [
|
188 |
+
2,
|
189 |
+
3,
|
190 |
+
5,
|
191 |
+
7,
|
192 |
+
11
|
193 |
+
],
|
194 |
+
"use_sdp": true,
|
195 |
+
"noise_scale": 1.0,
|
196 |
+
"inference_noise_scale": 0.667,
|
197 |
+
"length_scale": 1.0,
|
198 |
+
"noise_scale_dp": 1.0,
|
199 |
+
"inference_noise_scale_dp": 1.0,
|
200 |
+
"max_inference_len": null,
|
201 |
+
"init_discriminator": true,
|
202 |
+
"use_spectral_norm_disriminator": false,
|
203 |
+
"use_speaker_embedding": true,
|
204 |
+
"num_speakers": 257,
|
205 |
+
"speakers_file": "/home/user/app/models/bsc/speakers.pth",
|
206 |
+
"d_vector_file": null,
|
207 |
+
"speaker_embedding_channels": 256,
|
208 |
+
"use_d_vector_file": false,
|
209 |
+
"d_vector_dim": 0,
|
210 |
+
"detach_dp_input": true,
|
211 |
+
"use_language_embedding": false,
|
212 |
+
"embedded_language_dim": 4,
|
213 |
+
"num_languages": 0,
|
214 |
+
"language_ids_file": null,
|
215 |
+
"use_speaker_encoder_as_loss": false,
|
216 |
+
"speaker_encoder_config_path": "",
|
217 |
+
"speaker_encoder_model_path": "",
|
218 |
+
"condition_dp_on_speaker": true,
|
219 |
+
"freeze_encoder": false,
|
220 |
+
"freeze_DP": false,
|
221 |
+
"freeze_PE": false,
|
222 |
+
"freeze_flow_decoder": false,
|
223 |
+
"freeze_waveform_decoder": false,
|
224 |
+
"encoder_sample_rate": null,
|
225 |
+
"interpolate_z": true,
|
226 |
+
"reinit_DP": false,
|
227 |
+
"reinit_text_encoder": false
|
228 |
+
},
|
229 |
+
"lr_gen": 0.0001,
|
230 |
+
"lr_disc": 0.0001,
|
231 |
+
"lr_scheduler_gen": "ExponentialLR",
|
232 |
+
"lr_scheduler_gen_params": {
|
233 |
+
"gamma": 0.999875,
|
234 |
+
"last_epoch": -1
|
235 |
+
},
|
236 |
+
"lr_scheduler_disc": "ExponentialLR",
|
237 |
+
"lr_scheduler_disc_params": {
|
238 |
+
"gamma": 0.999875,
|
239 |
+
"last_epoch": -1
|
240 |
+
},
|
241 |
+
"kl_loss_alpha": 1.0,
|
242 |
+
"disc_loss_alpha": 1.0,
|
243 |
+
"gen_loss_alpha": 1.0,
|
244 |
+
"feat_loss_alpha": 1.0,
|
245 |
+
"mel_loss_alpha": 45.0,
|
246 |
+
"dur_loss_alpha": 1.0,
|
247 |
+
"speaker_encoder_loss_alpha": 1.0,
|
248 |
+
"return_wav": true,
|
249 |
+
"use_weighted_sampler": false,
|
250 |
+
"weighted_sampler_attrs": null,
|
251 |
+
"weighted_sampler_multipliers": null,
|
252 |
+
"r": 1,
|
253 |
+
"num_speakers": 257,
|
254 |
+
"use_speaker_embedding": true,
|
255 |
+
"speakers_file": "/home/user/app/models/bsc/speakers.pth",
|
256 |
+
"speaker_embedding_channels": 256,
|
257 |
+
"language_ids_file": null,
|
258 |
+
"use_language_embedding": false,
|
259 |
+
"use_d_vector_file": false,
|
260 |
+
"d_vector_file": null,
|
261 |
+
"d_vector_dim": 0
|
262 |
+
}
|
models/bsc/speaker_map.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"f_cen_05": "05739",
|
3 |
+
"f_cen_81": "8162d651b6211f06f655a69cd7fdd383d6b4287e9ba132b9898ef9ac8687349e777626333d23bed93f9264aae965efb14ed650cb64fd0ad90494aff903eaef11",
|
4 |
+
"f_occ_31": "31535cb2ece4710d08fdbeefb6f8f75ed093fee4cf8573bd601d960f8c6156f0fd0a85712761691e86e31160b993ee0eacb10c4c8aed000cc394cf7c7d207a7e",
|
5 |
+
"f_occ_de": "dee065b956b99b10db4763759d64c41791af1a7e77f1864f90a2b0847a12633dcf9bc108db7eaf73cc8d0e750f5c37383a56cd77cc2276d3960104c6bebe6346",
|
6 |
+
"f_sep_31": "31e6f3a011661320b2e59b6f8be43f6db2243e9feabc2b9787c1413788e13eb0e5810bed983bf7ff66e46417d183a91ed50b3b9be9d89e4f51aada72293b9881",
|
7 |
+
"m_cen_08": "08935",
|
8 |
+
"m_occ_44": "30b1f81c579755895581259d79a8a5a3ca45b908b0bd14ad1c6418f39aa1e2f47cb4749c69b5440cdb92e3bafb772e19e7bc2b16d196b061addd173a1309e491",
|
9 |
+
"m_val_89": "896256329fbeb5b8116349c31d8a39a7d36d5f970d48558e1db5417d611e240e4dbf473f6e49137f7aa6116394b7deabb0bbec4a014896cdc9484ee91458117d"
|
10 |
+
}
|
models/bsc/speakers.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6dacda0b8dd3e111c5072f8f33c08b4a29b92ac79aaf22ceca912d01e7deb905
|
3 |
+
size 30191
|
models/collectivat/catotron-ona-TTS-API-entry.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"voice": "ona-fast-hifigan",
|
3 |
+
"lang": "ca",
|
4 |
+
"model_type": "coqui",
|
5 |
+
"tts_config_path": "fast-speech_config.json",
|
6 |
+
"tts_model_path": "fast-speech_best_model.pth",
|
7 |
+
"vocoder_config_path": "ljspeech--hifigan_v2_config.json",
|
8 |
+
"vocoder_model_path": "ljspeech--hifigan_v2_model_file.pth",
|
9 |
+
"load": true
|
10 |
+
}
|
models/collectivat/fast-speech_best_model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3a5aefb9f49f6172e34b816e1de8f5234012f0a9a05747973f6610e40869983f
|
3 |
+
size 457921637
|
models/collectivat/fast-speech_config.json
ADDED
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"output_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron",
|
3 |
+
"logger_uri": null,
|
4 |
+
"run_name": "fast_pitch_ljspeech",
|
5 |
+
"project_name": null,
|
6 |
+
"run_description": "\ud83d\udc38Coqui trainer run.",
|
7 |
+
"print_step": 50,
|
8 |
+
"plot_step": 100,
|
9 |
+
"model_param_stats": false,
|
10 |
+
"wandb_entity": null,
|
11 |
+
"dashboard_logger": "tensorboard",
|
12 |
+
"log_model_step": null,
|
13 |
+
"save_step": 10000,
|
14 |
+
"save_n_checkpoints": 5,
|
15 |
+
"save_checkpoints": true,
|
16 |
+
"save_all_best": false,
|
17 |
+
"save_best_after": 1000,
|
18 |
+
"target_loss": null,
|
19 |
+
"print_eval": false,
|
20 |
+
"test_delay_epochs": -1,
|
21 |
+
"run_eval": true,
|
22 |
+
"run_eval_steps": null,
|
23 |
+
"distributed_backend": "nccl",
|
24 |
+
"distributed_url": "tcp://localhost:54321",
|
25 |
+
"mixed_precision": false,
|
26 |
+
"epochs": 1000,
|
27 |
+
"batch_size": 16,
|
28 |
+
"eval_batch_size": 16,
|
29 |
+
"grad_clip": 5.0,
|
30 |
+
"scheduler_after_epoch": true,
|
31 |
+
"lr": 0.0001,
|
32 |
+
"optimizer": "Adam",
|
33 |
+
"optimizer_params": {
|
34 |
+
"betas": [
|
35 |
+
0.9,
|
36 |
+
0.998
|
37 |
+
],
|
38 |
+
"weight_decay": 1e-06
|
39 |
+
},
|
40 |
+
"lr_scheduler": "NoamLR",
|
41 |
+
"lr_scheduler_params": {
|
42 |
+
"warmup_steps": 4000
|
43 |
+
},
|
44 |
+
"use_grad_scaler": false,
|
45 |
+
"cudnn_enable": true,
|
46 |
+
"cudnn_deterministic": false,
|
47 |
+
"cudnn_benchmark": false,
|
48 |
+
"training_seed": 54321,
|
49 |
+
"model": "fast_pitch",
|
50 |
+
"num_loader_workers": 8,
|
51 |
+
"num_eval_loader_workers": 4,
|
52 |
+
"use_noise_augment": false,
|
53 |
+
"audio": {
|
54 |
+
"fft_size": 1024,
|
55 |
+
"win_length": 1024,
|
56 |
+
"hop_length": 256,
|
57 |
+
"frame_shift_ms": null,
|
58 |
+
"frame_length_ms": null,
|
59 |
+
"stft_pad_mode": "reflect",
|
60 |
+
"sample_rate": 22050,
|
61 |
+
"resample": false,
|
62 |
+
"preemphasis": 0.0,
|
63 |
+
"ref_level_db": 20,
|
64 |
+
"do_sound_norm": false,
|
65 |
+
"log_func": "np.log",
|
66 |
+
"do_trim_silence": true,
|
67 |
+
"trim_db": 60.0,
|
68 |
+
"do_rms_norm": false,
|
69 |
+
"db_level": null,
|
70 |
+
"power": 1.5,
|
71 |
+
"griffin_lim_iters": 60,
|
72 |
+
"num_mels": 80,
|
73 |
+
"mel_fmin": 0.0,
|
74 |
+
"mel_fmax": 8000,
|
75 |
+
"spec_gain": 1.0,
|
76 |
+
"do_amp_to_db_linear": true,
|
77 |
+
"do_amp_to_db_mel": true,
|
78 |
+
"pitch_fmax": 640.0,
|
79 |
+
"pitch_fmin": 0.0,
|
80 |
+
"signal_norm": false,
|
81 |
+
"min_level_db": -100,
|
82 |
+
"symmetric_norm": true,
|
83 |
+
"max_norm": 4.0,
|
84 |
+
"clip_norm": true,
|
85 |
+
"stats_path": null
|
86 |
+
},
|
87 |
+
"use_phonemes": false,
|
88 |
+
"phonemizer": null,
|
89 |
+
"phoneme_language": "ca-es",
|
90 |
+
"compute_input_seq_cache": true,
|
91 |
+
"text_cleaner": "multilingual_cleaners",
|
92 |
+
"enable_eos_bos_chars": false,
|
93 |
+
"test_sentences_file": "",
|
94 |
+
"phoneme_cache_path": null,
|
95 |
+
"characters": {
|
96 |
+
"characters_class": "TTS.tts.utils.text.characters.Graphemes",
|
97 |
+
"vocab_dict": null,
|
98 |
+
"pad": "_",
|
99 |
+
"eos": "*",
|
100 |
+
"bos": "^",
|
101 |
+
"blank": null,
|
102 |
+
"characters": "A\u00c0\u00c1BC\u00c7DE\u00c9\u00c8FGHI\u00cd\u00cfJKLMNO\u00d3\u00d2PQRSTU\u00dc\u00daVWXYZa\u00e0\u00e1bc\u00e7de\u00e9\u00e8fghi\u00ed\u00efjklmno\u00f3\u00f2pqrstu\u00fc\u00favwxyz",
|
103 |
+
"punctuations": "!'(),-.:;?\u00b7 ",
|
104 |
+
"phonemes": "",
|
105 |
+
"is_unique": true,
|
106 |
+
"is_sorted": true
|
107 |
+
},
|
108 |
+
"add_blank": false,
|
109 |
+
"batch_group_size": 0,
|
110 |
+
"loss_masking": null,
|
111 |
+
"min_audio_len": 1,
|
112 |
+
"max_audio_len": Infinity,
|
113 |
+
"min_text_len": 1,
|
114 |
+
"max_text_len": Infinity,
|
115 |
+
"compute_f0": true,
|
116 |
+
"compute_linear_spec": false,
|
117 |
+
"precompute_num_workers": 4,
|
118 |
+
"start_by_longest": false,
|
119 |
+
"datasets": [
|
120 |
+
{
|
121 |
+
"name": "custom_turkish",
|
122 |
+
"path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
|
123 |
+
"meta_file_train": "upc_ona_train.txt",
|
124 |
+
"ignored_speakers": null,
|
125 |
+
"language": "",
|
126 |
+
"meta_file_val": "",
|
127 |
+
"meta_file_attn_mask": ""
|
128 |
+
},
|
129 |
+
{
|
130 |
+
"name": "custom_turkish",
|
131 |
+
"path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
|
132 |
+
"meta_file_train": "upc_ona_val.txt",
|
133 |
+
"ignored_speakers": null,
|
134 |
+
"language": "",
|
135 |
+
"meta_file_val": "",
|
136 |
+
"meta_file_attn_mask": ""
|
137 |
+
}
|
138 |
+
],
|
139 |
+
"test_sentences": [
|
140 |
+
"Hola Barcelona!",
|
141 |
+
"Escriviu al text."
|
142 |
+
],
|
143 |
+
"eval_split_max_size": null,
|
144 |
+
"eval_split_size": 0.01,
|
145 |
+
"use_speaker_weighted_sampler": false,
|
146 |
+
"speaker_weighted_sampler_alpha": 1.0,
|
147 |
+
"use_language_weighted_sampler": false,
|
148 |
+
"language_weighted_sampler_alpha": 1.0,
|
149 |
+
"use_length_weighted_sampler": false,
|
150 |
+
"length_weighted_sampler_alpha": 1.0,
|
151 |
+
"base_model": "forward_tts",
|
152 |
+
"model_args": {
|
153 |
+
"num_chars": 89,
|
154 |
+
"out_channels": 80,
|
155 |
+
"hidden_channels": 384,
|
156 |
+
"use_aligner": true,
|
157 |
+
"use_pitch": true,
|
158 |
+
"pitch_predictor_hidden_channels": 256,
|
159 |
+
"pitch_predictor_kernel_size": 3,
|
160 |
+
"pitch_predictor_dropout_p": 0.1,
|
161 |
+
"pitch_embedding_kernel_size": 3,
|
162 |
+
"duration_predictor_hidden_channels": 256,
|
163 |
+
"duration_predictor_kernel_size": 3,
|
164 |
+
"duration_predictor_dropout_p": 0.1,
|
165 |
+
"positional_encoding": true,
|
166 |
+
"poisitonal_encoding_use_scale": true,
|
167 |
+
"length_scale": 1,
|
168 |
+
"encoder_type": "fftransformer",
|
169 |
+
"encoder_params": {
|
170 |
+
"hidden_channels_ffn": 1024,
|
171 |
+
"num_heads": 1,
|
172 |
+
"num_layers": 6,
|
173 |
+
"dropout_p": 0.1
|
174 |
+
},
|
175 |
+
"decoder_type": "fftransformer",
|
176 |
+
"decoder_params": {
|
177 |
+
"hidden_channels_ffn": 1024,
|
178 |
+
"num_heads": 1,
|
179 |
+
"num_layers": 6,
|
180 |
+
"dropout_p": 0.1
|
181 |
+
},
|
182 |
+
"detach_duration_predictor": false,
|
183 |
+
"max_duration": 75,
|
184 |
+
"num_speakers": 1,
|
185 |
+
"use_speaker_embedding": false,
|
186 |
+
"speakers_file": null,
|
187 |
+
"use_d_vector_file": false,
|
188 |
+
"d_vector_dim": null,
|
189 |
+
"d_vector_file": null
|
190 |
+
},
|
191 |
+
"num_speakers": 0,
|
192 |
+
"speakers_file": null,
|
193 |
+
"use_speaker_embedding": false,
|
194 |
+
"use_d_vector_file": false,
|
195 |
+
"d_vector_file": false,
|
196 |
+
"d_vector_dim": 0,
|
197 |
+
"spec_loss_type": "mse",
|
198 |
+
"duration_loss_type": "mse",
|
199 |
+
"use_ssim_loss": true,
|
200 |
+
"ssim_loss_alpha": 1.0,
|
201 |
+
"spec_loss_alpha": 1.0,
|
202 |
+
"aligner_loss_alpha": 1.0,
|
203 |
+
"pitch_loss_alpha": 0.1,
|
204 |
+
"dur_loss_alpha": 0.1,
|
205 |
+
"binary_align_loss_alpha": 0.1,
|
206 |
+
"binary_loss_warmup_epochs": 150,
|
207 |
+
"min_seq_len": 13,
|
208 |
+
"max_seq_len": 500000,
|
209 |
+
"r": 1,
|
210 |
+
"f0_cache_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/f0_cache",
|
211 |
+
"restore_path": "/home/twbgmy/.local/share/tts/tts_models--en--ljspeech--fast_pitch/model_file.pth",
|
212 |
+
"github_branch": "* dev"
|
213 |
+
}
|
models/collectivat/ljspeech--hifigan_v2_config.json
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"run_name": "hifigan",
|
3 |
+
"run_description": "universal hifigan trained on LibriTTS with no spectrogram normalization and using log() for scaling instead of log10()",
|
4 |
+
|
5 |
+
|
6 |
+
// AUDIO PARAMETERS
|
7 |
+
"audio":{
|
8 |
+
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectrogram frame.
|
9 |
+
"win_length": 1024, // stft window length in samples.
|
10 |
+
"hop_length": 256, // stft window hop-length in samples.
|
11 |
+
"frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
|
12 |
+
"frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used.
|
13 |
+
|
14 |
+
// Audio processing parameters
|
15 |
+
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
|
16 |
+
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
|
17 |
+
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
18 |
+
"log_func": "np.log",
|
19 |
+
|
20 |
+
// Silence trimming
|
21 |
+
"do_trim_silence": false,// enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
|
22 |
+
"trim_db": 60, // threshold for trimming silence. Set this according to your dataset.
|
23 |
+
|
24 |
+
// MelSpectrogram parameters
|
25 |
+
"num_mels": 80, // size of the mel spec frame.
|
26 |
+
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
27 |
+
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
|
28 |
+
"spec_gain": 1.0, // scalar value applied after log transform of spectrogram.
|
29 |
+
|
30 |
+
// Normalization parameters
|
31 |
+
"signal_norm": false, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
|
32 |
+
"min_level_db": -100, // lower bound for normalization
|
33 |
+
"symmetric_norm": true, // move normalization to range [-1, 1]
|
34 |
+
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
35 |
+
"clip_norm": true, // clip normalized values into the range.
|
36 |
+
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
|
37 |
+
},
|
38 |
+
|
39 |
+
// DISTRIBUTED TRAINING
|
40 |
+
"distributed":{
|
41 |
+
"backend": "nccl",
|
42 |
+
"url": "tcp:\/\/localhost:54324"
|
43 |
+
},
|
44 |
+
|
45 |
+
// MODEL PARAMETERS
|
46 |
+
"use_pqmf": false,
|
47 |
+
|
48 |
+
// LOSS PARAMETERS
|
49 |
+
"use_stft_loss": false,
|
50 |
+
"use_subband_stft_loss": false,
|
51 |
+
"use_mse_gan_loss": true,
|
52 |
+
"use_hinge_gan_loss": false,
|
53 |
+
"use_feat_match_loss": true, // use only with melgan discriminators
|
54 |
+
"use_l1_spec_loss": true,
|
55 |
+
|
56 |
+
// loss weights
|
57 |
+
"stft_loss_weight": 0,
|
58 |
+
"subband_stft_loss_weight": 0,
|
59 |
+
"mse_G_loss_weight": 1,
|
60 |
+
"hinge_G_loss_weight": 0,
|
61 |
+
"feat_match_loss_weight": 10,
|
62 |
+
"l1_spec_loss_weight": 45,
|
63 |
+
|
64 |
+
// multiscale stft loss parameters
|
65 |
+
// "stft_loss_params": {
|
66 |
+
// "n_ffts": [1024, 2048, 512],
|
67 |
+
// "hop_lengths": [120, 240, 50],
|
68 |
+
// "win_lengths": [600, 1200, 240]
|
69 |
+
// },
|
70 |
+
|
71 |
+
"l1_spec_loss_params": {
|
72 |
+
"use_mel": true,
|
73 |
+
"sample_rate": 16000,
|
74 |
+
"n_fft": 1024,
|
75 |
+
"hop_length": 256,
|
76 |
+
"win_length": 1024,
|
77 |
+
"n_mels": 80,
|
78 |
+
"mel_fmin": 0.0,
|
79 |
+
"mel_fmax": null
|
80 |
+
},
|
81 |
+
|
82 |
+
"target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch
|
83 |
+
|
84 |
+
// DISCRIMINATOR
|
85 |
+
"discriminator_model": "hifigan_discriminator",
|
86 |
+
//"discriminator_model_params":{
|
87 |
+
// "peroids": [2, 3, 5, 7, 11],
|
88 |
+
// "base_channels": 16,
|
89 |
+
// "max_channels":512,
|
90 |
+
// "downsample_factors":[4, 4, 4]
|
91 |
+
//},
|
92 |
+
"steps_to_start_discriminator": 0, // steps required to start GAN training.
|
93 |
+
|
94 |
+
// GENERATOR
|
95 |
+
"generator_model": "hifigan_generator",
|
96 |
+
"generator_model_params": {
|
97 |
+
"resblock_type": "1",
|
98 |
+
"upsample_factors": [8,8,2,2],
|
99 |
+
"upsample_kernel_sizes": [16,16,4,4],
|
100 |
+
"upsample_initial_channel": 128,
|
101 |
+
"resblock_kernel_sizes": [3,7,11],
|
102 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]]
|
103 |
+
},
|
104 |
+
|
105 |
+
// DATASET
|
106 |
+
"data_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/vo_voice_quality_transformation/",
|
107 |
+
"feature_path": null,
|
108 |
+
// "feature_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/tacotron-DCA/",
|
109 |
+
"seq_len": 8192,
|
110 |
+
"pad_short": 2000,
|
111 |
+
"conv_pad": 0,
|
112 |
+
"use_noise_augment": false,
|
113 |
+
"use_cache": true,
|
114 |
+
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
|
115 |
+
|
116 |
+
// TRAINING
|
117 |
+
"batch_size": 16, // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.
|
118 |
+
|
119 |
+
// VALIDATION
|
120 |
+
"run_eval": true,
|
121 |
+
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
|
122 |
+
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
|
123 |
+
|
124 |
+
// OPTIMIZER
|
125 |
+
"epochs": 10000, // total number of epochs to train.
|
126 |
+
"wd": 0.0, // Weight decay weight.
|
127 |
+
"gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0
|
128 |
+
"disc_clip_grad": -1, // Discriminator gradient clipping threshold.
|
129 |
+
// "lr_scheduler_gen": "ExponentialLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
130 |
+
// "lr_scheduler_gen_params": {
|
131 |
+
// "gamma": 0.999,
|
132 |
+
// "last_epoch": -1
|
133 |
+
// },
|
134 |
+
// "lr_scheduler_disc": "ExponentialLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
135 |
+
// "lr_scheduler_disc_params": {
|
136 |
+
// "gamma": 0.999,
|
137 |
+
// "last_epoch": -1
|
138 |
+
// },
|
139 |
+
"lr_gen": 0.00001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
140 |
+
"lr_disc": 0.00001,
|
141 |
+
|
142 |
+
// TENSORBOARD and LOGGING
|
143 |
+
"print_step": 25, // Number of steps to log training on console.
|
144 |
+
"print_eval": false, // If True, it prints loss values for each step in eval run.
|
145 |
+
"save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
|
146 |
+
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
147 |
+
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
148 |
+
|
149 |
+
// DATA LOADING
|
150 |
+
"num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
151 |
+
"num_val_loader_workers": 4, // number of evaluation data loader processes.
|
152 |
+
"eval_split_size": 10,
|
153 |
+
|
154 |
+
// PATHS
|
155 |
+
"output_path": "/home/erogol/gdrive/Trainings/sam/"
|
156 |
+
}
|
157 |
+
|
158 |
+
|
models/collectivat/ljspeech--hifigan_v2_model_file.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4047e93886faa1aba11948efa71f59dcb0ec9117e286660e59b91892ef98d129
|
3 |
+
size 3794153
|
models/mms/G_100000.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0382edd70333f8ddc663177e672c8a66312e1b30f7929a8f9d458ef66f6b5349
|
3 |
+
size 436622793
|
models/mms/config.json
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"train": {
|
3 |
+
"log_interval": 200,
|
4 |
+
"eval_interval": 1000,
|
5 |
+
"seed": 1234,
|
6 |
+
"epochs": 20000,
|
7 |
+
"learning_rate": 0.0002,
|
8 |
+
"betas": [
|
9 |
+
0.8,
|
10 |
+
0.99
|
11 |
+
],
|
12 |
+
"eps": 1e-09,
|
13 |
+
"batch_size": 64,
|
14 |
+
"fp16_run": true,
|
15 |
+
"lr_decay": 0.999875,
|
16 |
+
"segment_size": 8192,
|
17 |
+
"init_lr_ratio": 1,
|
18 |
+
"warmup_epochs": 0,
|
19 |
+
"c_mel": 45,
|
20 |
+
"c_kl": 1.0
|
21 |
+
},
|
22 |
+
"data": {
|
23 |
+
"training_files": "train.ltr",
|
24 |
+
"validation_files": "dev.ltr",
|
25 |
+
"text_cleaners": [
|
26 |
+
"transliteration_cleaners"
|
27 |
+
],
|
28 |
+
"max_wav_value": 32768.0,
|
29 |
+
"sampling_rate": 16000,
|
30 |
+
"filter_length": 1024,
|
31 |
+
"hop_length": 256,
|
32 |
+
"win_length": 1024,
|
33 |
+
"n_mel_channels": 80,
|
34 |
+
"mel_fmin": 0.0,
|
35 |
+
"mel_fmax": null,
|
36 |
+
"add_blank": true,
|
37 |
+
"n_speakers": 0,
|
38 |
+
"cleaned_text": true
|
39 |
+
},
|
40 |
+
"model": {
|
41 |
+
"inter_channels": 192,
|
42 |
+
"hidden_channels": 192,
|
43 |
+
"filter_channels": 768,
|
44 |
+
"n_heads": 2,
|
45 |
+
"n_layers": 6,
|
46 |
+
"kernel_size": 3,
|
47 |
+
"p_dropout": 0.1,
|
48 |
+
"resblock": "1",
|
49 |
+
"resblock_kernel_sizes": [
|
50 |
+
3,
|
51 |
+
7,
|
52 |
+
11
|
53 |
+
],
|
54 |
+
"resblock_dilation_sizes": [
|
55 |
+
[
|
56 |
+
1,
|
57 |
+
3,
|
58 |
+
5
|
59 |
+
],
|
60 |
+
[
|
61 |
+
1,
|
62 |
+
3,
|
63 |
+
5
|
64 |
+
],
|
65 |
+
[
|
66 |
+
1,
|
67 |
+
3,
|
68 |
+
5
|
69 |
+
]
|
70 |
+
],
|
71 |
+
"upsample_rates": [
|
72 |
+
8,
|
73 |
+
8,
|
74 |
+
2,
|
75 |
+
2
|
76 |
+
],
|
77 |
+
"upsample_initial_channel": 512,
|
78 |
+
"upsample_kernel_sizes": [
|
79 |
+
16,
|
80 |
+
16,
|
81 |
+
4,
|
82 |
+
4
|
83 |
+
],
|
84 |
+
"n_layers_q": 3,
|
85 |
+
"use_spectral_norm": false
|
86 |
+
}
|
87 |
+
}
|
models/mms/vocab.txt
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
z
|
2 |
+
f
|
3 |
+
i
|
4 |
+
g
|
5 |
+
m
|
6 |
+
o
|
7 |
+
r
|
8 |
+
è
|
9 |
+
h
|
10 |
+
l
|
11 |
+
v
|
12 |
+
à
|
13 |
+
u
|
14 |
+
d
|
15 |
+
ú
|
16 |
+
ç
|
17 |
+
p
|
18 |
+
s
|
19 |
+
'
|
20 |
+
é
|
21 |
+
_
|
22 |
+
-
|
23 |
+
e
|
24 |
+
a
|
25 |
+
—
|
26 |
+
x
|
27 |
+
ü
|
28 |
+
q
|
29 |
+
t
|
30 |
+
b
|
31 |
+
í
|
32 |
+
ó
|
33 |
+
ï
|
34 |
+
ò
|
35 |
+
|
36 |
+
c
|
37 |
+
j
|
38 |
+
n
|
39 |
+
y
|
models/piper/MODEL_CARD
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Model card for upc_ona (x-low)
|
2 |
+
|
3 |
+
* Language: ca (Catalan)
|
4 |
+
* Speakers: 1
|
5 |
+
* Quality: x-low
|
6 |
+
* Samplerate: 16,000Hz
|
7 |
+
|
8 |
+
## Dataset
|
9 |
+
|
10 |
+
* URL: https://collectivat.cat/asr#upc-festcat-tts-corpora
|
11 |
+
* License: CC BY-SA 3.0 ES
|
12 |
+
|
13 |
+
## Training
|
14 |
+
|
15 |
+
Trained from scratch.
|
models/piper/ca-upc_ona-x-low.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:13661d26423e0c791823823a5971f4e1aaf644a62e65e0e94d299c0e70560e14
|
3 |
+
size 20628813
|
models/piper/ca-upc_ona-x-low.onnx.json
ADDED
@@ -0,0 +1,409 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio": {
|
3 |
+
"sample_rate": 16000
|
4 |
+
},
|
5 |
+
"espeak": {
|
6 |
+
"voice": "ca"
|
7 |
+
},
|
8 |
+
"inference": {
|
9 |
+
"noise_scale": 0.667,
|
10 |
+
"length_scale": 1,
|
11 |
+
"noise_w": 0.8
|
12 |
+
},
|
13 |
+
"phoneme_map": {},
|
14 |
+
"phoneme_id_map": {
|
15 |
+
"_": [
|
16 |
+
0
|
17 |
+
],
|
18 |
+
"^": [
|
19 |
+
1
|
20 |
+
],
|
21 |
+
"$": [
|
22 |
+
2
|
23 |
+
],
|
24 |
+
" ": [
|
25 |
+
3
|
26 |
+
],
|
27 |
+
"!": [
|
28 |
+
4
|
29 |
+
],
|
30 |
+
"'": [
|
31 |
+
5
|
32 |
+
],
|
33 |
+
"(": [
|
34 |
+
6
|
35 |
+
],
|
36 |
+
")": [
|
37 |
+
7
|
38 |
+
],
|
39 |
+
",": [
|
40 |
+
8
|
41 |
+
],
|
42 |
+
"-": [
|
43 |
+
9
|
44 |
+
],
|
45 |
+
".": [
|
46 |
+
10
|
47 |
+
],
|
48 |
+
":": [
|
49 |
+
11
|
50 |
+
],
|
51 |
+
";": [
|
52 |
+
12
|
53 |
+
],
|
54 |
+
"?": [
|
55 |
+
13
|
56 |
+
],
|
57 |
+
"a": [
|
58 |
+
14
|
59 |
+
],
|
60 |
+
"b": [
|
61 |
+
15
|
62 |
+
],
|
63 |
+
"c": [
|
64 |
+
16
|
65 |
+
],
|
66 |
+
"d": [
|
67 |
+
17
|
68 |
+
],
|
69 |
+
"e": [
|
70 |
+
18
|
71 |
+
],
|
72 |
+
"f": [
|
73 |
+
19
|
74 |
+
],
|
75 |
+
"h": [
|
76 |
+
20
|
77 |
+
],
|
78 |
+
"i": [
|
79 |
+
21
|
80 |
+
],
|
81 |
+
"j": [
|
82 |
+
22
|
83 |
+
],
|
84 |
+
"k": [
|
85 |
+
23
|
86 |
+
],
|
87 |
+
"l": [
|
88 |
+
24
|
89 |
+
],
|
90 |
+
"m": [
|
91 |
+
25
|
92 |
+
],
|
93 |
+
"n": [
|
94 |
+
26
|
95 |
+
],
|
96 |
+
"o": [
|
97 |
+
27
|
98 |
+
],
|
99 |
+
"p": [
|
100 |
+
28
|
101 |
+
],
|
102 |
+
"q": [
|
103 |
+
29
|
104 |
+
],
|
105 |
+
"r": [
|
106 |
+
30
|
107 |
+
],
|
108 |
+
"s": [
|
109 |
+
31
|
110 |
+
],
|
111 |
+
"t": [
|
112 |
+
32
|
113 |
+
],
|
114 |
+
"u": [
|
115 |
+
33
|
116 |
+
],
|
117 |
+
"v": [
|
118 |
+
34
|
119 |
+
],
|
120 |
+
"w": [
|
121 |
+
35
|
122 |
+
],
|
123 |
+
"x": [
|
124 |
+
36
|
125 |
+
],
|
126 |
+
"y": [
|
127 |
+
37
|
128 |
+
],
|
129 |
+
"z": [
|
130 |
+
38
|
131 |
+
],
|
132 |
+
"æ": [
|
133 |
+
39
|
134 |
+
],
|
135 |
+
"ç": [
|
136 |
+
40
|
137 |
+
],
|
138 |
+
"ð": [
|
139 |
+
41
|
140 |
+
],
|
141 |
+
"ø": [
|
142 |
+
42
|
143 |
+
],
|
144 |
+
"ħ": [
|
145 |
+
43
|
146 |
+
],
|
147 |
+
"ŋ": [
|
148 |
+
44
|
149 |
+
],
|
150 |
+
"œ": [
|
151 |
+
45
|
152 |
+
],
|
153 |
+
"ǀ": [
|
154 |
+
46
|
155 |
+
],
|
156 |
+
"ǁ": [
|
157 |
+
47
|
158 |
+
],
|
159 |
+
"ǂ": [
|
160 |
+
48
|
161 |
+
],
|
162 |
+
"ǃ": [
|
163 |
+
49
|
164 |
+
],
|
165 |
+
"ɐ": [
|
166 |
+
50
|
167 |
+
],
|
168 |
+
"ɑ": [
|
169 |
+
51
|
170 |
+
],
|
171 |
+
"ɒ": [
|
172 |
+
52
|
173 |
+
],
|
174 |
+
"ɓ": [
|
175 |
+
53
|
176 |
+
],
|
177 |
+
"ɔ": [
|
178 |
+
54
|
179 |
+
],
|
180 |
+
"ɕ": [
|
181 |
+
55
|
182 |
+
],
|
183 |
+
"ɖ": [
|
184 |
+
56
|
185 |
+
],
|
186 |
+
"ɗ": [
|
187 |
+
57
|
188 |
+
],
|
189 |
+
"ɘ": [
|
190 |
+
58
|
191 |
+
],
|
192 |
+
"ə": [
|
193 |
+
59
|
194 |
+
],
|
195 |
+
"ɚ": [
|
196 |
+
60
|
197 |
+
],
|
198 |
+
"ɛ": [
|
199 |
+
61
|
200 |
+
],
|
201 |
+
"ɜ": [
|
202 |
+
62
|
203 |
+
],
|
204 |
+
"ɞ": [
|
205 |
+
63
|
206 |
+
],
|
207 |
+
"ɟ": [
|
208 |
+
64
|
209 |
+
],
|
210 |
+
"ɠ": [
|
211 |
+
65
|
212 |
+
],
|
213 |
+
"ɡ": [
|
214 |
+
66
|
215 |
+
],
|
216 |
+
"ɢ": [
|
217 |
+
67
|
218 |
+
],
|
219 |
+
"ɣ": [
|
220 |
+
68
|
221 |
+
],
|
222 |
+
"ɤ": [
|
223 |
+
69
|
224 |
+
],
|
225 |
+
"ɥ": [
|
226 |
+
70
|
227 |
+
],
|
228 |
+
"ɦ": [
|
229 |
+
71
|
230 |
+
],
|
231 |
+
"ɧ": [
|
232 |
+
72
|
233 |
+
],
|
234 |
+
"ɨ": [
|
235 |
+
73
|
236 |
+
],
|
237 |
+
"ɪ": [
|
238 |
+
74
|
239 |
+
],
|
240 |
+
"ɫ": [
|
241 |
+
75
|
242 |
+
],
|
243 |
+
"ɬ": [
|
244 |
+
76
|
245 |
+
],
|
246 |
+
"ɭ": [
|
247 |
+
77
|
248 |
+
],
|
249 |
+
"ɮ": [
|
250 |
+
78
|
251 |
+
],
|
252 |
+
"ɯ": [
|
253 |
+
79
|
254 |
+
],
|
255 |
+
"ɰ": [
|
256 |
+
80
|
257 |
+
],
|
258 |
+
"ɱ": [
|
259 |
+
81
|
260 |
+
],
|
261 |
+
"ɲ": [
|
262 |
+
82
|
263 |
+
],
|
264 |
+
"ɳ": [
|
265 |
+
83
|
266 |
+
],
|
267 |
+
"ɴ": [
|
268 |
+
84
|
269 |
+
],
|
270 |
+
"ɵ": [
|
271 |
+
85
|
272 |
+
],
|
273 |
+
"ɶ": [
|
274 |
+
86
|
275 |
+
],
|
276 |
+
"ɸ": [
|
277 |
+
87
|
278 |
+
],
|
279 |
+
"ɹ": [
|
280 |
+
88
|
281 |
+
],
|
282 |
+
"ɺ": [
|
283 |
+
89
|
284 |
+
],
|
285 |
+
"ɻ": [
|
286 |
+
90
|
287 |
+
],
|
288 |
+
"ɽ": [
|
289 |
+
91
|
290 |
+
],
|
291 |
+
"ɾ": [
|
292 |
+
92
|
293 |
+
],
|
294 |
+
"ʀ": [
|
295 |
+
93
|
296 |
+
],
|
297 |
+
"ʁ": [
|
298 |
+
94
|
299 |
+
],
|
300 |
+
"ʂ": [
|
301 |
+
95
|
302 |
+
],
|
303 |
+
"ʃ": [
|
304 |
+
96
|
305 |
+
],
|
306 |
+
"ʄ": [
|
307 |
+
97
|
308 |
+
],
|
309 |
+
"ʈ": [
|
310 |
+
98
|
311 |
+
],
|
312 |
+
"ʉ": [
|
313 |
+
99
|
314 |
+
],
|
315 |
+
"ʊ": [
|
316 |
+
100
|
317 |
+
],
|
318 |
+
"ʋ": [
|
319 |
+
101
|
320 |
+
],
|
321 |
+
"ʌ": [
|
322 |
+
102
|
323 |
+
],
|
324 |
+
"ʍ": [
|
325 |
+
103
|
326 |
+
],
|
327 |
+
"ʎ": [
|
328 |
+
104
|
329 |
+
],
|
330 |
+
"ʏ": [
|
331 |
+
105
|
332 |
+
],
|
333 |
+
"ʐ": [
|
334 |
+
106
|
335 |
+
],
|
336 |
+
"ʑ": [
|
337 |
+
107
|
338 |
+
],
|
339 |
+
"ʒ": [
|
340 |
+
108
|
341 |
+
],
|
342 |
+
"ʔ": [
|
343 |
+
109
|
344 |
+
],
|
345 |
+
"ʕ": [
|
346 |
+
110
|
347 |
+
],
|
348 |
+
"ʘ": [
|
349 |
+
111
|
350 |
+
],
|
351 |
+
"ʙ": [
|
352 |
+
112
|
353 |
+
],
|
354 |
+
"ʛ": [
|
355 |
+
113
|
356 |
+
],
|
357 |
+
"ʜ": [
|
358 |
+
114
|
359 |
+
],
|
360 |
+
"ʝ": [
|
361 |
+
115
|
362 |
+
],
|
363 |
+
"ʟ": [
|
364 |
+
116
|
365 |
+
],
|
366 |
+
"ʡ": [
|
367 |
+
117
|
368 |
+
],
|
369 |
+
"ʢ": [
|
370 |
+
118
|
371 |
+
],
|
372 |
+
"ʲ": [
|
373 |
+
119
|
374 |
+
],
|
375 |
+
"ˈ": [
|
376 |
+
120
|
377 |
+
],
|
378 |
+
"ˌ": [
|
379 |
+
121
|
380 |
+
],
|
381 |
+
"ː": [
|
382 |
+
122
|
383 |
+
],
|
384 |
+
"ˑ": [
|
385 |
+
123
|
386 |
+
],
|
387 |
+
"˞": [
|
388 |
+
124
|
389 |
+
],
|
390 |
+
"β": [
|
391 |
+
125
|
392 |
+
],
|
393 |
+
"θ": [
|
394 |
+
126
|
395 |
+
],
|
396 |
+
"χ": [
|
397 |
+
127
|
398 |
+
],
|
399 |
+
"ᵻ": [
|
400 |
+
128
|
401 |
+
],
|
402 |
+
"ⱱ": [
|
403 |
+
129
|
404 |
+
]
|
405 |
+
},
|
406 |
+
"num_symbols": 130,
|
407 |
+
"num_speakers": 1,
|
408 |
+
"speaker_id_map": {}
|
409 |
+
}
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
git+https://github.com/coqui-ai/TTS@dev#egg=TTS
|
2 |
+
gradio
|
3 |
+
espeak-phonemizer>=1.1.0,<2
|
4 |
+
onnxruntime~=1.11.0
|