Spaces:
Runtime error
Runtime error
NTT123
committed on
Commit
•
3dbfd73
1
Parent(s):
2157b01
Update the Tacotron model to use phonemes instead of raw text.
Browse files
- .gitattributes +1 -6
- .gitignore +1 -0
- alphabet.txt +26 -10
- app.py +5 -1
- inference.py +9 -0
- install_espeak_ng.sh +10 -0
- packages.txt +6 -0
- requirements.txt +2 -1
- tacotron.py +6 -2
- tacotron.toml +1 -0
- pretrained_model_ljs_600k.ckpt → tacotrons_ljs_24k_v1_0250000.ckpt +2 -2
.gitattributes
CHANGED
@@ -27,9 +27,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
28 |
bazelisk-linux-amd64 filter=lfs diff=lfs merge=lfs -text
|
29 |
wavegru_mod.so filter=lfs diff=lfs merge=lfs -text
|
30 |
-
|
31 |
-
wavegru_vocoder_1024_v3_1310000.ckpt filter=lfs diff=lfs merge=lfs -text
|
32 |
-
wavegru_vocoder_1024_v3_1330000.ckpt filter=lfs diff=lfs merge=lfs -text
|
33 |
-
wavegru_vocoder_1024_v3_1340000.ckpt filter=lfs diff=lfs merge=lfs -text
|
34 |
-
wavegru_vocoder_1024_v3_1360000.ckpt filter=lfs diff=lfs merge=lfs -text
|
35 |
-
wavegru_vocoder_1024_v3_1400000.ckpt filter=lfs diff=lfs merge=lfs -text
|
|
|
27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
28 |
bazelisk-linux-amd64 filter=lfs diff=lfs merge=lfs -text
|
29 |
wavegru_mod.so filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
.venv
|
alphabet.txt
CHANGED
@@ -1,25 +1,18 @@
|
|
1 |
_
|
|
|
2 |
|
3 |
!
|
4 |
"
|
5 |
-
'
|
6 |
-
(
|
7 |
-
)
|
8 |
,
|
9 |
-
-
|
10 |
.
|
11 |
:
|
12 |
;
|
13 |
?
|
14 |
-
[
|
15 |
-
]
|
16 |
a
|
17 |
b
|
18 |
-
c
|
19 |
d
|
20 |
e
|
21 |
f
|
22 |
-
g
|
23 |
h
|
24 |
i
|
25 |
j
|
@@ -29,7 +22,6 @@ m
|
|
29 |
n
|
30 |
o
|
31 |
p
|
32 |
-
q
|
33 |
r
|
34 |
s
|
35 |
t
|
@@ -37,5 +29,29 @@ u
|
|
37 |
v
|
38 |
w
|
39 |
x
|
40 |
-
y
|
41 |
z
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
_
|
2 |
+
■
|
3 |
|
4 |
!
|
5 |
"
|
|
|
|
|
|
|
6 |
,
|
|
|
7 |
.
|
8 |
:
|
9 |
;
|
10 |
?
|
|
|
|
|
11 |
a
|
12 |
b
|
|
|
13 |
d
|
14 |
e
|
15 |
f
|
|
|
16 |
h
|
17 |
i
|
18 |
j
|
|
|
22 |
n
|
23 |
o
|
24 |
p
|
|
|
25 |
r
|
26 |
s
|
27 |
t
|
|
|
29 |
v
|
30 |
w
|
31 |
x
|
|
|
32 |
z
|
33 |
+
æ
|
34 |
+
ð
|
35 |
+
ŋ
|
36 |
+
ɐ
|
37 |
+
ɑ
|
38 |
+
ɔ
|
39 |
+
ə
|
40 |
+
ɚ
|
41 |
+
ɛ
|
42 |
+
ɜ
|
43 |
+
ɡ
|
44 |
+
ɪ
|
45 |
+
ɹ
|
46 |
+
ɾ
|
47 |
+
ʃ
|
48 |
+
ʊ
|
49 |
+
ʌ
|
50 |
+
ʒ
|
51 |
+
ʔ
|
52 |
+
ˈ
|
53 |
+
ˌ
|
54 |
+
ː
|
55 |
+
̩
|
56 |
+
θ
|
57 |
+
ᵻ
|
app.py
CHANGED
@@ -3,6 +3,10 @@
|
|
3 |
# os.system("./bazelisk-linux-amd64 clean --expunge")
|
4 |
# os.system("./bazelisk-linux-amd64 build wavegru_mod -c opt --copt=-march=native")
|
5 |
|
|
|
|
|
|
|
|
|
6 |
|
7 |
import gradio as gr
|
8 |
from inference import load_tacotron_model, load_wavegru_net, mel_to_wav, text_to_mel
|
@@ -11,7 +15,7 @@ from wavegru_cpp import extract_weight_mask, load_wavegru_cpp
|
|
11 |
|
12 |
def speak(text):
|
13 |
alphabet, tacotron_net, tacotron_config = load_tacotron_model(
|
14 |
-
"./alphabet.txt", "./tacotron.toml", "./
|
15 |
)
|
16 |
|
17 |
wavegru_config, wavegru_net = load_wavegru_net(
|
|
|
3 |
# os.system("./bazelisk-linux-amd64 clean --expunge")
|
4 |
# os.system("./bazelisk-linux-amd64 build wavegru_mod -c opt --copt=-march=native")
|
5 |
|
6 |
+
# install espeak
|
7 |
+
import os
|
8 |
+
|
9 |
+
os.system("bash ./install_espeak_ng.sh")
|
10 |
|
11 |
import gradio as gr
|
12 |
from inference import load_tacotron_model, load_wavegru_net, mel_to_wav, text_to_mel
|
|
|
15 |
|
16 |
def speak(text):
|
17 |
alphabet, tacotron_net, tacotron_config = load_tacotron_model(
|
18 |
+
"./alphabet.txt", "./tacotron.toml", "./tacotrons_ljs_24k_v1_0250000.ckpt"
|
19 |
)
|
20 |
|
21 |
wavegru_config, wavegru_net = load_wavegru_net(
|
inference.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
import jax
|
2 |
import jax.numpy as jnp
|
3 |
import librosa
|
@@ -14,6 +16,11 @@ from utils import (
|
|
14 |
)
|
15 |
from wavegru import WaveGRU
|
16 |
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
def load_tacotron_model(alphabet_file, config_file, model_file):
|
19 |
"""load tacotron model to memory"""
|
@@ -34,6 +41,8 @@ tacotron_inference_fn = pax.pure(lambda net, text: net.inference(text, max_len=2
|
|
34 |
def text_to_mel(net, text, alphabet, config):
|
35 |
"""convert text to mel spectrogram"""
|
36 |
text = english_cleaners(text)
|
|
|
|
|
37 |
text = text + config["PAD"] * (100 - (len(text) % 100))
|
38 |
tokens = []
|
39 |
for c in text:
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
import jax
|
4 |
import jax.numpy as jnp
|
5 |
import librosa
|
|
|
16 |
)
|
17 |
from wavegru import WaveGRU
|
18 |
|
19 |
+
os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = "./espeak/usr/lib/libespeak-ng.so.1.1.51"
|
20 |
+
from phonemizer.backend import EspeakBackend
|
21 |
+
|
22 |
+
backend = EspeakBackend("en-us", preserve_punctuation=True, with_stress=True)
|
23 |
+
|
24 |
|
25 |
def load_tacotron_model(alphabet_file, config_file, model_file):
|
26 |
"""load tacotron model to memory"""
|
|
|
41 |
def text_to_mel(net, text, alphabet, config):
|
42 |
"""convert text to mel spectrogram"""
|
43 |
text = english_cleaners(text)
|
44 |
+
text = backend.phonemize([text], strip=True)[0]
|
45 |
+
text = text + config["END_CHARACTER"]
|
46 |
text = text + config["PAD"] * (100 - (len(text) % 100))
|
47 |
tokens = []
|
48 |
for c in text:
|
install_espeak_ng.sh
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
rm -rf espeak
|
2 |
+
mkdir -p espeak
|
3 |
+
cd espeak
|
4 |
+
wget https://github.com/espeak-ng/espeak-ng/archive/refs/tags/1.51.zip
|
5 |
+
unzip -qq 1.51.zip
|
6 |
+
cd espeak-ng-1.51
|
7 |
+
./autogen.sh
|
8 |
+
./configure --prefix=`pwd`/../usr
|
9 |
+
make
|
10 |
+
make install
|
packages.txt
CHANGED
@@ -1 +1,7 @@
|
|
1 |
libsndfile1-dev
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
libsndfile1-dev
|
2 |
+
make
|
3 |
+
autoconf
|
4 |
+
automake
|
5 |
+
libtool
|
6 |
+
pkg-config
|
7 |
+
gcc
|
requirements.txt
CHANGED
@@ -8,4 +8,5 @@ numpy==1.22.3
|
|
8 |
pax3==0.5.6
|
9 |
pyyaml==6.0
|
10 |
toml==0.10.2
|
11 |
-
unidecode==1.3.4
|
|
|
|
8 |
pax3==0.5.6
|
9 |
pyyaml==6.0
|
10 |
toml==0.10.2
|
11 |
+
unidecode==1.3.4
|
12 |
+
phonemizer==3.1.1
|
tacotron.py
CHANGED
@@ -371,7 +371,10 @@ class Tacotron(pax.Module):
|
|
371 |
x = x[:, : self.rr, :]
|
372 |
x = jnp.reshape(x, (N, self.rr, -1))
|
373 |
mel = x[..., :-1]
|
374 |
-
|
|
|
|
|
|
|
375 |
return attn_state, decoder_rnn_states, rng_key, (mel, eos)
|
376 |
|
377 |
def inference(self, text, seed=42, max_len=1000):
|
@@ -381,6 +384,7 @@ class Tacotron(pax.Module):
|
|
381 |
text = self.encode_text(text)
|
382 |
text_key = self.text_key_fc(text)
|
383 |
N, L, D = text.shape
|
|
|
384 |
mel = self.go_frame(N)
|
385 |
|
386 |
attn_state, decoder_rnn_states = self.decoder_initial_state(N, L)
|
@@ -393,7 +397,7 @@ class Tacotron(pax.Module):
|
|
393 |
attn_state, decoder_rnn_states, rng_key, mel, text, text_key
|
394 |
)
|
395 |
mels.append(mel)
|
396 |
-
if eos
|
397 |
break
|
398 |
|
399 |
mel = mel[:, -1, :]
|
|
|
371 |
x = x[:, : self.rr, :]
|
372 |
x = jnp.reshape(x, (N, self.rr, -1))
|
373 |
mel = x[..., :-1]
|
374 |
+
eos_logit = x[..., -1]
|
375 |
+
eos_pr = jax.nn.sigmoid(eos_logit[0, -1])
|
376 |
+
rng_key, eos_rng_key = jax.random.split(rng_key)
|
377 |
+
eos = jax.random.bernoulli(eos_rng_key, p=eos_pr)
|
378 |
return attn_state, decoder_rnn_states, rng_key, (mel, eos)
|
379 |
|
380 |
def inference(self, text, seed=42, max_len=1000):
|
|
|
384 |
text = self.encode_text(text)
|
385 |
text_key = self.text_key_fc(text)
|
386 |
N, L, D = text.shape
|
387 |
+
assert N == 1
|
388 |
mel = self.go_frame(N)
|
389 |
|
390 |
attn_state, decoder_rnn_states = self.decoder_initial_state(N, L)
|
|
|
397 |
attn_state, decoder_rnn_states, rng_key, mel, text, text_key
|
398 |
)
|
399 |
mels.append(mel)
|
400 |
+
if eos.item() or count > max_len:
|
401 |
break
|
402 |
|
403 |
mel = mel[:, -1, :]
|
tacotron.toml
CHANGED
@@ -16,6 +16,7 @@ MEL_DIM = 80 # the dimension of melspectrogram features
|
|
16 |
MEL_MIN = 1e-5
|
17 |
PAD = "_" # padding character
|
18 |
PAD_TOKEN = 0
|
|
|
19 |
TEST_DATA_SIZE = 1024
|
20 |
|
21 |
# model
|
|
|
16 |
MEL_MIN = 1e-5
|
17 |
PAD = "_" # padding character
|
18 |
PAD_TOKEN = 0
|
19 |
+
END_CHARACTER = "■" # to signal the end of the transcript
|
20 |
TEST_DATA_SIZE = 1024
|
21 |
|
22 |
# model
|
pretrained_model_ljs_600k.ckpt → tacotrons_ljs_24k_v1_0250000.ckpt
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:512b3af6ef95ccc53d3516256abae81b025e110fa886ec68f9f7033039013fc6
|
3 |
+
size 53561547
|