|
|
|
import re
import sys

import pyopenjtalk

# Prefer the package-style import; fall back to a top-level ``symbols`` module
# (presumably for running this file directly from its own directory -- confirm).
# Only ImportError triggers the fallback, so genuine errors raised while
# importing ``text`` are no longer silently masked.
try:
    from text import symbols
except ImportError:
    from symbols import symbols
|
|
|
# Matches ONE character that is sent to the G2P engine: ASCII letters, digits,
# the iteration mark U+3005, hiragana/katakana (U+3040-U+30FF), CJK unified
# ideographs (U+4E00-U+9FFF), full-width digits/letters and half-width
# katakana (U+FF66-U+FF9D).
_japanese_characters = re.compile(
    r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)
|
|
|
|
|
# Matches ONE character that is NOT sent to the G2P engine (punctuation,
# whitespace, any other symbol). The class is the exact complement of the
# letters/digits/kana/kanji class used for Japanese text in this module.
_japanese_marks = re.compile(
    r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)
|
|
|
|
|
# (compiled pattern, replacement) pairs: symbols that are rewritten as Japanese
# words before the text is split and converted to phonemes.
# The full-width percent sign is spelled out as its katakana reading.
_symbols_to_japanese = [
    (re.compile(pattern), reading)
    for pattern, reading in [("％", "パーセント")]
]
|
|
|
|
|
|
|
# (compiled pattern, replacement) pairs that rewrite the sokuon marker "Q"
# according to the consonant that follows it. Optional pitch arrows between
# "Q" and the consonant are captured and kept.
_real_sokuon = [
    (re.compile(pattern), replacement)
    for pattern, replacement in [
        (r"Q([↑↓]*[kg])", r"k#\1"),
        (r"Q([↑↓]*[tdjʧ])", r"t#\1"),
        (r"Q([↑↓]*[sʃ])", r"s\1"),
        (r"Q([↑↓]*[pb])", r"p#\1"),
    ]
]
|
|
|
|
|
# (compiled pattern, replacement) pairs that rewrite the hatsuon marker "N"
# according to the consonant that follows it. Optional pitch arrows between
# "N" and the consonant are captured and kept.
_real_hatsuon = [
    (re.compile(pattern), replacement)
    for pattern, replacement in [
        (r"N([↑↓]*[pbm])", r"m\1"),
        (r"N([↑↓]*[ʧʥj])", r"n^\1"),
        (r"N([↑↓]*[tdn])", r"n\1"),
        (r"N([↑↓]*[kg])", r"ŋ\1"),
    ]
]
|
|
|
|
|
def post_replace_ph(ph):
    """Normalize one token and check it against the symbol inventory.

    Full-width / Japanese punctuation (and a newline or a literal "...") is
    first mapped to its ASCII or ellipsis counterpart. The result is returned
    if it is a member of ``symbols``; anything else becomes ``"UNK"``.

    Args:
        ph: A single phoneme or punctuation token.

    Returns:
        The (possibly replaced) token, or ``"UNK"`` when it is not in
        ``symbols``.
    """
    # Each key must be a distinct character: the full-width punctuation marks
    # below differ only in their last UTF-8 byte, so a mis-decoded file
    # collapses them into duplicate keys and loses most of the mappings.
    rep_map = {
        "：": ",",
        "；": ",",
        "，": ",",
        "。": ".",
        "！": "!",
        "？": "?",
        "\n": ".",
        "·": ",",
        "、": ",",
        "...": "…",
    }
    ph = rep_map.get(ph, ph)
    return ph if ph in symbols else "UNK"
|
|
|
|
|
def symbols_to_japanese(text):
    """Apply every substitution in ``_symbols_to_japanese`` to ``text``.

    Args:
        text: Input string.

    Returns:
        The string after all (pattern, replacement) pairs have been applied
        in list order.
    """
    for regex, replacement in _symbols_to_japanese:
        text = regex.sub(replacement, text)
    return text
|
|
|
|
|
def preprocess_jap(text, with_prosody=False):
    """Convert raw Japanese text into a flat list of phoneme/mark tokens.

    Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html

    The text is split on every character matched by ``_japanese_marks``. Each
    segment that starts with a ``_japanese_characters`` match is converted to
    phonemes, and the separator that followed it is appended as its own token.

    Args:
        text: Input string.
        with_prosody: When true, segments go through
            ``pyopenjtalk_g2p_prosody`` and the first and last returned
            symbols are dropped; otherwise ``pyopenjtalk.g2p`` is used and
            its output is split on single spaces.

    Returns:
        list[str]: Phoneme tokens interleaved with the separator characters
        found in the input. Separators that are a plain space are omitted.
    """
    text = symbols_to_japanese(text)
    sentences = _japanese_marks.split(text)
    marks = _japanese_marks.findall(text)

    phones = []
    for i, sentence in enumerate(sentences):
        if _japanese_characters.match(sentence):
            if with_prosody:
                # [1:-1] drops the first and last symbol of the prosody
                # sequence (normally the "^" start and "$"/"?" end markers).
                phones += pyopenjtalk_g2p_prosody(sentence)[1:-1]
            else:
                phones += pyopenjtalk.g2p(sentence).split(" ")

        # split() yields one more segment than there are separators, so the
        # final segment has no mark following it.
        if i < len(marks):
            mark = marks[i]
            if mark == " ":
                continue
            # ``mark`` is exactly one non-space character here, so it is
            # appended unchanged.
            phones.append(mark)
    return phones
|
|
|
|
|
def text_normalize(text):
    """Return ``text`` unchanged.

    No normalization is performed for Japanese input; the function is an
    identity hook.
    """
    return text
|
|
|
|
|
def pyopenjtalk_g2p_prosody(text: str, drop_unvoiced_vowels: bool = True):
    """Extract phoneme + prosody symbol sequence from input full-context labels.

    The algorithm is based on `Prosodic features control by symbols as input of
    sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.

    Args:
        text (str): Input text.
        drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.

    Returns:
        List[str]: List of phoneme + prosody symbols.

    Examples:
        >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
        >>> pyopenjtalk_g2p_prosody("こんにちは。")
        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']

    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104

    """
    labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
    N = len(labels)

    phones = []
    for n in range(N):
        lab_curr = labels[n]

        # Current phoneme: the field between "-" and "+" in the label.
        p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)

        # Uppercase vowels are folded to lowercase, i.e. unvoiced vowels are
        # treated as ordinary vowels.
        if drop_unvoiced_vowels and p3 in "AEIOU":
            p3 = p3.lower()

        # "sil" is only expected as the first or the last label.
        if p3 == "sil":
            assert n == 0 or n == N - 1
            if n == 0:
                phones.append("^")
            elif n == N - 1:
                # The "!<digit>_" field selects the end marker:
                # 0 -> "$", 1 -> "?"; any other value adds nothing.
                e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
                if e3 == 0:
                    phones.append("$")
                elif e3 == 1:
                    phones.append("?")
            continue
        elif p3 == "pau":
            phones.append("_")
            continue
        else:
            phones.append(p3)

        # Accent type and position info (forward and backward) from /A:.
        a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
        a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
        a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)

        # Number of morae in the accent phrase, from /F:.
        f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)

        # Forward position of the NEXT label. Reaching this line on the last
        # label would raise IndexError; in practice the last label is "sil"
        # and is handled by the ``continue`` above.
        a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])

        # Accent phrase border.
        if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
            phones.append("#")
        # Pitch falling.
        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
            phones.append("]")
        # Pitch rising.
        elif a2 == 1 and a2_next == 2:
            phones.append("[")

    return phones
|
|
|
|
|
def _numeric_feature_by_regex(regex, s):
    """Return the first capture group of ``regex`` in ``s`` as an int.

    Args:
        regex: Pattern with at least one capture group that matches an
            integer literal.
        s: String to search.

    Returns:
        int: The parsed value of group 1, or the sentinel ``-50`` when the
        pattern does not occur in ``s``.
    """
    match = re.search(regex, s)
    return -50 if match is None else int(match.group(1))
|
|
|
|
|
def g2p(norm_text, with_prosody=True):
    """Convert text to a list of tokens restricted to the symbol inventory.

    Args:
        norm_text: Input text, passed to ``preprocess_jap``.
        with_prosody: Forwarded to ``preprocess_jap``.

    Returns:
        list: Each token produced by ``preprocess_jap`` after it has been
        passed through ``post_replace_ph``.
    """
    phones = preprocess_jap(norm_text, with_prosody)
    return [post_replace_ph(ph) for ph in phones]
|
|
|
|
|
if __name__ == "__main__": |
|
phones = g2p("γγγ«γ‘γ―, hello, AKITOγ§γ,γγγγγι‘γγγΎγγοΌ") |
|
print(phones) |