Podcastify / melo /text /spanish.py
mrfakename's picture
Init
4300fed
raw
history blame
3.18 kB
import pickle
import os
import re
from . import symbols
from .es_phonemizer import cleaner as es_cleaner
from .es_phonemizer import es_to_ipa
from transformers import AutoTokenizer
def distribute_phone(n_phone, n_word):
phones_per_word = [0] * n_word
for task in range(n_phone):
min_tasks = min(phones_per_word)
min_index = phones_per_word.index(min_tasks)
phones_per_word[min_index] += 1
return phones_per_word
def text_normalize(text):
text = es_cleaner.spanish_cleaners(text)
return text
def post_replace_ph(ph):
rep_map = {
":": ",",
";": ",",
",": ",",
"。": ".",
"!": "!",
"?": "?",
"\n": ".",
"·": ",",
"、": ",",
"...": "…"
}
if ph in rep_map.keys():
ph = rep_map[ph]
if ph in symbols:
return ph
if ph not in symbols:
ph = "UNK"
return ph
def refine_ph(phn):
tone = 0
if re.search(r"\d$", phn):
tone = int(phn[-1]) + 1
phn = phn[:-1]
return phn.lower(), tone
def refine_syllables(syllables):
tones = []
phonemes = []
for phn_list in syllables:
for i in range(len(phn_list)):
phn = phn_list[i]
phn, tone = refine_ph(phn)
phonemes.append(phn)
tones.append(tone)
return phonemes, tones
# model_id = 'bert-base-uncased'
model_id = 'dccuchile/bert-base-spanish-wwm-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_id)
def g2p(text, pad_start_end=True, tokenized=None):
if tokenized is None:
tokenized = tokenizer.tokenize(text)
# import pdb; pdb.set_trace()
phs = []
ph_groups = []
for t in tokenized:
if not t.startswith("#"):
ph_groups.append([t])
else:
ph_groups[-1].append(t.replace("#", ""))
phones = []
tones = []
word2ph = []
# print(ph_groups)
for group in ph_groups:
w = "".join(group)
phone_len = 0
word_len = len(group)
if w == '[UNK]':
phone_list = ['UNK']
else:
phone_list = list(filter(lambda p: p != " ", es_to_ipa.es2ipa(w)))
for ph in phone_list:
phones.append(ph)
tones.append(0)
phone_len += 1
aaa = distribute_phone(phone_len, word_len)
word2ph += aaa
# print(phone_list, aaa)
# print('=' * 10)
if pad_start_end:
phones = ["_"] + phones + ["_"]
tones = [0] + tones + [0]
word2ph = [1] + word2ph + [1]
return phones, tones, word2ph
def get_bert_feature(text, word2ph, device=None):
from text import spanish_bert
return spanish_bert.get_bert_feature(text, word2ph, device=device)
if __name__ == "__main__":
text = "en nuestros tiempos estos dos pueblos ilustres empiezan a curarse, gracias sólo a la sana y vigorosa higiene de 1789."
# print(text)
text = text_normalize(text)
print(text)
phones, tones, word2ph = g2p(text)
bert = get_bert_feature(text, word2ph)
print(phones)
print(len(phones), tones, sum(word2ph), bert.shape)