Spaces:
Running
on
T4
Running
on
T4
File size: 3,175 Bytes
4300fed |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
import pickle
import os
import re
from . import symbols
from .es_phonemizer import cleaner as es_cleaner
from .es_phonemizer import es_to_ipa
from transformers import AutoTokenizer
def distribute_phone(n_phone, n_word):
phones_per_word = [0] * n_word
for task in range(n_phone):
min_tasks = min(phones_per_word)
min_index = phones_per_word.index(min_tasks)
phones_per_word[min_index] += 1
return phones_per_word
def text_normalize(text):
text = es_cleaner.spanish_cleaners(text)
return text
def post_replace_ph(ph):
rep_map = {
":": ",",
";": ",",
",": ",",
"。": ".",
"!": "!",
"?": "?",
"\n": ".",
"·": ",",
"、": ",",
"...": "…"
}
if ph in rep_map.keys():
ph = rep_map[ph]
if ph in symbols:
return ph
if ph not in symbols:
ph = "UNK"
return ph
def refine_ph(phn):
tone = 0
if re.search(r"\d$", phn):
tone = int(phn[-1]) + 1
phn = phn[:-1]
return phn.lower(), tone
def refine_syllables(syllables):
tones = []
phonemes = []
for phn_list in syllables:
for i in range(len(phn_list)):
phn = phn_list[i]
phn, tone = refine_ph(phn)
phonemes.append(phn)
tones.append(tone)
return phonemes, tones
# model_id = 'bert-base-uncased'
model_id = 'dccuchile/bert-base-spanish-wwm-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_id)
def g2p(text, pad_start_end=True, tokenized=None):
if tokenized is None:
tokenized = tokenizer.tokenize(text)
# import pdb; pdb.set_trace()
phs = []
ph_groups = []
for t in tokenized:
if not t.startswith("#"):
ph_groups.append([t])
else:
ph_groups[-1].append(t.replace("#", ""))
phones = []
tones = []
word2ph = []
# print(ph_groups)
for group in ph_groups:
w = "".join(group)
phone_len = 0
word_len = len(group)
if w == '[UNK]':
phone_list = ['UNK']
else:
phone_list = list(filter(lambda p: p != " ", es_to_ipa.es2ipa(w)))
for ph in phone_list:
phones.append(ph)
tones.append(0)
phone_len += 1
aaa = distribute_phone(phone_len, word_len)
word2ph += aaa
# print(phone_list, aaa)
# print('=' * 10)
if pad_start_end:
phones = ["_"] + phones + ["_"]
tones = [0] + tones + [0]
word2ph = [1] + word2ph + [1]
return phones, tones, word2ph
def get_bert_feature(text, word2ph, device=None):
from text import spanish_bert
return spanish_bert.get_bert_feature(text, word2ph, device=device)
if __name__ == "__main__":
text = "en nuestros tiempos estos dos pueblos ilustres empiezan a curarse, gracias sólo a la sana y vigorosa higiene de 1789."
# print(text)
text = text_normalize(text)
print(text)
phones, tones, word2ph = g2p(text)
bert = get_bert_feature(text, word2ph)
print(phones)
print(len(phones), tones, sum(word2ph), bert.shape)
|