import os
import re
from pathlib import Path

import jieba
from omegaconf import OmegaConf

from ipa.convert_digits import parse_num
from ipa.proc_text import (
    apply_v2f,
    normalize_text,
    prep_regex,
    run_jieba,
    update_jieba_dict,
)

ipa_configs = OmegaConf.to_object(OmegaConf.load("configs/ipa.yaml"))
# Words on the preserved list keep their original form, so drop them
# from the v2f mapping before the regexes are built.
for key in ipa_configs["preserved_list"]:
    ipa_configs["v2f_dict"].pop(key, None)
delimiter_regex, replace_regex, v2f_regex = prep_regex(
    ipa_configs["delimiter_list"], ipa_configs["replace_dict"], ipa_configs["v2f_dict"]
)
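
# NOTE: the keys accessed in this module suggest configs/ipa.yaml has roughly
# the shape sketched below. This is inferred from usage here, not a documented
# schema:
#
#   preserved_list: [...]       # words exempt from v2f substitution
#   v2f_dict: {...}             # character mapping applied by apply_v2f
#   delimiter_list: [...]       # delimiters used to split input text
#   replace_dict: {...}         # text-normalization replacements
#   pinyin_to_ipa_dict: {...}   # pinyin syllable -> list of IPA segments
#   lexicon:
#     <dialect>: {<word>: {pinyin: [...], ipa: [...]}}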


def get_ipa(
    raw_text: str, dialect: str
) -> tuple[str | list[str], str | list[str], str | list[str], list[str]]:
    """Convert a mixed hanzi/inline-pinyin string to words, IPA, and pinyin.

    On success the first three values are space-joined strings; if nothing was
    parsed or some words are missing from the lexicon, the intermediate lists
    are returned instead so the caller can inspect what failed.
    """
    # The capture group keeps inline pinyin syllables (e.g. "ni3") as segments.
    pinyin_split = re.split(r"([a-z]+\d+)", raw_text)

    final_words = []
    final_pinyin = []
    final_ipa = []
    final_missing_words = []
    for hanzi_or_pinyin in pinyin_split:
        if len(hanzi_or_pinyin.strip()) == 0:
            continue

        if re.search(r"[a-z]+\d+", hanzi_or_pinyin):
            final_words.append(hanzi_or_pinyin)
            final_pinyin.append(hanzi_or_pinyin)
            pinyin, tone = re.match(r"([a-z]+)(\d+)?", hanzi_or_pinyin).groups()
            tone = f"_{tone}" if tone else ""

            ipa = parse_pinyin_to_ipa(pinyin)
            if ipa is None:
                final_missing_words.append(pinyin)
                continue

            final_ipa.append(ipa + tone)
        else:
            words, ipa, pinyin, missing_words = parse_hanzi_to_ipa(
                hanzi_or_pinyin, dialect
            )
            final_words.extend(words)
            final_ipa.extend(ipa)
            final_pinyin.extend(pinyin)
            final_missing_words.extend(missing_words)

    if len(final_ipa) == 0 or len(final_missing_words) > 0:
        return final_words, final_ipa, final_pinyin, final_missing_words

    final_words = " ".join(final_words).replace(" , ", ",")
    final_ipa = " ".join(final_ipa).replace(" , ", ",")
    final_pinyin = " ".join(final_pinyin).replace(" , ", ",")

    return final_words, final_ipa, final_pinyin, final_missing_words
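
# Usage sketch (illustrative; actual output depends on the configured lexicon
# and the dialect names defined in configs/ipa.yaml):
#
#   words, ipa, pinyin, missing = get_ipa("你好 ni3", dialect="<dialect>")
#   if missing:
#       ...  # some words had no lexicon/pinyin entry; lists are returned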


def parse_ipa(ipa: str, delete_chars: str = r"\+\-\|_", as_space: str = "") -> list[str]:
    text = []

    # Split at every digit boundary so tone numbers become standalone tokens.
    ipa_list = re.split(r"(?<![\d])(?=[\d])|(?<=[\d])(?![\d])", ipa)
    for word in ipa_list:
        if word.isdigit():
            text.append(word)
        else:
            if len(as_space) > 0:
                word = re.sub(r"[{}]".format(as_space), " ", word)
            if len(delete_chars) > 0:
                word = re.sub(r"[{}]".format(delete_chars), "", word)

            word = word.replace(",", " , ")
            # Strings are iterable, so extend() appends one character per token.
            text.extend(word)

    return text
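
# For instance, with the default delete_chars (illustrative input):
#
#   parse_ipa("n+i_3")  ->  ["n", "i", "3"]
#
# "+" and "_" are deleted, and the trailing tone digit becomes its own token.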


def parse_pinyin_to_ipa(pinyin: str) -> str | None:
    if pinyin not in ipa_configs["pinyin_to_ipa_dict"]:
        return None

    ipa_dict_result = ipa_configs["pinyin_to_ipa_dict"][pinyin]
    ipa = "+".join(ipa_dict_result).replace(" ", "-")
    return ipa
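
# Illustration (hypothetical table entry): if pinyin_to_ipa_dict maps "ni"
# to ["n i"], parse_pinyin_to_ipa("ni") returns "n-i": entries are joined
# with "+", and spaces within an entry become "-".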


def parse_hanzi_to_ipa(
    hanzi: str, dialect: str
) -> tuple[list[str], list[str], list[str], list[str]]:
    lexicon = ipa_configs["lexicon"][dialect]
    # Register lexicon entries with jieba so segmentation follows known words.
    update_jieba_dict(
        list(lexicon.keys()), Path(os.path.dirname(jieba.__file__)) / "dict.txt"
    )

    text = normalize_text(hanzi, ipa_configs["replace_dict"], replace_regex)
    text = parse_num(text)
    text_parts = [s.strip() for s in re.split(delimiter_regex, text) if s.strip()]
    text = ",".join(text_parts)
    word_list = run_jieba(text)
    word_list = apply_v2f(word_list, ipa_configs["v2f_dict"], v2f_regex)
    # Re-segment after v2f substitution, which can shift word boundaries.
    word_list = run_jieba("".join(word_list))

    final_words = []
    final_pinyin = []
    final_ipa = []
    missing_words = []
    for word in word_list:
        if not word.strip():
            continue
        if word == ",":
            final_words.append(",")
            final_pinyin.append(",")
            final_ipa.append(",")
        elif word not in lexicon:
            final_words.append(word)
            missing_words.append(word)
        else:
            final_words.append(word)
            # Use the first listed pronunciation as the default reading.
            final_pinyin.append(lexicon[word]["pinyin"][0])
            final_ipa.append(lexicon[word]["ipa"][0])

    return final_words, final_ipa, final_pinyin, missing_words
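

# A minimal end-to-end sketch of how these helpers compose. The dialect name
# and input text below are placeholders; valid values depend on configs/ipa.yaml.
if __name__ == "__main__":
    words, ipa, pinyin, missing = get_ipa("你好 ni3 hao3", dialect="example_dialect")
    if missing:
        # Some words were not found in the lexicon or pinyin table.
        print("missing:", missing)
    else:
        print(words, ipa, pinyin, sep="\n")
        # Tokenize the joined IPA string into per-symbol tokens.
        print(parse_ipa(ipa))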