File size: 4,277 Bytes
5e8e534 fb0bb2b 5c3ce0e fb0bb2b 4a0fd18 5e8e534 fb0bb2b 5e8e534 fb0bb2b 4a0fd18 fb0bb2b 5e8e534 5c3ce0e fb0bb2b a3a5dd5 5e8e534 fb0bb2b 5e8e534 5c3ce0e 5e8e534 fb0bb2b 9b0e12a 5e8e534 9b0e12a fb0bb2b 9b0e12a fb0bb2b 5c3ce0e fb0bb2b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import os
import re
from pathlib import Path
import jieba
from omegaconf import OmegaConf
from ipa.convert_digits import parse_num
from ipa.proc_text import (
apply_v2f,
normalize_text,
prep_regex,
run_jieba,
update_jieba_dict,
)
# Load the IPA configuration file and convert it into plain Python objects.
ipa_configs = OmegaConf.to_object(OmegaConf.load("configs/ipa.yaml"))

# Every key named in "preserved_list" is removed from "v2f_dict" (keys that
# are not present are ignored), so the v2f regex below is built without them.
for key in ipa_configs["preserved_list"]:
    ipa_configs["v2f_dict"].pop(key, None)

# Regexes built once at import time; parse_hanzi_to_ipa uses all three.
delimiter_regex, replace_regex, v2f_regex = prep_regex(
    ipa_configs["delimiter_list"],
    ipa_configs["replace_dict"],
    ipa_configs["v2f_dict"],
)
def get_ipa(raw_text: str, dialect: str) -> tuple[str, str, str, list[str]]:
    """Convert text that mixes pinyin tokens and other text into IPA.

    ``raw_text`` is split around tokens made of lowercase ASCII letters
    followed by digits. Each such token is converted with
    ``parse_pinyin_to_ipa``; every other non-blank segment is handed to
    ``parse_hanzi_to_ipa`` together with ``dialect``.

    Returns:
        A 4-tuple ``(words, ipa, pinyin, missing_words)``.

        * When at least one IPA item was produced and nothing is missing,
          the first three items are strings: the collected items joined
          with single spaces, with ``" , "`` collapsed to ``","``.
        * Otherwise (no IPA produced, or something could not be converted)
          the first three items are returned as the raw lists. This does not
          match the declared return annotation, so callers should inspect
          ``missing_words`` / the IPA item before treating them as strings.
    """

    def _join(items: list[str]) -> str:
        # Space-join the items, then tighten the padding around commas.
        return " ".join(items).replace(" , ", ",")

    words_acc = []
    pinyin_acc = []
    ipa_acc = []
    missing_acc = []

    for segment in re.split(r"([a-z]+\d+)", raw_text):
        if not segment.strip():
            continue

        if re.search(r"[a-z]+\d+", segment) is None:
            # Not a pinyin token: delegate the whole segment.
            seg_words, seg_ipa, seg_pinyin, seg_missing = parse_hanzi_to_ipa(
                segment, dialect
            )
            words_acc.extend(seg_words)
            ipa_acc.extend(seg_ipa)
            pinyin_acc.extend(seg_pinyin)
            missing_acc.extend(seg_missing)
            continue

        # Pinyin token: it is recorded as both word and pinyin even when the
        # IPA lookup below fails.
        words_acc.append(segment)
        pinyin_acc.append(segment)
        syllable, tone_digits = re.match(r"([a-z]+)(\d+)?", segment).groups()
        tone_suffix = f"_{tone_digits}" if tone_digits else ""
        converted = parse_pinyin_to_ipa(syllable)
        if converted is None:
            missing_acc.append(syllable)
        else:
            ipa_acc.append(converted + tone_suffix)

    if not ipa_acc or missing_acc:
        # Failure / empty path: hand back the unjoined lists.
        return words_acc, ipa_acc, pinyin_acc, missing_acc

    return _join(words_acc), _join(ipa_acc), _join(pinyin_acc), missing_acc
def parse_ipa(
    ipa: str,
    delete_chars: str = r"\+\-\|\_",
    as_space: str = "",
) -> list[str]:
    """Split an IPA string into a flat list of symbols.

    Runs of digits are kept together as single items; all other text is
    emitted one character at a time.

    Args:
        ipa: The IPA string to split.
        delete_chars: Characters to remove, written as the *inside* of a
            regex character class (i.e. already escaped where necessary).
            An empty string disables deletion.
        as_space: Characters to turn into a space, in the same format as
            ``delete_chars``. Applied before deletion. An empty string
            disables the substitution.

    Returns:
        The list of symbols. Each comma is padded with spaces before the
        text is exploded, so ``","`` contributes the three items
        ``" "``, ``","``, ``" "``.
    """
    # Zero-width split at every boundary between a digit and a non-digit.
    # A string that starts or ends with a digit yields an empty chunk at
    # that end, which contributes nothing below.
    chunks = re.split(r"(?<![\d])(?=[\d])|(?<=[\d])(?![\d])", ipa)

    symbols: list[str] = []
    for chunk in chunks:
        if chunk.isdigit():
            symbols.append(chunk)
            continue
        if as_space:
            chunk = re.sub(f"[{as_space}]", " ", chunk)
        if delete_chars:
            chunk = re.sub(f"[{delete_chars}]", "", chunk)
        # Extending with a string adds one list item per character.
        symbols.extend(chunk.replace(",", " , "))
    return symbols
def parse_pinyin_to_ipa(pinyin: str)->str|None:
    """Look up the IPA form of a pinyin syllable.

    The syllable is looked up in ``ipa_configs["pinyin_to_ipa_dict"]``.
    The pieces of the matching entry are joined with ``"+"`` and any
    spaces in the result are replaced with ``"-"``.

    Returns:
        The joined IPA string, or ``None`` when ``pinyin`` is not a key of
        the table.
    """
    table = ipa_configs["pinyin_to_ipa_dict"]
    if pinyin in table:
        # NOTE(review): the entry is presumably a list of strings — confirm
        # against configs/ipa.yaml.
        return "+".join(table[pinyin]).replace(" ", "-")
    return None
def parse_hanzi_to_ipa(
    hanzi: str, dialect: str
) -> tuple[list[str], list[str], list[str], list[str]]:
    """Segment text and look each word up in the lexicon of ``dialect``.

    The text is passed through ``normalize_text`` and ``parse_num``, split
    on ``delimiter_regex`` and re-joined with commas, segmented with
    ``run_jieba``, passed through ``apply_v2f`` and segmented again. Each
    resulting word is then looked up in ``ipa_configs["lexicon"][dialect]``.

    Returns:
        ``(words, ipa, pinyin, missing_words)`` as four lists.

        * A ``","`` token is appended to ``words``, ``ipa`` and ``pinyin``.
        * A word found in the lexicon contributes itself to ``words`` and
          the first ``"pinyin"`` / ``"ipa"`` entry of its lexicon record.
        * A word not in the lexicon is appended to ``words`` and
          ``missing_words`` only, so the lists can differ in length.
    """
    lexicon = ipa_configs["lexicon"][dialect]

    # The lexicon keys are pushed into jieba's dictionary file on every call.
    jieba_dict_path = Path(os.path.dirname(jieba.__file__)) / "dict.txt"
    update_jieba_dict(list(lexicon.keys()), jieba_dict_path)

    normalized = parse_num(
        normalize_text(hanzi, ipa_configs["replace_dict"], replace_regex)
    )
    # Split on delimiters, drop blank pieces, and use "," as the only separator.
    pieces = []
    for piece in re.split(delimiter_regex, normalized):
        if piece.strip():
            pieces.append(piece.strip())
    normalized = ",".join(pieces)

    # Segment, apply the v2f mapping, then segment the mapped text again.
    segmented = run_jieba(normalized)
    segmented = apply_v2f(segmented, ipa_configs["v2f_dict"], v2f_regex)
    segmented = run_jieba("".join(segmented))

    final_words = []
    final_pinyin = []
    final_ipa = []
    missing_words = []
    for token in segmented:
        if not token.strip():
            continue

        if token == ",":
            final_words.append(",")
            final_pinyin.append(",")
            final_ipa.append(",")
            continue

        if token not in lexicon:
            final_words.append(token)
            missing_words.append(token)
            continue

        entry = lexicon[token]
        final_words.append(f"{token}")
        final_pinyin.append(entry["pinyin"][0])
        # NOTE: only the first ipa entry of lexicon[word] is considered.
        final_ipa.append(entry["ipa"][0])

    return final_words, final_ipa, final_pinyin, missing_words
|