txya900619
committed on
Commit
·
9b0e12a
1
Parent(s):
08c0111
feat: update parse_ipa to fit new lexicon format
Browse files- ipa/ipa.py +15 -20
ipa/ipa.py
CHANGED
@@ -63,26 +63,21 @@ def get_ipa(raw_text, dialect):
|
|
63 |
|
64 |
return final_words, final_ipa, final_pinyin, missing_words
|
65 |
|
66 |
-
def parse_ipa(ipa: str):
|
67 |
text = []
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
elif phoneme_with_tone == ",":
|
75 |
-
text.extend(" , ")
|
76 |
-
continue
|
77 |
-
elif phoneme_with_tone == "-": # use " " split 詞 (or use " " to split 字)
|
78 |
-
continue
|
79 |
-
|
80 |
-
split_phoneme_and_tone = phoneme_with_tone.split("_")
|
81 |
-
|
82 |
-
if len(split_phoneme_and_tone) == 2:
|
83 |
-
phoneme, tone = split_phoneme_and_tone
|
84 |
-
text.extend(phoneme)
|
85 |
-
text.append(tone)
|
86 |
else:
|
87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
return text
|
|
|
63 |
|
64 |
return final_words, final_ipa, final_pinyin, missing_words
|
65 |
|
66 |
+
def parse_ipa(
    ipa: str,
    delete_chars: str = r"\+\-\|\_",
    as_space: str = "",
) -> list:
    """Split an IPA string into a flat list of symbol and digit-run tokens.

    The input is cut at every boundary between a run of digits and a run of
    non-digits. A digit run is kept whole as one token (presumably a tone
    number -- confirm against the lexicon format). Every other segment is
    cleaned up and then emitted one character per token.

    Args:
        ipa: The IPA transcription to tokenize.
        delete_chars: Characters to strip from non-digit segments. The value
            is interpolated verbatim into a regex character class
            (``[...]``), so regex-special characters must already be
            escaped by the caller. An empty string disables deletion.
        as_space: Characters to replace with a single space in non-digit
            segments. Interpolated into a character class the same way as
            ``delete_chars`` and applied before it. An empty string disables
            the replacement.

    Returns:
        A list of strings: each digit run as a single element, and each
        remaining character as its own element. Every ``","`` is padded to
        ``" , "`` before being split into characters, so it yields the three
        tokens ``" "``, ``","``, ``" "``.
    """
    # Build the substitution patterns once; they do not change per segment.
    space_re = re.compile(r"[{}]".format(as_space)) if as_space else None
    delete_re = re.compile(r"[{}]".format(delete_chars)) if delete_chars else None

    tokens = []
    # Zero-width split at every digit/non-digit boundary. A boundary at the
    # very start or end of the string yields an empty segment, which falls
    # through to the character branch and contributes no tokens.
    for segment in re.split(r"(?<![\d])(?=[\d])|(?<=[\d])(?![\d])", ipa):
        if segment.isdigit():
            tokens.append(segment)
            continue

        if space_re is not None:
            segment = space_re.sub(" ", segment)
        if delete_re is not None:
            segment = delete_re.sub("", segment)

        # extend() on a string adds one element per character.
        tokens.extend(segment.replace(",", " , "))

    return tokens
|