txya900619 committed on
Commit
9b0e12a
·
1 Parent(s): 08c0111

feat: update parse_ipa to fit new lexicon format

Browse files
Files changed (1) hide show
  1. ipa/ipa.py +15 -20
ipa/ipa.py CHANGED
@@ -63,26 +63,21 @@ def get_ipa(raw_text, dialect):
63
 
64
  return final_words, final_ipa, final_pinyin, missing_words
65
 
66
- def parse_ipa(ipa: str):
67
  text = []
68
- ipa_list = re.split(r"(?<![, -])(?=[, -])|(?<=[, -])(?![, -])",ipa)
69
- # tone as a separate token
70
- for phoneme_with_tone in ipa_list:
71
- if phoneme_with_tone ==" ":
72
- text.append(phoneme_with_tone)
73
- continue
74
- elif phoneme_with_tone == ",":
75
- text.extend(" , ")
76
- continue
77
- elif phoneme_with_tone == "-": # use " " split 詞 (or use " " to split 字)
78
- continue
79
-
80
- split_phoneme_and_tone = phoneme_with_tone.split("_")
81
-
82
- if len(split_phoneme_and_tone) == 2:
83
- phoneme, tone = split_phoneme_and_tone
84
- text.extend(phoneme)
85
- text.append(tone)
86
  else:
87
- text.extend(split_phoneme_and_tone[0])
 
 
 
 
 
 
 
88
  return text
 
63
 
64
  return final_words, final_ipa, final_pinyin, missing_words
65
 
66
def parse_ipa(ipa: str, delete_chars=r"\+\-\|\_", as_space=""):
    """Split an IPA string into a flat list of tokens.

    The input is cut at every boundary between a run of digits and a run
    of non-digits. Each digit run is kept whole as a single token; every
    other run is cleaned up and then emitted one character per token.

    Args:
        ipa: The IPA string to tokenize.
        delete_chars: Characters removed from non-digit runs. The value is
            interpolated into a regex character class (``[...]``), so
            characters that are special inside a class must be escaped.
            An empty string disables deletion.
        as_space: Characters replaced by a single space in non-digit runs,
            using the same character-class convention. Replacement happens
            before deletion. An empty string disables replacement.

    Returns:
        A list of strings: multi-character digit runs and single
        characters. Every ``","`` is surrounded by space tokens.
    """
    # Compile the optional character classes once instead of on every chunk.
    space_re = re.compile(r"[{}]".format(as_space)) if as_space else None
    delete_re = re.compile(r"[{}]".format(delete_chars)) if delete_chars else None

    text = []
    # Zero-width split at each digit/non-digit boundary; a boundary at the
    # very start or end yields an empty chunk, which contributes no tokens.
    for chunk in re.split(r"(?<![\d])(?=[\d])|(?<=[\d])(?![\d])", ipa):
        if chunk.isdigit():
            text.append(chunk)
            continue

        if space_re is not None:
            chunk = space_re.sub(" ", chunk)
        if delete_re is not None:
            chunk = delete_re.sub("", chunk)

        # Pad commas so they come out as " ", ",", " " once split per character.
        text.extend(chunk.replace(",", " , "))

    return text