|
import os |
|
import sys |
|
import re |
|
from pypinyin import lazy_pinyin, BOPOMOFO |
|
import jieba |
|
import cn2an |
|
import logging |
|
|
|
# Import-time setup for the jieba segmenter.
# Raise the 'jieba' logger threshold to WARNING so lower-severity messages
# from that logger are suppressed.
logging.getLogger('jieba').setLevel(logging.WARNING)
# The dictionary path is relative, so it is resolved against the current
# working directory of the process importing this module.
jieba.set_dictionary(r'./jieba/dict.txt')
# Initialize explicitly here, after the dictionary path has been set.
jieba.initialize()
|
|
|
|
|
|
|
# Ordered list of (compiled regex, replacement) pairs, one per Latin letter.
# Each regex matches its letter case-insensitively; the replacement is the
# bopomofo string substituted for that letter by latin_to_bopomofo().
_latin_to_bopomofo = [
    (re.compile(letter, re.IGNORECASE), reading)
    for letter, reading in (
        ('a', 'ㄟˉ'),
        ('b', 'ㄅㄧˋ'),
        ('c', 'ㄙㄧˉ'),
        ('d', 'ㄉㄧˋ'),
        ('e', 'ㄧˋ'),
        ('f', 'ㄝˊㄈㄨˋ'),
        ('g', 'ㄐㄧˋ'),
        ('h', 'ㄝˇㄑㄩˋ'),
        ('i', 'ㄞˋ'),
        ('j', 'ㄐㄟˋ'),
        ('k', 'ㄎㄟˋ'),
        ('l', 'ㄝˊㄛˋ'),
        ('m', 'ㄝˊㄇㄨˋ'),
        ('n', 'ㄣˉ'),
        ('o', 'ㄡˉ'),
        ('p', 'ㄆㄧˉ'),
        ('q', 'ㄎㄧㄡˉ'),
        ('r', 'ㄚˋ'),
        ('s', 'ㄝˊㄙˋ'),
        ('t', 'ㄊㄧˋ'),
        ('u', 'ㄧㄡˉ'),
        ('v', 'ㄨㄧˉ'),
        ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
        ('x', 'ㄝˉㄎㄨˋㄙˋ'),
        ('y', 'ㄨㄞˋ'),
        ('z', 'ㄗㄟˋ'),
    )
]
|
|
|
|
|
# Ordered list of (compiled regex, replacement) pairs converting bopomofo
# symbols, tone marks and punctuation to the romaji-style notation.
#
# The rules are meant to be applied one after another in list order, so
# multi-symbol keys (e.g. 'ㄅㄛ', 'ㄧㄢ') are listed before the single symbols
# they contain.
#
# Every key is a literal string, not a regex: it is passed through
# re.escape() so that keys which are regex metacharacters (notably '?',
# which on its own is an invalid pattern and made re.compile() raise at
# import time) are matched literally.
#
# NOTE(review): the ASCII punctuation keys (',', '!', '?') map to themselves;
# confirm whether full-width keys were intended here.
_bopomofo_to_romaji = [
    (re.compile(re.escape(symbol)), romaji)
    for symbol, romaji in (
        ('ㄅㄛ', 'p⁼wo'),
        ('ㄆㄛ', 'pʰwo'),
        ('ㄇㄛ', 'mwo'),
        ('ㄈㄛ', 'fwo'),
        ('ㄅ', 'p⁼'),
        ('ㄆ', 'pʰ'),
        ('ㄇ', 'm'),
        ('ㄈ', 'f'),
        ('ㄉ', 't⁼'),
        ('ㄊ', 'tʰ'),
        ('ㄋ', 'n'),
        ('ㄌ', 'l'),
        ('ㄍ', 'k⁼'),
        ('ㄎ', 'kʰ'),
        ('ㄏ', 'h'),
        ('ㄐ', 'ʧ⁼'),
        ('ㄑ', 'ʧʰ'),
        ('ㄒ', 'ʃ'),
        ('ㄓ', 'ʦ`⁼'),
        ('ㄔ', 'ʦ`ʰ'),
        ('ㄕ', 's`'),
        ('ㄖ', 'ɹ`'),
        ('ㄗ', 'ʦ⁼'),
        ('ㄘ', 'ʦʰ'),
        ('ㄙ', 's'),
        ('ㄚ', 'a'),
        ('ㄛ', 'o'),
        ('ㄜ', 'ə'),
        ('ㄝ', 'e'),
        ('ㄞ', 'ai'),
        ('ㄟ', 'ei'),
        ('ㄠ', 'au'),
        ('ㄡ', 'ou'),
        ('ㄧㄢ', 'yeNN'),
        ('ㄢ', 'aNN'),
        ('ㄧㄣ', 'iNN'),
        ('ㄣ', 'əNN'),
        ('ㄤ', 'aNg'),
        ('ㄧㄥ', 'iNg'),
        ('ㄨㄥ', 'uNg'),
        ('ㄩㄥ', 'yuNg'),
        ('ㄥ', 'əNg'),
        ('ㄦ', 'əɻ'),
        ('ㄧ', 'i'),
        ('ㄨ', 'u'),
        ('ㄩ', 'ɥ'),
        ('ˉ', '→'),
        ('ˊ', '↑'),
        ('ˇ', '↓↑'),
        ('ˋ', '↓'),
        ('˙', ''),
        (',', ','),
        ('。', '.'),
        ('!', '!'),
        ('?', '?'),
        ('—', '-'),
    )
]
|
|
|
|
|
# Ordered list of (compiled regex, replacement) pairs applied on top of the
# romaji output by chinese_to_lazy_ipa(). All patterns are compiled with
# re.IGNORECASE. The multi-character rules come first so they are rewritten
# before the bare 'y' and 'h' rules run.
_romaji_to_ipa = [
    (re.compile(romaji, re.IGNORECASE), ipa)
    for romaji, ipa in (
        ('ʃy', 'ʃ'),
        ('ʧʰy', 'ʧʰ'),
        ('ʧ⁼y', 'ʧ⁼'),
        ('NN', 'n'),
        ('Ng', 'ŋ'),
        ('y', 'j'),
        ('h', 'x'),
    )
]
|
|
|
|
|
# Ordered list of (compiled regex, replacement) pairs converting bopomofo
# symbols, tone marks and punctuation to the IPA-style notation that uses
# arrow tone marks.
#
# The rules are meant to be applied one after another in list order, so
# multi-symbol keys (e.g. 'ㄅㄛ', 'ㄩㄢ') are listed before the single symbols
# they contain.
#
# Every key is a literal string, not a regex: it is passed through
# re.escape() so that keys which are regex metacharacters (notably '?',
# which on its own is an invalid pattern and made re.compile() raise at
# import time) are matched literally.
#
# NOTE(review): the ASCII punctuation keys (',', '!', '?') map to themselves;
# confirm whether full-width keys were intended here.
_bopomofo_to_ipa = [
    (re.compile(re.escape(symbol)), ipa)
    for symbol, ipa in (
        ('ㄅㄛ', 'p⁼wo'),
        ('ㄆㄛ', 'pʰwo'),
        ('ㄇㄛ', 'mwo'),
        ('ㄈㄛ', 'fwo'),
        ('ㄅ', 'p⁼'),
        ('ㄆ', 'pʰ'),
        ('ㄇ', 'm'),
        ('ㄈ', 'f'),
        ('ㄉ', 't⁼'),
        ('ㄊ', 'tʰ'),
        ('ㄋ', 'n'),
        ('ㄌ', 'l'),
        ('ㄍ', 'k⁼'),
        ('ㄎ', 'kʰ'),
        ('ㄏ', 'x'),
        ('ㄐ', 'tʃ⁼'),
        ('ㄑ', 'tʃʰ'),
        ('ㄒ', 'ʃ'),
        ('ㄓ', 'ts`⁼'),
        ('ㄔ', 'ts`ʰ'),
        ('ㄕ', 's`'),
        ('ㄖ', 'ɹ`'),
        ('ㄗ', 'ts⁼'),
        ('ㄘ', 'tsʰ'),
        ('ㄙ', 's'),
        ('ㄚ', 'a'),
        ('ㄛ', 'o'),
        ('ㄜ', 'ə'),
        ('ㄝ', 'ɛ'),
        ('ㄞ', 'aɪ'),
        ('ㄟ', 'eɪ'),
        ('ㄠ', 'ɑʊ'),
        ('ㄡ', 'oʊ'),
        ('ㄧㄢ', 'jɛn'),
        ('ㄩㄢ', 'ɥæn'),
        ('ㄢ', 'an'),
        ('ㄧㄣ', 'in'),
        ('ㄩㄣ', 'ɥn'),
        ('ㄣ', 'ən'),
        ('ㄤ', 'ɑŋ'),
        ('ㄧㄥ', 'iŋ'),
        ('ㄨㄥ', 'ʊŋ'),
        ('ㄩㄥ', 'jʊŋ'),
        ('ㄥ', 'əŋ'),
        ('ㄦ', 'əɻ'),
        ('ㄧ', 'i'),
        ('ㄨ', 'u'),
        ('ㄩ', 'ɥ'),
        ('ˉ', '→'),
        ('ˊ', '↑'),
        ('ˇ', '↓↑'),
        ('ˋ', '↓'),
        ('˙', ''),
        (',', ','),
        ('。', '.'),
        ('!', '!'),
        ('?', '?'),
        ('—', '-'),
    )
]
|
|
|
|
|
# Ordered list of (compiled regex, replacement) pairs converting bopomofo
# symbols, tone marks and punctuation to the IPA-style notation that uses
# tone letters ('˥', '˧˥', '˨˩˦', '˥˩').
#
# The rules are meant to be applied one after another in list order, so
# multi-symbol keys (e.g. 'ㄅㄛ', 'ㄩㄢ') are listed before the single symbols
# they contain.
#
# Every key is a literal string, not a regex: it is passed through
# re.escape() so that keys which are regex metacharacters (notably '?',
# which on its own is an invalid pattern and made re.compile() raise at
# import time) are matched literally.
#
# NOTE(review): the ASCII punctuation keys (',', '!', '?') map to themselves;
# confirm whether full-width keys were intended here.
_bopomofo_to_ipa2 = [
    (re.compile(re.escape(symbol)), ipa)
    for symbol, ipa in (
        ('ㄅㄛ', 'pwo'),
        ('ㄆㄛ', 'pʰwo'),
        ('ㄇㄛ', 'mwo'),
        ('ㄈㄛ', 'fwo'),
        ('ㄅ', 'p'),
        ('ㄆ', 'pʰ'),
        ('ㄇ', 'm'),
        ('ㄈ', 'f'),
        ('ㄉ', 't'),
        ('ㄊ', 'tʰ'),
        ('ㄋ', 'n'),
        ('ㄌ', 'l'),
        ('ㄍ', 'k'),
        ('ㄎ', 'kʰ'),
        ('ㄏ', 'h'),
        ('ㄐ', 'tɕ'),
        ('ㄑ', 'tɕʰ'),
        ('ㄒ', 'ɕ'),
        ('ㄓ', 'tʂ'),
        ('ㄔ', 'tʂʰ'),
        ('ㄕ', 'ʂ'),
        ('ㄖ', 'ɻ'),
        ('ㄗ', 'ts'),
        ('ㄘ', 'tsʰ'),
        ('ㄙ', 's'),
        ('ㄚ', 'a'),
        ('ㄛ', 'o'),
        ('ㄜ', 'ɤ'),
        ('ㄝ', 'ɛ'),
        ('ㄞ', 'aɪ'),
        ('ㄟ', 'eɪ'),
        ('ㄠ', 'ɑʊ'),
        ('ㄡ', 'oʊ'),
        ('ㄧㄢ', 'jɛn'),
        ('ㄩㄢ', 'yæn'),
        ('ㄢ', 'an'),
        ('ㄧㄣ', 'in'),
        ('ㄩㄣ', 'yn'),
        ('ㄣ', 'ən'),
        ('ㄤ', 'ɑŋ'),
        ('ㄧㄥ', 'iŋ'),
        ('ㄨㄥ', 'ʊŋ'),
        ('ㄩㄥ', 'jʊŋ'),
        ('ㄥ', 'ɤŋ'),
        ('ㄦ', 'əɻ'),
        ('ㄧ', 'i'),
        ('ㄨ', 'u'),
        ('ㄩ', 'y'),
        ('ˉ', '˥'),
        ('ˊ', '˧˥'),
        ('ˇ', '˨˩˦'),
        ('ˋ', '˥˩'),
        ('˙', ''),
        (',', ','),
        ('。', '.'),
        ('!', '!'),
        ('?', '?'),
        ('—', '-'),
    )
]
|
|
|
|
|
def number_to_chinese(text):
    """Return *text* with each number replaced by ``cn2an.an2cn`` output.

    A number is a run of digits, optionally followed by more digits with an
    optional '.' in between (``\\d+(?:\\.?\\d+)?``). Matches are converted
    left to right; text without digits is returned unchanged.
    """
    def _spell_out(match):
        # Hand the matched digit string to cn2an exactly as it appears.
        return cn2an.an2cn(match.group(0))

    return re.sub(r'\d+(?:\.?\d+)?', _spell_out, text)
|
|
|
|
|
def chinese_to_bopomofo(text):
    """Convert the Chinese words in *text* to bopomofo.

    The characters '、', ';' and ':' are first replaced by ','. The text is
    then segmented with ``jieba.lcut`` and each token is handled as follows:

    * a token with no character in U+4E00..U+9FFF is copied through as-is;
    * any other token is converted with ``lazy_pinyin(word, BOPOMOFO)``, its
      syllables are joined, and it is separated from preceding output by a
      single space.

    Returns the converted string.
    """
    text = text.replace('、', ',').replace(';', ',').replace(':', ',')
    result = ''
    for word in jieba.lcut(text, cut_all=False):
        if not re.search('[\u4e00-\u9fff]', word):
            # Nothing to transliterate; skip the pinyin lookup entirely
            # instead of computing a result that would be thrown away.
            result += word
            continue
        # A syllable that ends in a bopomofo letter has no trailing tone
        # mark, so give it an explicit 'ˉ' (presumably pypinyin leaves the
        # first tone unmarked - confirm against its BOPOMOFO style docs).
        syllables = [
            re.sub(r'([\u3105-\u3129])$', r'\1ˉ', syllable)
            for syllable in lazy_pinyin(word, BOPOMOFO)
        ]
        if result:
            result += ' '
        result += ''.join(syllables)
    return result
|
|
|
|
|
def latin_to_bopomofo(text):
    """Apply every rule in ``_latin_to_bopomofo`` to *text*, in list order.

    Returns the text after all substitutions have been made.
    """
    converted = text
    for pattern, bopomofo in _latin_to_bopomofo:
        converted = pattern.sub(bopomofo, converted)
    return converted
|
|
|
|
|
def bopomofo_to_romaji(text):
    """Apply every rule in ``_bopomofo_to_romaji`` to *text*, in list order.

    Returns the text after all substitutions have been made.
    """
    converted = text
    for pattern, romaji in _bopomofo_to_romaji:
        converted = pattern.sub(romaji, converted)
    return converted
|
|
|
|
|
def bopomofo_to_ipa(text):
    """Apply every rule in ``_bopomofo_to_ipa`` to *text*, in list order.

    Returns the text after all substitutions have been made.
    """
    converted = text
    for pattern, ipa in _bopomofo_to_ipa:
        converted = pattern.sub(ipa, converted)
    return converted
|
|
|
|
|
def bopomofo_to_ipa2(text):
    """Apply every rule in ``_bopomofo_to_ipa2`` to *text*, in list order.

    Returns the text after all substitutions have been made.
    """
    converted = text
    for pattern, ipa in _bopomofo_to_ipa2:
        converted = pattern.sub(ipa, converted)
    return converted
|
|
|
|
|
def chinese_to_romaji(text):
    """Convert Chinese *text* to the romaji-style notation.

    Runs number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo and
    bopomofo_to_romaji in that order, then applies four regex clean-ups to
    the result and returns it.
    """
    pipeline = (
        number_to_chinese,
        chinese_to_bopomofo,
        latin_to_bopomofo,
        bopomofo_to_romaji,
    )
    for stage in pipeline:
        text = stage(text)

    # 'i' / 'u' directly before a listed vowel are respelled 'y' / 'w'.
    text = re.sub('i([aoe])', r'y\1', text)
    text = re.sub('u([aoəe])', r'w\1', text)

    # A backtick-marked initial ('ʦ`', 's`', 'ɹ`', optionally with '⁼'/'ʰ')
    # that is immediately followed by tone arrows/spaces or the end of the
    # string gets 'ɹ`' inserted after it; every 'ɻ' is then rewritten 'ɹ`'.
    with_retroflex = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ`\2', text)
    text = with_retroflex.replace('ɻ', 'ɹ`')

    # Same idea for plain 'ʦ' / 's' initials, which get 'ɹ' inserted.
    return re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
|
|
|
|
|
def chinese_to_lazy_ipa(text):
    """Convert Chinese *text* to romaji, then rewrite it with ``_romaji_to_ipa``.

    The rules are applied in list order to the output of chinese_to_romaji;
    the final string is returned.
    """
    result = chinese_to_romaji(text)
    for pattern, ipa in _romaji_to_ipa:
        result = pattern.sub(ipa, result)
    return result
|
|
|
|
|
def chinese_to_ipa(text):
    """Convert Chinese *text* to the arrow-toned IPA-style notation.

    Runs number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo and
    bopomofo_to_ipa in that order, then applies four regex clean-ups to the
    result and returns it.
    """
    pipeline = (
        number_to_chinese,
        chinese_to_bopomofo,
        latin_to_bopomofo,
        bopomofo_to_ipa,
    )
    for stage in pipeline:
        text = stage(text)

    # 'i' / 'u' directly before a listed vowel are respelled 'j' / 'w'.
    text = re.sub('i([aoe])', r'j\1', text)
    text = re.sub('u([aoəe])', r'w\1', text)

    # A backtick-marked 's`' / 'ɹ`' (optionally with '⁼'/'ʰ') that is
    # immediately followed by tone arrows/spaces or the end of the string
    # gets 'ɹ`' inserted after it; every 'ɻ' is then rewritten 'ɹ`'.
    with_retroflex = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ`\2', text)
    text = with_retroflex.replace('ɻ', 'ɹ`')

    # Same idea for a plain 's' (optionally with '⁼'/'ʰ'): insert 'ɹ'.
    return re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
|
|
|
|
|
def chinese_to_ipa2(text):
    """Convert Chinese *text* to the tone-letter IPA-style notation.

    Runs number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo and
    bopomofo_to_ipa2 in that order, then applies four regex clean-ups to the
    result and returns it.
    """
    pipeline = (
        number_to_chinese,
        chinese_to_bopomofo,
        latin_to_bopomofo,
        bopomofo_to_ipa2,
    )
    for stage in pipeline:
        text = stage(text)

    # 'i' / 'u' directly before a listed vowel are respelled 'j' / 'w'.
    text = re.sub(r'i([aoe])', r'j\1', text)
    text = re.sub(r'u([aoəe])', r'w\1', text)

    # 'ʂ' / 'ɹ' (optionally aspirated) immediately followed by tone
    # letters/spaces or the end of the string gets 'ʅ' inserted after it.
    # NOTE(review): the ipa2 table emits 'ɻ' for ㄖ, which this character
    # class does not include - confirm whether that is intended.
    text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text)

    # Likewise 's' (optionally aspirated) gets 'ɿ' inserted.
    return re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text)
|
|