File size: 2,477 Bytes
d358e26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import re
from pypinyin import lazy_pinyin, Style
from .custom_pypinyin_dict import phrase_pinyin_data
phrase_pinyin_data.load()
import jieba
from .cn2an import an2cn

# Punctuation normalization
# Maps CJK / full-width punctuation (and a few ASCII marks) onto the small
# ASCII punctuation set kept in the output.
# Every non-ASCII key is spelled as a \u escape: str.maketrans() raises
# ValueError for string keys longer than one character, which is exactly what
# happens when this file is saved or read with the wrong encoding and a single
# code point turns into a multi-character mojibake sequence.
punc_map = {
    ":": ",",
    "\uff1a": ",",  # FULLWIDTH COLON
    "\uff1b": ",",  # FULLWIDTH SEMICOLON
    ",": ",",
    "\uff0c": ",",  # FULLWIDTH COMMA
    "\u3002": ".",  # IDEOGRAPHIC FULL STOP
    "!": "!",
    "\uff01": "!",  # FULLWIDTH EXCLAMATION MARK
    "?": "?",
    "\uff1f": "?",  # FULLWIDTH QUESTION MARK
    "\n": ".",
    "\u00b7": ",",  # MIDDLE DOT
    "\u3001": ",",  # IDEOGRAPHIC COMMA
    "$": ".",
    "\u201c": "'",  # LEFT DOUBLE QUOTATION MARK
    "\u201d": "'",  # RIGHT DOUBLE QUOTATION MARK
    '"': "'",
    "\u2018": "'",  # LEFT SINGLE QUOTATION MARK
    "\u2019": "'",  # RIGHT SINGLE QUOTATION MARK
    "\uff08": "'",  # FULLWIDTH LEFT PARENTHESIS
    "\uff09": "'",  # FULLWIDTH RIGHT PARENTHESIS
    "(": "'",
    ")": "'",
    "\u300a": "'",  # LEFT DOUBLE ANGLE BRACKET
    "\u300b": "'",  # RIGHT DOUBLE ANGLE BRACKET
    "\u3010": "'",  # LEFT BLACK LENTICULAR BRACKET
    "\u3011": "'",  # RIGHT BLACK LENTICULAR BRACKET
    "[": "'",
    "]": "'",
    "\u2014": "-",  # EM DASH
    "~": "~",
    "\uff5e": "~",  # FULLWIDTH TILDE
    "\u300c": "'",  # LEFT CORNER BRACKET
    "\u300d": "'",  # RIGHT CORNER BRACKET
    "\u300e": "'",  # LEFT WHITE CORNER BRACKET
    "\u300f": "'",  # RIGHT WHITE CORNER BRACKET
}
# Translation table for str.translate(); built once at import time.
punc_table = str.maketrans(punc_map)

# Number normalization
# A run of digits, optionally followed by more digits with an optional dot
# in between (so both "7" and "12.5" match as one numeral).
number_pattern = re.compile(r'\d+(?:\.?\d+)?')


def replace_number(match):
    """Pass the text of one ``number_pattern`` match to ``an2cn`` and return the result."""
    numeral = match.group(0)
    return an2cn(numeral)


def normalize_number(text):
    """Return *text* with every ``number_pattern`` match rewritten by ``replace_number``."""
    return re.sub(number_pattern, replace_number, text)

def load_pinyin_symbols(path):
    """Load a pinyin-to-phones table and print the toned phone inventory.

    Each non-blank line of the UTF-8 file at *path* is split on commas; the
    first field is the pinyin key and the second field is a
    whitespace-separated list of phones. After reading, every distinct phone
    is combined with the tone digits 1-5 and the resulting symbols are printed
    as one list, shortest first.

    Args:
        path: Path of the table file to read.

    Returns:
        A dict mapping each pinyin key to its list of phones.

    Raises:
        IndexError: If a non-blank line contains no comma.
        OSError: If the file cannot be opened.
    """
    pinyin_dict = {}
    distinct_phones = set()
    with open(path, "r", encoding='utf-8') as f:
        for line in f:
            entry = line.strip()
            if not entry:
                # Blank lines (e.g. a trailing newline at EOF) carry no mapping.
                continue
            cuts = entry.split(',')
            pinyin = cuts[0]
            # split() without an argument collapses repeated spaces, so no
            # empty phone symbol is produced; this matches load_pinyin_dict.
            phones = cuts[1].split()
            pinyin_dict[pinyin] = phones
            distinct_phones.update(phones)
    # Sort the phones first: set iteration order for strings varies between
    # interpreter runs, which would make the printed list irreproducible.
    tone = [phone + str(i) for phone in sorted(distinct_phones) for i in range(1, 6)]
    print(sorted(tone, key=len))
    return pinyin_dict

def load_pinyin_dict(path):
    """Read a pinyin-to-phones table from a UTF-8 text file.

    Each non-blank line has the form ``pinyin,phone phone ...``: the text
    before the first comma is the key and the rest is split on whitespace
    into the phone list.

    Args:
        path: Path of the table file to read.

    Returns:
        A dict mapping each pinyin key to its list of phones.

    Raises:
        ValueError: If a non-blank line contains no comma.
        OSError: If the file cannot be opened.
    """
    pinyin_dict = {}
    with open(path, "r", encoding='utf-8') as f:
        for lineno, line in enumerate(f, 1):
            entry = line.strip()
            if not entry:
                # Blank lines (e.g. a trailing newline at EOF) carry no mapping.
                continue
            key, sep, value = entry.partition(',')
            if not sep:
                # Same exception type the bare tuple-unpacking used to raise,
                # but with enough context to locate the offending line.
                raise ValueError(
                    "%s:%d: expected 'pinyin,phones', got %r" % (path, lineno, entry)
                )
            pinyin_dict[key] = value.split()
    return pinyin_dict
# Module-level syllable -> phones table read by chinese_to_cnm3; loaded once,
# at import time.
# NOTE(review): the path is relative, so open() resolves it against the
# current working directory — presumably the repository root; confirm against
# how this module is launched.
pinyin_dict = load_pinyin_dict('text/cnm3/ds_CNM3.txt')

def chinese_to_cnm3(text: str):
    """Convert *text* into a flat list of phone symbols.

    The text is passed through ``punc_table`` and ``normalize_number``,
    segmented with ``jieba.lcut`` and each segment is converted with
    ``lazy_pinyin`` (``Style.TONE3``, ``neutral_tone_with_five=True``).
    For every returned item:

    * if it ends in a digit, that digit is taken as the tone and appended to
      each phone that ``pinyin_dict`` lists for the remaining syllable;
    * if it ends in a letter, it is dropped;
    * otherwise its characters are added to the output one by one.

    Args:
        text: Input string to convert.

    Returns:
        A list of phone / symbol strings in input order.

    Raises:
        KeyError: If a syllable is not a key of ``pinyin_dict``.
    """
    normalized = normalize_number(text.translate(punc_table))
    result = []
    for segment in jieba.lcut(normalized, cut_all=False):
        items = lazy_pinyin(segment, style=Style.TONE3, neutral_tone_with_five=True)
        for item in items:
            last_char = item[-1]
            if last_char.isdigit():
                # Trailing digit is the tone; the rest is the syllable to look up.
                base_phones = pinyin_dict[item[:-1]]
                result.extend([base + last_char for base in base_phones])
                continue
            if not last_char.isalpha():
                # Extending with a string adds each character separately.
                result.extend(item)
    return result