File size: 9,065 Bytes
f34bda5
01e655b
 
 
02e90e4
 
 
ebc4336
 
 
 
02e90e4
f34bda5
02e90e4
 
 
 
 
01e655b
f34bda5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01e655b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84cfd61
 
01e655b
 
 
 
 
84cfd61
 
01e655b
 
 
02e90e4
 
 
01e655b
 
 
 
 
 
02e90e4
 
01e655b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f34bda5
02e90e4
01e655b
 
 
 
 
 
 
 
 
 
 
 
02e90e4
 
 
 
 
ebc4336
 
02e90e4
d8e7d56
f34bda5
 
d8e7d56
02e90e4
 
 
 
 
 
 
 
 
 
 
 
 
 
84cfd61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01e655b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f34bda5
01e655b
 
84cfd61
01e655b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84cfd61
 
 
 
 
 
 
 
 
 
02e90e4
 
 
 
 
 
 
 
 
 
b473486
 
 
 
01e655b
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
from functools import lru_cache
from modules.utils.zh_normalization.text_normlization import *
import emojiswitch
from modules.utils.markdown import markdown_to_text
from modules import models
import re

# 是否关闭 unk token 检查
# NOTE: 单测的时候用于跳过模型加载
DISABLE_UNK_TOKEN_CHECK = False


@lru_cache(maxsize=64)
def is_chinese(text):
    # 中文字符的 Unicode 范围是 \u4e00-\u9fff
    chinese_pattern = re.compile(r"[\u4e00-\u9fff]")
    return bool(chinese_pattern.search(text))


@lru_cache(maxsize=64)
def is_eng(text):
    eng_pattern = re.compile(r"[a-zA-Z]")
    return bool(eng_pattern.search(text))


@lru_cache(maxsize=64)
def guess_lang(text):
    if is_chinese(text):
        return "zh"
    if is_eng(text):
        return "en"
    return "zh"


post_normalize_pipeline = []
pre_normalize_pipeline = []


def post_normalize():
    def decorator(func):
        post_normalize_pipeline.append(func)
        return func

    return decorator


def pre_normalize():
    def decorator(func):
        pre_normalize_pipeline.append(func)
        return func

    return decorator


def apply_pre_normalize(text):
    for func in pre_normalize_pipeline:
        text = func(text)
    return text


def apply_post_normalize(text):
    for func in post_normalize_pipeline:
        text = func(text)
    return text


def is_markdown(text):
    markdown_patterns = [
        r"(^|\s)#[^#]",  # 标题
        r"\*\*.*?\*\*",  # 加粗
        r"\*.*?\*",  # 斜体
        r"!\[.*?\]\(.*?\)",  # 图片
        r"\[.*?\]\(.*?\)",  # 链接
        r"`[^`]+`",  # 行内代码
        r"```[\s\S]*?```",  # 代码块
        r"(^|\s)\* ",  # 无序列表
        r"(^|\s)\d+\. ",  # 有序列表
        r"(^|\s)> ",  # 引用
        r"(^|\s)---",  # 分隔线
    ]

    for pattern in markdown_patterns:
        if re.search(pattern, text, re.MULTILINE):
            return True

    return False


character_map = {
    ":": ",",
    ";": ",",
    "!": "。",
    "(": ",",
    ")": ",",
    "【": ",",
    "】": ",",
    "『": ",",
    "』": ",",
    "「": ",",
    "」": ",",
    "《": ",",
    "》": ",",
    "-": ",",
    "‘": " ",
    "“": " ",
    "’": " ",
    "”": " ",
    '"': " ",
    "'": " ",
    ":": ",",
    ";": ",",
    "!": ".",
    "(": ",",
    ")": ",",
    "[": ",",
    "]": ",",
    ">": ",",
    "<": ",",
    "-": ",",
    "~": " ",
    "~": " ",
    "/": " ",
}

character_to_word = {
    " & ": " and ",
}

## ---------- post normalize ----------


@post_normalize()
def apply_character_to_word(text):
    for k, v in character_to_word.items():
        text = text.replace(k, v)
    return text


@post_normalize()
def apply_character_map(text):
    translation_table = str.maketrans(character_map)
    return text.translate(translation_table)


@post_normalize()
def apply_emoji_map(text):
    lang = guess_lang(text)
    return emojiswitch.demojize(text, delimiters=("", ""), lang=lang)


@post_normalize()
def insert_spaces_between_uppercase(s):
    # 使用正则表达式在每个相邻的大写字母之间插入空格
    return re.sub(
        r"(?<=[A-Z])(?=[A-Z])|(?<=[a-z])(?=[A-Z])|(?<=[\u4e00-\u9fa5])(?=[A-Z])|(?<=[A-Z])(?=[\u4e00-\u9fa5])",
        " ",
        s,
    )


@post_normalize()
def replace_unk_tokens(text):
    """
    把不在字典里的字符替换为 " , "
    """
    if DISABLE_UNK_TOKEN_CHECK:
        return text
    chat_tts = models.load_chat_tts()
    if "tokenizer" not in chat_tts.pretrain_models:
        # 这个地方只有在 huggingface spaces 中才会触发
        # 因为 hugggingface 自动处理模型卸载加载,所以如果拿不到就算了...
        return text
    tokenizer = chat_tts.pretrain_models["tokenizer"]
    vocab = tokenizer.get_vocab()
    vocab_set = set(vocab.keys())
    # 添加所有英语字符
    vocab_set.update(set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"))
    vocab_set.update(set(" \n\r\t"))
    replaced_chars = [char if char in vocab_set else " , " for char in text]
    output_text = "".join(replaced_chars)
    return output_text


## ---------- pre normalize ----------


@pre_normalize()
def apply_markdown_to_text(text):
    if is_markdown(text):
        text = markdown_to_text(text)
    return text


# 将 "xxx" => \nxxx\n
# 将 'xxx' => \nxxx\n
@pre_normalize()
def replace_quotes(text):
    repl = r"\n\1\n"
    patterns = [
        ['"', '"'],
        ["'", "'"],
        ["“", "”"],
        ["‘", "’"],
    ]
    for p in patterns:
        text = re.sub(rf"({p[0]}[^{p[0]}{p[1]}]+?{p[1]})", repl, text)
    return text


def ensure_suffix(a: str, b: str, c: str):
    a = a.strip()
    if not a.endswith(b):
        a += c
    return a


email_domain_map = {
    "outlook.com": "Out look",
    "hotmail.com": "Hot mail",
    "yahoo.com": "雅虎",
}


# 找到所有 email 并将 name 分割为单个字母,@替换为 at ,. 替换为 dot,常见域名替换为单词
#
# 例如:
# zhzluke96@outlook.com => z h z l u k e 9 6 at out look dot com
def email_detect(text):
    email_pattern = re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")

    def replace(match):
        email = match.group(1)
        name, domain = email.split("@")
        name = " ".join(name)
        if domain in email_domain_map:
            domain = email_domain_map[domain]
        domain = domain.replace(".", " dot ")
        return f"{name} at {domain}"

    return email_pattern.sub(replace, text)


def sentence_normalize(sentence_text: str):
    # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
    tx = TextNormalizer()

    # 匹配 \[.+?\] 的部分
    pattern = re.compile(r"(\[.+?\])|([^[]+)")

    def normalize_part(part):
        sentences = tx.normalize(part) if guess_lang(part) == "zh" else [part]
        dest_text = ""
        for sentence in sentences:
            sentence = apply_post_normalize(sentence)
            dest_text += sentence
        return dest_text

    def replace(match):
        if match.group(1):
            return f" {match.group(1)} "
        else:
            return normalize_part(match.group(2))

    result = pattern.sub(replace, sentence_text)

    # NOTE: 加了会有杂音...
    # if is_end:
    # 加这个是为了防止吞字
    # result = ensure_suffix(result, "[uv_break]", "。。。[uv_break]。。。")

    return result


def text_normalize(text, is_end=False):
    text = apply_pre_normalize(text)
    lines = text.split("\n")
    lines = [line.strip() for line in lines]
    lines = [line for line in lines if line]
    lines = [sentence_normalize(line) for line in lines]
    content = "\n".join(lines)
    return content


if __name__ == "__main__":
    test_cases = [
        "ChatTTS是专门为对话场景设计的文本转语音模型,例如LLM助手对话任务。它支持英文和中文两种语言。最大的模型使用了10万小时以上的中英文数据进行训练。在HuggingFace中开源的版本为4万小时训练且未SFT的版本.",
        " [oral_9] [laugh_0] [break_0] 电 [speed_0] 影 [speed_0] 中 梁朝伟 [speed_9] 扮演的陈永仁的编号27149",
        " 明天有62%的概率降雨",
        "大🍌,一条大🍌,嘿,你的感觉真的很奇妙  [lbreak]",
        """
# 你好,世界
```js
console.log('1')
```
**加粗**

*一条文本*
        """,
        """
在沙漠、岩石、雪地上行走了很长的时间以后,小王子终于发现了一条大路。所有的大路都是通往人住的地方的。
“你们好。”小王子说。
这是一个玫瑰盛开的花园。
“你好。”玫瑰花说道。
小王子瞅着这些花,它们全都和他的那朵花一样。
“你们是什么花?”小王子惊奇地问。
“我们是玫瑰花。”花儿们说道。
“啊!”小王子说……。
        """,
        """
State-of-the-art Machine Learning for PyTorch, TensorFlow, and JAX.

🤗 Transformers provides APIs and tools to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you the time and resources required to train a model from scratch. These models support common tasks in different modalities, such as:

📝 Natural Language Processing: text classification, named entity recognition, question answering, language modeling, summarization, translation, multiple choice, and text generation.
🖼️ Computer Vision: image classification, object detection, and segmentation.
🗣️ Audio: automatic speech recognition and audio classification.
🐙 Multimodal: table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.
        """,
        """
120米
有12%的概率会下雨
""",
    ]

    for i, test_case in enumerate(test_cases):
        print(f"case {i}:\n", {"x": text_normalize(test_case, is_end=True)})