Spaces:

lenML
/

ChatTTS-Forge

Running on Zero

File size: 9,065 Bytes

from functools import lru_cache
from modules.utils.zh_normalization.text_normlization import *
import emojiswitch
from modules.utils.markdown import markdown_to_text
from modules import models
import re

# 是否关闭 unk token 检查
# NOTE: 单测的时候用于跳过模型加载
DISABLE_UNK_TOKEN_CHECK = False


@lru_cache(maxsize=64)
def is_chinese(text):
    # 中文字符的 Unicode 范围是 \u4e00-\u9fff
    chinese_pattern = re.compile(r"[\u4e00-\u9fff]")
    return bool(chinese_pattern.search(text))


@lru_cache(maxsize=64)
def is_eng(text):
    eng_pattern = re.compile(r"[a-zA-Z]")
    return bool(eng_pattern.search(text))


@lru_cache(maxsize=64)
def guess_lang(text):
    if is_chinese(text):
        return "zh"
    if is_eng(text):
        return "en"
    return "zh"


post_normalize_pipeline = []
pre_normalize_pipeline = []


def post_normalize():
    def decorator(func):
        post_normalize_pipeline.append(func)
        return func

    return decorator


def pre_normalize():
    def decorator(func):
        pre_normalize_pipeline.append(func)
        return func

    return decorator


def apply_pre_normalize(text):
    for func in pre_normalize_pipeline:
        text = func(text)
    return text


def apply_post_normalize(text):
    for func in post_normalize_pipeline:
        text = func(text)
    return text


def is_markdown(text):
    markdown_patterns = [
        r"(^|\s)#[^#]",  # 标题
        r"\*\*.*?\*\*",  # 加粗
        r"\*.*?\*",  # 斜体
        r"!\[.*?\]\(.*?\)",  # 图片
        r"\[.*?\]\(.*?\)",  # 链接
        r"`[^`]+`",  # 行内代码
        r"```[\s\S]*?```",  # 代码块
        r"(^|\s)\* ",  # 无序列表
        r"(^|\s)\d+\. ",  # 有序列表
        r"(^|\s)> ",  # 引用
        r"(^|\s)---",  # 分隔线
    ]

    for pattern in markdown_patterns:
        if re.search(pattern, text, re.MULTILINE):
            return True

    return False


character_map = {
    "：": "，",
    "；": "，",
    "！": "。",
    "（": "，",
    "）": "，",
    "【": "，",
    "】": "，",
    "『": "，",
    "』": "，",
    "「": "，",
    "」": "，",
    "《": "，",
    "》": "，",
    "－": "，",
    "‘": " ",
    "“": " ",
    "’": " ",
    "”": " ",
    '"': " ",
    "'": " ",
    ":": ",",
    ";": ",",
    "!": ".",
    "(": ",",
    ")": ",",
    "[": ",",
    "]": ",",
    ">": ",",
    "<": ",",
    "-": ",",
    "~": " ",
    "～": " ",
    "/": " ",
}

character_to_word = {
    " & ": " and ",
}

## ---------- post normalize ----------


@post_normalize()
def apply_character_to_word(text):
    for k, v in character_to_word.items():
        text = text.replace(k, v)
    return text


@post_normalize()
def apply_character_map(text):
    translation_table = str.maketrans(character_map)
    return text.translate(translation_table)


@post_normalize()
def apply_emoji_map(text):
    lang = guess_lang(text)
    return emojiswitch.demojize(text, delimiters=("", ""), lang=lang)


@post_normalize()
def insert_spaces_between_uppercase(s):
    # 使用正则表达式在每个相邻的大写字母之间插入空格
    return re.sub(
        r"(?<=[A-Z])(?=[A-Z])|(?<=[a-z])(?=[A-Z])|(?<=[\u4e00-\u9fa5])(?=[A-Z])|(?<=[A-Z])(?=[\u4e00-\u9fa5])",
        " ",
        s,
    )


@post_normalize()
def replace_unk_tokens(text):
    """
    把不在字典里的字符替换为 " , "
    """
    if DISABLE_UNK_TOKEN_CHECK:
        return text
    chat_tts = models.load_chat_tts()
    if "tokenizer" not in chat_tts.pretrain_models:
        # 这个地方只有在 huggingface spaces 中才会触发
        # 因为 hugggingface 自动处理模型卸载加载，所以如果拿不到就算了...
        return text
    tokenizer = chat_tts.pretrain_models["tokenizer"]
    vocab = tokenizer.get_vocab()
    vocab_set = set(vocab.keys())
    # 添加所有英语字符
    vocab_set.update(set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"))
    vocab_set.update(set(" \n\r\t"))
    replaced_chars = [char if char in vocab_set else " , " for char in text]
    output_text = "".join(replaced_chars)
    return output_text


## ---------- pre normalize ----------


@pre_normalize()
def apply_markdown_to_text(text):
    if is_markdown(text):
        text = markdown_to_text(text)
    return text


# 将 "xxx" => \nxxx\n
# 将 'xxx' => \nxxx\n
@pre_normalize()
def replace_quotes(text):
    repl = r"\n\1\n"
    patterns = [
        ['"', '"'],
        ["'", "'"],
        ["“", "”"],
        ["‘", "’"],
    ]
    for p in patterns:
        text = re.sub(rf"({p[0]}[^{p[0]}{p[1]}]+?{p[1]})", repl, text)
    return text


def ensure_suffix(a: str, b: str, c: str):
    a = a.strip()
    if not a.endswith(b):
        a += c
    return a


email_domain_map = {
    "outlook.com": "Out look",
    "hotmail.com": "Hot mail",
    "yahoo.com": "雅虎",
}


# 找到所有 email 并将 name 分割为单个字母，@替换为 at ，. 替换为 dot，常见域名替换为单词
#
# 例如:
# zhzluke96@outlook.com => z h z l u k e 9 6 at out look dot com
def email_detect(text):
    email_pattern = re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")

    def replace(match):
        email = match.group(1)
        name, domain = email.split("@")
        name = " ".join(name)
        if domain in email_domain_map:
            domain = email_domain_map[domain]
        domain = domain.replace(".", " dot ")
        return f"{name} at {domain}"

    return email_pattern.sub(replace, text)


def sentence_normalize(sentence_text: str):
    # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
    tx = TextNormalizer()

    # 匹配 \[.+?\] 的部分
    pattern = re.compile(r"(\[.+?\])|([^[]+)")

    def normalize_part(part):
        sentences = tx.normalize(part) if guess_lang(part) == "zh" else [part]
        dest_text = ""
        for sentence in sentences:
            sentence = apply_post_normalize(sentence)
            dest_text += sentence
        return dest_text

    def replace(match):
        if match.group(1):
            return f" {match.group(1)} "
        else:
            return normalize_part(match.group(2))

    result = pattern.sub(replace, sentence_text)

    # NOTE: 加了会有杂音...
    # if is_end:
    # 加这个是为了防止吞字
    # result = ensure_suffix(result, "[uv_break]", "。。。[uv_break]。。。")

    return result


def text_normalize(text, is_end=False):
    text = apply_pre_normalize(text)
    lines = text.split("\n")
    lines = [line.strip() for line in lines]
    lines = [line for line in lines if line]
    lines = [sentence_normalize(line) for line in lines]
    content = "\n".join(lines)
    return content


if __name__ == "__main__":
    test_cases = [
        "ChatTTS是专门为对话场景设计的文本转语音模型，例如LLM助手对话任务。它支持英文和中文两种语言。最大的模型使用了10万小时以上的中英文数据进行训练。在HuggingFace中开源的版本为4万小时训练且未SFT的版本.",
        " [oral_9] [laugh_0] [break_0] 电 [speed_0] 影 [speed_0] 中 梁朝伟 [speed_9] 扮演的陈永仁的编号27149",
        " 明天有62％的概率降雨",
        "大🍌，一条大🍌，嘿，你的感觉真的很奇妙  [lbreak]",
        """
# 你好，世界
```js
console.log('1')
```
**加粗**

*一条文本*
        """,
        """
在沙漠、岩石、雪地上行走了很长的时间以后，小王子终于发现了一条大路。所有的大路都是通往人住的地方的。
“你们好。”小王子说。
这是一个玫瑰盛开的花园。
“你好。”玫瑰花说道。
小王子瞅着这些花，它们全都和他的那朵花一样。
“你们是什么花？”小王子惊奇地问。
“我们是玫瑰花。”花儿们说道。
“啊！”小王子说……。
        """,
        """
State-of-the-art Machine Learning for PyTorch, TensorFlow, and JAX.

🤗 Transformers provides APIs and tools to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you the time and resources required to train a model from scratch. These models support common tasks in different modalities, such as:

📝 Natural Language Processing: text classification, named entity recognition, question answering, language modeling, summarization, translation, multiple choice, and text generation.
🖼️ Computer Vision: image classification, object detection, and segmentation.
🗣️ Audio: automatic speech recognition and audio classification.
🐙 Multimodal: table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.
        """,
        """
120米
有12%的概率会下雨
""",
    ]

    for i, test_case in enumerate(test_cases):
        print(f"case {i}:\n", {"x": text_normalize(test_case, is_end=True)})