File size: 2,820 Bytes
9cca5e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import re


def extract_language_and_text_updated(speaker, dialogue):
    """Split one speaker's dialogue into language-tagged segments.

    Args:
        speaker: Speaker label including one enclosing character on each
            side (e.g. ``"[name]"``); the first and last characters are
            dropped.
        dialogue: Text made of ``<lang>text`` pieces.

    Returns:
        A list of ``(LANG, text)`` tuples in order of appearance, with the
        language tag upper-cased and the text stripped of surrounding
        whitespace, followed by the bare speaker name as the final element.
    """
    segments = []
    # Each hit is a <lang> tag plus everything up to the next "<".
    for tag_match in re.finditer(r"<(\S+?)>([^<]+)", dialogue, re.DOTALL):
        language, content = tag_match.groups()
        segments.append((language.upper(), content.strip()))
    # The speaker name rides along as the last item of the list.
    segments.append(speaker[1:-1])
    return segments


def validate_text(input_text):
    """Check that the input contains at least one well-formed speaker block.

    A speaker block is a ``[speaker]`` label followed by one or more
    ``<lang>text`` segments whose text contains none of ``<``, ``[``, ``]``.

    Args:
        input_text: The raw text to validate.

    Returns:
        A ``(is_valid, message)`` tuple: ``(True, "Input is valid.")`` when at
        least one speaker block is found, otherwise ``False`` with an error
        message.
    """
    pattern_speaker = r"(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+?)+)"

    # NOTE: the pattern contains no ".", so re.DOTALL does not change what it
    # matches; the flag is kept as in the original call.
    matches = re.findall(pattern_speaker, input_text, re.DOTALL)

    # Nothing in the input looks like "[speaker]<lang>text".
    if not matches:
        return (
            False,
            "Error: No valid speaker format detected. Please check your input.",
        )

    for speaker, dialogue in matches:
        parsed = extract_language_and_text_updated(speaker, dialogue)
        # The helper always appends the speaker name as the final element, so
        # the list itself is never empty; inspect only the (LANG, text)
        # segments that precede it.
        segments = parsed[:-1]
        if not segments:
            return (
                False,
                "Error: Invalid format detected in dialogue content. Please check your input.",
            )

    return True, "Input is valid."


def text_matching(text: str) -> list:
    """Parse text into one segment list per speaker block.

    A speaker block is a ``[speaker]`` label plus the text that follows it,
    up to the next ``[speaker]`` label or the end of the string.

    Args:
        text: Input containing ``[speaker]<lang>text...`` blocks.

    Returns:
        A list with one entry per speaker block, each being the result of
        ``extract_language_and_text_updated`` for that block.
    """
    speaker_blocks = re.findall(r"(\[\S+?\])(.+?)(?=\[\S+?\]|$)", text, re.DOTALL)
    return [
        extract_language_and_text_updated(label, spoken)
        for label, spoken in speaker_blocks
    ]


def cut_para(text):
    """Split text into paragraphs, one per non-blank line.

    Args:
        text: The text to split on newline characters.

    Returns:
        A list of the lines of ``text`` with surrounding whitespace removed;
        lines that are empty after stripping are dropped.
    """
    paragraphs = []
    for raw_line in re.split("[\n]", text):  # split by paragraph
        trimmed = raw_line.strip()
        if trimmed:  # skip empty strings
            paragraphs.append(trimmed)
    return paragraphs


def cut_sent(para):
    """Split a paragraph into sentences at terminal punctuation.

    A newline is inserted after each sentence terminator and the result is
    split on newlines. A terminator directly followed by a closing quote
    (``”`` or ``’``) is kept together with that quote.

    Args:
        para: The paragraph text to split.

    Returns:
        A list of sentence strings. Trailing whitespace of the paragraph is
        removed before splitting; an empty paragraph yields ``[""]``.
    """
    # Raw strings keep the regex escapes (\?, \., \…) from being read as
    # invalid Python string escapes, which emit SyntaxWarning on Python 3.12+.
    # Each pattern captures the terminator (group 1) and the character that
    # follows it (group 2); order matters, so they are applied in sequence.
    boundary_patterns = (
        r"([。!;?\?])([^”’])",  # single-character sentence terminators
        r"(\.{6})([^”’])",  # six-dot (ASCII) ellipsis
        r"(\…{2})([^”’])",  # doubled "…" (Chinese) ellipsis
        r"([。!?\?][”’])([^,。!?\?])",  # terminator followed by a closing quote
    )
    for pattern in boundary_patterns:
        para = re.sub(pattern, r"\1\n\2", para)
    # Drop any surplus newline/whitespace left at the end of the paragraph.
    para = para.rstrip()
    return para.split("\n")


if __name__ == "__main__":
    # Sample 1: the first speaker label has no tagged dialogue after it.
    parsing_sample = """
    [说话人1]
    [说话人2]<zh>你好吗?<jp>元気ですか?<jp>こんにちは,世界。<zh>你好吗?
    [说话人3]<zh>谢谢。<jp>どういたしまして。
    """
    text_matching(parsing_sample)

    # Sample 2: parse the text, then validate it and show the verdict.
    validation_sample = """
    [说话人1]<zh>你好,こんにちは!<jp>こんにちは,世界。
    [说话人2]<zh>你好吗?
    """
    text_matching(validation_sample)
    print(validate_text(validation_sample))