|
import re
|
|
|
|
|
|
def extract_language_and_text_updated(speaker, dialogue):
    """Parse one speaker's dialogue into language-tagged text segments.

    Args:
        speaker: Speaker tag with one delimiter character on each side;
            the first and last characters are dropped (callers in this
            file pass it as ``"[name]"``).
        dialogue: Text made of ``<lang>text`` segments.

    Returns:
        A list of ``(LANG, text)`` tuples in order of appearance, with the
        language code upper-cased and the text stripped of surrounding
        whitespace, followed by the bare speaker name as a final plain
        string element.
    """
    # A segment is "<tag>" followed by everything up to the next "<".
    segment_re = re.compile(r"<(\S+?)>([^<]+)", re.DOTALL)

    parsed = []
    for hit in segment_re.finditer(dialogue):
        tag, body = hit.groups()
        parsed.append((tag.upper(), body.strip()))

    # The speaker name rides along as the last element of the result.
    parsed.append(speaker[1:-1])
    return parsed
|
|
|
|
|
|
def validate_text(input_text):
    """Check that *input_text* contains speaker-tagged, language-tagged text.

    The input passes when at least one ``[speaker]`` tag is found that is
    followed (optionally after whitespace) by a ``<lang>`` tag and some text
    free of ``<``, ``[`` and ``]``. The check searches for such runs; it does
    not require the whole input to conform.

    Args:
        input_text: The raw markup string to validate.

    Returns:
        A ``(ok, message)`` tuple: ``(True, "Input is valid.")`` on success,
        otherwise ``(False, <error message>)``.
    """
    pattern_speaker = r"(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+?)+)"
    matches = re.findall(pattern_speaker, input_text, re.DOTALL)

    if not matches:
        return (
            False,
            "Error: No valid speaker format detected. Please check your input.",
        )

    for speaker, dialogue in matches:
        language_text_matches = extract_language_and_text_updated(speaker, dialogue)
        # The helper always appends the speaker name as the final element, so
        # the list is never empty and a plain truthiness test could not fail.
        # Require at least one (lang, text) pair in front of that name.
        # This is a safety net: the speaker pattern above already demands a
        # tagged segment.
        if len(language_text_matches) < 2:
            return (
                False,
                "Error: Invalid format detected in dialogue content. Please check your input.",
            )

    return True, "Input is valid."
|
|
|
|
|
|
def text_matching(text: str) -> list:
    """Split *text* into speaker turns and parse each turn's dialogue.

    A turn is a ``[speaker]`` tag (brackets around non-whitespace
    characters) followed by at least one character of dialogue, running up
    to the next ``[speaker]`` tag or the end of the text.

    Args:
        text: Markup containing one or more speaker turns.

    Returns:
        One entry per turn, each being the list produced by
        ``extract_language_and_text_updated`` for that turn.
    """
    turn_re = re.compile(r"(\[\S+?\])(.+?)(?=\[\S+?\]|$)", re.DOTALL)
    return [
        extract_language_and_text_updated(who, said)
        for who, said in turn_re.findall(text)
    ]
|
|
|
|
|
|
def cut_para(text):
    """Split *text* into non-empty paragraphs, one per line.

    Args:
        text: The text to split on newline characters.

    Returns:
        A list of the lines of *text* with surrounding whitespace removed;
        lines that are empty after stripping are dropped.
    """
    # Split on "\n" only; other line-boundary characters stay inside a line.
    trimmed_lines = (line.strip() for line in text.split("\n"))
    return [line for line in trimmed_lines if line]
|
|
|
|
|
|
def cut_sent(para):
    """Split a paragraph into sentences at sentence-ending punctuation.

    A line break is inserted, in this order:

    1. after a single terminator (``。``, ``!``, ``;``, ``?``) unless the
       next character is a closing quote (``”`` or ``’``);
    2. after a six-dot ellipsis, with the same closing-quote exception;
    3. after a double ``…`` ellipsis, with the same exception;
    4. after a terminator plus closing quote, unless the next character is
       one of ``,``, ``。``, ``!``, ``?``.

    Trailing whitespace is then removed and the text is split on the
    inserted (and any pre-existing) newlines.

    Args:
        para: The paragraph text to split.

    Returns:
        A list of sentence strings; an empty input yields ``[""]``.
    """
    # Raw strings are required here: "\?", "\." and "\…" are not valid Python
    # string escapes, so non-raw literals trigger an invalid-escape warning.
    boundary_rules = (
        r"([。!;?\?])([^”’])",
        r"(\.{6})([^”’])",
        r"(\…{2})([^”’])",
        r"([。!?\?][”’])([^,。!?\?])",
    )
    # Each rule captures the sentence end and the character that follows it;
    # the replacement puts a newline between the two.
    for rule in boundary_rules:
        para = re.sub(rule, r"\1\n\2", para)

    return para.rstrip().split("\n")
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke run of the parser and validator on two sample inputs.
    # The first sample includes a speaker tag with no tagged dialogue.
    text = """
[说话人1]
[说话人2]<zh>你好吗?<jp>元気ですか?<jp>こんにちは,世界。<zh>你好吗?
[说话人3]<zh>谢谢。<jp>どういたしまして。
"""
    test_text = """
[说话人1]<zh>你好,こんにちは!<jp>こんにちは,世界。
[说话人2]<zh>你好吗?
"""

    # The parse results are discarded; these calls only exercise the code.
    for sample in (text, test_text):
        text_matching(sample)

    res = validate_text(test_text)
    print(res)
|
|
|