import logging
import regex as re
from utils.data_utils import check_is_none
from utils.classify_language import classify_language, split_alpha_nonalpha


def _expand_abbreviations(text):
    # Insert spaces at letter-case boundaries so abbreviations and camelCase
    # words are split apart, e.g. "TTSModel" -> "T T S Model".
    pattern = r'(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z])'
    return re.sub(pattern, ' ', text)


def _expand_hyphens(text):
    # Replace hyphens between letters with spaces, e.g. "e-mail" -> "e mail".
    pattern = r'(?<=[a-zA-Z])-(?=[a-zA-Z])'
    expanded_text = re.sub(pattern, ' ', text)
    return expanded_text


# Punctuation (ASCII plus fullwidth/CJK forms) used to split text into
# sentences; shared by markup_language and split_languages.
_SPLIT_PATTERN = r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`' \
                 r'\!?\。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」' \
                 r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'


def markup_language(text: str, target_languages: list = None) -> str:
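    """Annotate language runs in ``text`` with bracketed language tags,
    e.g. "你好hello" -> "[ZH]你好[ZH][EN]hello[EN]".
    """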
    sentences = re.split(_SPLIT_PATTERN, text)
pre_lang = ""
p = 0
new_sentences = []
for sentence in sentences:
new_sentences.extend(split_alpha_nonalpha(sentence))
sentences = new_sentences
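    # Walk through the original text, inserting a tag pair whenever the
    # detected language changes; p tracks the current position in text.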
for sentence in sentences:
        if check_is_none(sentence):
            continue
lang = classify_language(sentence, target_languages)
if pre_lang == "":
text = text[:p] + text[p:].replace(sentence, f"[{lang.upper()}]{sentence}", 1)
p += len(f"[{lang.upper()}]")
elif pre_lang != lang:
text = text[:p] + text[p:].replace(sentence, f"[{pre_lang.upper()}][{lang.upper()}]{sentence}", 1)
p += len(f"[{pre_lang.upper()}][{lang.upper()}]")
pre_lang = lang
p += text[p:].index(sentence) + len(sentence)
text += f"[{pre_lang.upper()}]"
return text


def split_languages(text: str, target_languages: list = None, segment_size: int = 50,
                    expand_abbreviations: bool = False, expand_hyphens: bool = False) -> list:
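    """Split ``text`` into ``(segment, lang)`` tuples, one per language run.
    Runs of ``segment_size`` characters or more are broken up further by
    ``sentence_split``; English runs can optionally have abbreviations and
    hyphens expanded first.
    """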
    sentences = re.split(_SPLIT_PATTERN, text)
pre_lang = ""
start = 0
end = 0
sentences_list = []
new_sentences = []
for sentence in sentences:
new_sentences.extend(split_alpha_nonalpha(sentence))
sentences = new_sentences
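    # start and end delimit the current same-language run inside the original text.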
for sentence in sentences:
if check_is_none(sentence):
continue
lang = classify_language(sentence, target_languages)
end += text[end:].index(sentence)
if pre_lang != "" and pre_lang != lang:
_text = text[start:end]
if pre_lang == "en":
if expand_abbreviations:
_text = _expand_abbreviations(_text)
                if expand_hyphens:
_text = _expand_hyphens(_text)
if len(_text) >= segment_size:
for i in sentence_split(_text, segment_size):
sentences_list.append((i, pre_lang))
else:
sentences_list.append((_text, pre_lang))
start = end
end += len(sentence)
pre_lang = lang
_text = text[start:]
if pre_lang == "en":
if expand_abbreviations:
_text = _expand_abbreviations(_text)
        if expand_hyphens:
_text = _expand_hyphens(_text)
if len(_text) >= segment_size:
for i in sentence_split(_text, segment_size):
sentences_list.append((i, pre_lang))
else:
sentences_list.append((_text, pre_lang))
return sentences_list


def sentence_split(text: str, segment_size: int) -> list:
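    """Split ``text`` into punctuation-aligned chunks of roughly
    ``segment_size`` characters; a trailing fragment of four characters or
    fewer is merged into the previous chunk.
    """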
# Split text into paragraphs
paragraphs = re.split(r'\r\n|\n', text)
pattern = r'[!(),—+\-.:;??。,、;:]+'
sentences_list = []
for paragraph in paragraphs:
sentences = re.split(pattern, paragraph)
discarded_chars = re.findall(pattern, paragraph)
count, p = 0, 0
# Iterate over the symbols by which it is split
for i, discarded_char in enumerate(discarded_chars):
count += len(sentences[i]) + len(discarded_char)
if count >= segment_size:
sentences_list.append(paragraph[p:p + count].strip())
p += count
count = 0
# Add the remaining text
if len(paragraph) - p > 0:
if len(paragraph) - p <= 4 and len(sentences_list) > 0:
sentences_list[-1] += paragraph[p:]
else:
sentences_list.append(paragraph[p:])
# Uncomment the following lines if you want to log the sentences
# for sentence in sentences_list:
# logging.debug(sentence)
return sentences_list


def sentence_split_reading(text: str) -> list:
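    """Split ``text`` into ``(sentence, is_quote)`` tuples, separating quoted
    speech (“...”) from narration; a quoted span counts as speech only when
    the character before the closing quote is sentence-final punctuation.
    Consecutive narration parts are merged.
    """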
pattern = r'“[^“”]*”|[^“”]+'
parts = re.findall(pattern, text)
sentences_list = []
for part in parts:
if part:
is_quote = part.startswith("“") and part.endswith("”") and part[-2] in "!!?。,;……?!.,;"
if is_quote:
sentence = part.strip("“”")
sentences_list.append((sentence, is_quote))
else:
if len(sentences_list) > 0 and not sentences_list[-1][1]:
sentences_list[-1] = (sentences_list[-1][0] + part, sentences_list[-1][1])
else:
sentences_list.append((part, is_quote))
return sentences_list


def sentence_split_and_markup(text, segment_size=50, lang="auto", speaker_lang=None):
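    """Split ``text`` with ``sentence_split`` and wrap each segment in
    language tags such as "[ZH]...[ZH]". With lang="auto" the language of
    each segment is detected; with lang="mix" the text is passed through
    unchanged on the assumption that it is already tagged. A segment_size
    of 0 or less disables splitting.
    """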
    # If this speaker only supports one language
if speaker_lang is not None and len(speaker_lang) == 1:
if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != speaker_lang[0]:
            logging.debug(
                f"lang \"{lang}\" is not in speaker_lang {speaker_lang}, automatically setting lang={speaker_lang[0]}")
lang = speaker_lang[0]
sentences_list = []
if lang.upper() != "MIX":
if segment_size <= 0:
sentences_list.append(
markup_language(text,
speaker_lang) if lang.upper() == "AUTO" else f"[{lang.upper()}]{text}[{lang.upper()}]")
else:
for i in sentence_split(text, segment_size):
                if check_is_none(i):
                    continue
sentences_list.append(
markup_language(i,
speaker_lang) if lang.upper() == "AUTO" else f"[{lang.upper()}]{i}[{lang.upper()}]")
else:
sentences_list.append(text)
for i in sentences_list:
logging.debug(i)
return sentences_list


if __name__ == '__main__':
text = """这几天心里颇不宁静。
今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。我悄悄地披了大衫,带上门出去。"""
# print(markup_language(text, target_languages=None))
print(sentence_split(text, segment_size=50))
# print(sentence_split_and_markup(text, segment_size=50, lang="auto", speaker_lang=None))
# text = "你好hello,这是一段用来测试vits自动标注的文本。こんにちは,これは自動ラベリングのテスト用テキストです.Hello, this is a piece of text to test autotagging."
# print(split_languages(text, ["zh", "ja", "en"]))