Mahiruoshi committed
Commit ea40339 • 1 parent: 1a0a988

Update tools/sentence.py

tools/sentence.py: +13 -24
tools/sentence.py
CHANGED

@@ -107,6 +107,8 @@ def remove_numeric_annotations(text):
     pattern = r'“\d+”|【\d+】|〔\d+〕'
     # Use the regex to strip these annotations
     cleaned_text = re.sub(pattern, '', text)
+    cleaned_text = re.sub('「', '', cleaned_text)
+    cleaned_text = re.sub('」', '', cleaned_text)
     return cleaned_text
 
 def merge_adjacent_japanese(sentences):
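The two added re.sub calls extend the cleanup from footnote-style numeric annotations to Japanese corner brackets. A minimal sketch of how the patched helper behaves; the function body is taken from the diff, while the sample input is my own, not from the repo:

import re

def remove_numeric_annotations(text):
    # Footnote-style numeric annotations: “1”, 【2】, 〔3〕
    pattern = r'“\d+”|【\d+】|〔\d+〕'
    cleaned_text = re.sub(pattern, '', text)
    # New in this commit: also drop Japanese corner brackets
    cleaned_text = re.sub('「', '', cleaned_text)
    cleaned_text = re.sub('」', '', cleaned_text)
    return cleaned_text

print(remove_numeric_annotations('彼女は「おはよう」と言った【12】'))
# -> 彼女はおはようと言った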
@@ -128,11 +130,14 @@ def extrac(text):
     text = replace_quotes(remove_numeric_annotations(text))  # replace quotes
     text = re.sub("<[^>]*>", "", text)  # strip HTML tags
     # First-pass split on newlines and punctuation, keeping each mark at the end of its sentence
-    preliminary_sentences = re.split(r'(?<=[\n
+    preliminary_sentences = re.split(r'(?<=[\n。;!?\.\?!。])', text)
     final_sentences = []
 
     for piece in preliminary_sentences:
         if is_single_language(piece):
+            if len(piece) > 15:
+                sub_sentences = split_long_sentences(piece)
+                final_sentences.extend(sub_sentences)
             final_sentences.append(piece)
         else:
             sub_sentences = split_mixed_language(piece)
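The new split pattern is a zero-width lookbehind, so re.split breaks after each newline or sentence-final mark without consuming it, which is what keeps the punctuation attached to its sentence. Note this relies on Python 3.7+, where re.split accepts patterns that can match the empty string. A quick sketch with an invented sample:

import re

text = '今天天气不错。出去走走吧!Is that OK?'
parts = re.split(r'(?<=[\n。;!?\.\?!。])', text)
print(parts)
# -> ['今天天气不错。', '出去走走吧!', 'Is that OK?', '']
# A trailing empty string appears when the text ends with one of the marks.

Also worth noting in the added branch: a single-language piece longer than 15 characters is extended in split form and then appended whole as well, since final_sentences.append(piece) is not inside an else, so long pieces end up in final_sentences twice.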
@@ -199,7 +204,7 @@ def extract_text_from_file(inputFile):
 def split_by_punctuation(sentence):
     """Split a sentence on secondary Chinese punctuation marks"""
     # Common secondary Chinese separators: commas, semicolons, etc.
-    parts = re.split(r'([
+    parts = re.split(r'([,,;;…、『』「」])', sentence)
     # Merge each punctuation mark into the preceding words so a lone mark never becomes its own part
     merged_parts = []
     for part in parts:
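Because the new pattern wraps the delimiter class in a capturing group, re.split returns the delimiters as separate list items, which the merge loop then glues back onto the preceding chunk. The loop body falls outside this hunk's context, so the following is a plausible reconstruction under that reading, not the file's exact code:

import re

DELIMS = r'([,,;;…、『』「」])'

def split_by_punctuation(sentence):
    parts = re.split(DELIMS, sentence)
    merged_parts = []
    for part in parts:
        # Glue a lone delimiter onto the chunk before it (reconstructed logic)
        if merged_parts and re.fullmatch(DELIMS, part):
            merged_parts[-1] += part
        elif part:
            merged_parts.append(part)
    return merged_parts

print(split_by_punctuation('早上好,吃了吗,朋友'))
# -> ['早上好,', '吃了吗,', '朋友']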
@@ -211,29 +216,13 @@ def split_by_punctuation(sentence):
 
 def split_long_sentences(sentence, max_length=30):
     """If a Chinese sentence is too long, split it by punctuation first and, when necessary, segment with jieba and split again"""
-    if len(sentence) > max_length and is_chinese(sentence):
-        # First try splitting on secondary punctuation
-        preliminary_parts = split_by_punctuation(sentence)
-        new_sentences = []
-
-        for part in preliminary_parts:
-            # If a part is still too long, segment it with jieba
-            if len(part) > max_length:
-                words = jieba.lcut(part)
-                current_sentence = ""
-                for word in words:
-                    if len(current_sentence) + len(word) > max_length:
-                        new_sentences.append(current_sentence)
-                        current_sentence = word
-                    else:
-                        current_sentence += word
-                if current_sentence:
-                    new_sentences.append(current_sentence)
-            else:
-                new_sentences.append(part)
 
-
-
+    preliminary_parts = split_by_punctuation(sentence)
+    new_sentences = []
+
+    for part in preliminary_parts:
+        new_sentences.append(part)
+    return new_sentences
 
 def extract_and_convert(text):
 
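After this hunk, split_long_sentences no longer consults max_length, is_chinese, or jieba at all; despite the unchanged docstring and signature, it simply passes split_by_punctuation's parts through one by one, so the max_length=30 parameter is now dead. A usage sketch under that reading (the exact output shape depends on the merge loop reconstructed above):

def split_long_sentences(sentence, max_length=30):
    preliminary_parts = split_by_punctuation(sentence)
    new_sentences = []

    for part in preliminary_parts:
        new_sentences.append(part)
    return new_sentences

print(split_long_sentences('这个句子其实很长,但是不会再被jieba切开,只按标点来分'))
# -> ['这个句子其实很长,', '但是不会再被jieba切开,', '只按标点来分']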