Mahiruoshi committed
Commit ea40339
1 Parent(s): 1a0a988

Update tools/sentence.py

Files changed (1):
  1. tools/sentence.py +13 -24
tools/sentence.py CHANGED
@@ -107,6 +107,8 @@ def remove_numeric_annotations(text):
     pattern = r'“\d+”|【\d+】|〔\d+〕'
     # Use a regular expression to strip these annotations
     cleaned_text = re.sub(pattern, '', text)
+    cleaned_text = re.sub('「', '', cleaned_text)
+    cleaned_text = re.sub('」', '', cleaned_text)
     return cleaned_text
 
 def merge_adjacent_japanese(sentences):
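
The hunk above extends the annotation cleanup to also strip Japanese corner brackets. A minimal sketch of the updated helper in isolation (only re is needed; the rest of the module is assumed):

    import re

    def remove_numeric_annotations(text):
        # Strip numeric annotations such as “1”, 【2】, 〔3〕
        pattern = r'“\d+”|【\d+】|〔\d+〕'
        cleaned_text = re.sub(pattern, '', text)
        # New in this commit: also drop the corner brackets 「 and 」
        cleaned_text = re.sub('「', '', cleaned_text)
        cleaned_text = re.sub('」', '', cleaned_text)
        return cleaned_text

    print(remove_numeric_annotations('彼女は「こんにちは」と言った【3】'))
    # -> 彼女はこんにちはと言った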
@@ -128,11 +130,14 @@ def extrac(text):
     text = replace_quotes(remove_numeric_annotations(text))  # Replace quotation marks
     text = re.sub("<[^>]*>", "", text)  # Remove HTML tags
     # First-pass split on newlines and sentence-final punctuation, keeping each mark at the end of its sentence
-    preliminary_sentences = re.split(r'(?<=[\n。;!?\.\?!])', text)
+    preliminary_sentences = re.split(r'(?<=[\n。;!?\.\?!。])', text)
     final_sentences = []
 
     for piece in preliminary_sentences:
         if is_single_language(piece):
+            if len(piece) > 15:
+                sub_sentences = split_long_sentences(piece)
+                final_sentences.extend(sub_sentences)
             final_sentences.append(piece)
         else:
             sub_sentences = split_mixed_language(piece)
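
The new lookbehind class repeats 。, which is redundant but harmless inside a character class. A quick sketch of the zero-width split that keeps each punctuation mark attached to the end of its piece (Python 3.7+; the example string is illustrative):

    import re

    text = "今天天气很好。我们去公园吧!Let's go."
    pieces = re.split(r'(?<=[\n。;!?\.\?!])', text)
    print(pieces)
    # -> ['今天天气很好。', '我们去公园吧!', "Let's go.", '']
    # A trailing empty string appears when the text ends with a delimiter.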
@@ -199,7 +204,7 @@ def extract_text_from_file(inputFile):
 def split_by_punctuation(sentence):
     """Split a sentence on secondary Chinese punctuation marks"""
     # Common secondary Chinese delimiters: commas, semicolons, etc.
-    parts = re.split(r'([,,;;])', sentence)
+    parts = re.split(r'([,,;;…、『』「」])', sentence)
     # Merge each punctuation mark into the preceding part so it never stands alone
     merged_parts = []
     for part in parts:
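
The hunk shows only the head of split_by_punctuation; the merge loop is truncated in the diff. The sketch below reconstructs a plausible merge step (the loop body after `for part in parts:` is an assumption, not the repository's code) to show the effect of the widened delimiter set:

    import re

    DELIMS = r'[,,;;…、『』「」]'

    def split_by_punctuation(sentence):
        # The capturing group keeps each delimiter as its own list element
        parts = re.split(r'([,,;;…、『』「」])', sentence)
        merged_parts = []
        for part in parts:
            if part and merged_parts and re.fullmatch(DELIMS, part):
                # Assumed behavior: re-attach the delimiter to the preceding chunk
                merged_parts[-1] += part
            elif part:
                merged_parts.append(part)
        return merged_parts

    print(split_by_punctuation('今天很忙,明天见;再说吧'))
    # -> ['今天很忙,', '明天见;', '再说吧']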
@@ -211,29 +216,13 @@ def split_by_punctuation(sentence):
 
 def split_long_sentences(sentence, max_length=30):
     """If a Chinese sentence is too long, split it on punctuation first, then with jieba word segmentation if necessary"""
-    if len(sentence) > max_length and is_chinese(sentence):
-        # First try to split on secondary punctuation
-        preliminary_parts = split_by_punctuation(sentence)
-        new_sentences = []
-
-        for part in preliminary_parts:
-            # If a part is still too long, segment it with jieba
-            if len(part) > max_length:
-                words = jieba.lcut(part)
-                current_sentence = ""
-                for word in words:
-                    if len(current_sentence) + len(word) > max_length:
-                        new_sentences.append(current_sentence)
-                        current_sentence = word
-                    else:
-                        current_sentence += word
-                if current_sentence:
-                    new_sentences.append(current_sentence)
-            else:
-                new_sentences.append(part)
-
-        return new_sentences
-    return [sentence]  # If the sentence is not long or not Chinese, return it as-is
+
+    preliminary_parts = split_by_punctuation(sentence)
+    new_sentences = []
+
+    for part in preliminary_parts:
+        new_sentences.append(part)
+    return new_sentences
 
 def extract_and_convert(text):
 
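
After this commit, split_long_sentences is a thin pass-through over split_by_punctuation: the length check, the is_chinese guard, and the jieba fallback are all gone, though the docstring still describes the old behavior. For reference, the removed branch packed jieba tokens greedily up to max_length characters; a self-contained sketch of that technique (pack_words is a hypothetical name, and unlike the removed code it guards against emitting an empty first chunk):

    import jieba

    def pack_words(part, max_length=30):
        # Greedily pack jieba tokens into chunks of at most max_length characters
        chunks, current = [], ""
        for word in jieba.lcut(part):
            if current and len(current) + len(word) > max_length:
                chunks.append(current)
                current = word
            else:
                current += word
        if current:
            chunks.append(current)
        return chunks

    print(pack_words('这是一个非常长的中文句子需要被切成较短的块以便后续处理', max_length=12))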
 
 