worldqwq committed on
Commit
03b08f3
·
1 Parent(s): 4657673

Former-commit-id: cd4e3676891d07a7bbd80e6677b003028b7737bd

Files changed (2) hide show
  1. SRT.py +14 -6
  2. pipeline.py +2 -2
SRT.py CHANGED
@@ -190,8 +190,12 @@ class SRT_script():
190
  #print(lines[i])
191
  pass
192
 
193
- def split_seg(self, seg, threshold=500):
194
- # TODO: evenly split seg to 2 parts and add new seg into self.segments
 
 
 
 
195
  source_text = seg.source_text
196
  translation = seg.translation
197
  src_commas = [m.start() for m in re.finditer(',', source_text)]
@@ -200,7 +204,10 @@ class SRT_script():
200
  src_split_idx = src_commas[len(src_commas)//2] if len(src_commas) % 2 == 1 else src_commas[len(src_commas)//2 - 1]
201
  else:
202
  src_space = [m.start() for m in re.finditer(' ', source_text)]
203
- src_split_idx = src_space[len(src_space)//2] if len(src_space) % 2 == 1 else src_space[len(src_space)//2 - 1]
 
 
 
204
 
205
  if len(trans_commas) != 0:
206
  trans_split_idx = trans_commas[len(trans_commas)//2] if len(trans_commas) % 2 == 1 else trans_commas[len(trans_commas)//2 - 1]
@@ -242,8 +249,9 @@ class SRT_script():
242
  return result_list
243
 
244
 
245
- def check_len_and_split(self, threshold=30000):
246
- # TODO: if sentence length >= threshold, split this segments to two
 
247
  segments = []
248
  for seg in self.segments:
249
  if len(seg.translation) > threshold:
@@ -257,7 +265,7 @@ class SRT_script():
257
  pass
258
 
259
  def check_len_and_split_range(self, range, threshold=30):
260
- # TODO: if sentence length >= threshold, split this segments to two
261
  start_seg_id = range[0]
262
  end_seg_id = range[1]
263
  extra_len = 0
 
190
  #print(lines[i])
191
  pass
192
 
193
+ def split_seg(self, seg, threshold):
194
+ # evenly split seg to 2 parts and add new seg into self.segments
195
+ if seg.source_text[:2] == ', ':
196
+ seg.source_text = seg.source_text[2:]
197
+ if seg.translation[0] == ',':
198
+ seg.translation = seg.translation[1:]
199
  source_text = seg.source_text
200
  translation = seg.translation
201
  src_commas = [m.start() for m in re.finditer(',', source_text)]
 
204
  src_split_idx = src_commas[len(src_commas)//2] if len(src_commas) % 2 == 1 else src_commas[len(src_commas)//2 - 1]
205
  else:
206
  src_space = [m.start() for m in re.finditer(' ', source_text)]
207
+ if len(src_space) > 0:
208
+ src_split_idx = src_space[len(src_space)//2] if len(src_space) % 2 == 1 else src_space[len(src_space)//2 - 1]
209
+ else:
210
+ src_split_idx = 0
211
 
212
  if len(trans_commas) != 0:
213
  trans_split_idx = trans_commas[len(trans_commas)//2] if len(trans_commas) % 2 == 1 else trans_commas[len(trans_commas)//2 - 1]
 
249
  return result_list
250
 
251
 
252
+ def check_len_and_split(self, threshold=30):
253
+ # DEPRECATED
254
+ # if sentence length >= threshold, split this segments to two
255
  segments = []
256
  for seg in self.segments:
257
  if len(seg.translation) > threshold:
 
265
  pass
266
 
267
  def check_len_and_split_range(self, range, threshold=30):
268
+ # if sentence length >= threshold, split this segments to two
269
  start_seg_id = range[0]
270
  end_seg_id = range[1]
271
  extra_len = 0
pipeline.py CHANGED
@@ -261,10 +261,10 @@ for sentence, range in tqdm(zip(script_arr, range_arr)):
261
  time.sleep(30)
262
  flag = True
263
  # add read-time output back and modify the post-processing by using one batch as an unit.
264
- srt.set_translation(translate, range, model_name)
265
 
266
  add_length = srt.check_len_and_split_range(range, threshold)
267
- srt.realtime_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt",range, add_length,segidx,args.link)
268
  # srt.realtime_bilingual_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt",range, add_length,segidx)
269
 
270
  # srt.check_len_and_split()
 
261
  time.sleep(30)
262
  flag = True
263
  # add read-time output back and modify the post-processing by using one batch as an unit.
264
+ srt.set_translation(translate, range, model_name,args.link)
265
 
266
  add_length = srt.check_len_and_split_range(range, threshold)
267
+ srt.realtime_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_zh.srt",range, add_length,segidx)
268
  # srt.realtime_bilingual_write_srt(f"{RESULT_PATH}/{VIDEO_NAME}/{VIDEO_NAME}_bi.srt",range, add_length,segidx)
269
 
270
  # srt.check_len_and_split()