Spaces:
Sleeping
Sleeping
worldqwq
committed on
Commit
·
c0481ec
1
Parent(s):
96fb84a
Added log.csv creation
Browse files
Former-commit-id: 736ed1428a98e09c934c9ab1e255e7b054d46548
SRT.py
CHANGED
@@ -3,6 +3,7 @@ from csv import reader
|
|
3 |
from datetime import datetime
|
4 |
import re
|
5 |
import openai
|
|
|
6 |
from collections import deque
|
7 |
|
8 |
class SRT_segment(object):
|
@@ -62,16 +63,6 @@ class SRT_segment(object):
|
|
62 |
def get_bilingual_str(self) -> str:
|
63 |
return f'{self.duration}\n{self.source_text}\n{self.translation}\n\n'
|
64 |
|
65 |
-
# def set_translation(self, trans):
|
66 |
-
# if trans[0] == ',':
|
67 |
-
# trans = trans[1:]
|
68 |
-
# self.translation = trans
|
69 |
-
|
70 |
-
# def set_src_text(self, src_text):
|
71 |
-
# if src_text[0] == ',':
|
72 |
-
# src_text = src_text[1:]
|
73 |
-
# self.source_text = src_text
|
74 |
-
|
75 |
class SRT_script():
|
76 |
def __init__(self, segments) -> None:
|
77 |
self.segments = []
|
@@ -120,7 +111,7 @@ class SRT_script():
|
|
120 |
|
121 |
|
122 |
|
123 |
-
def set_translation(self, translate:str, id_range:tuple, model):
|
124 |
start_seg_id = id_range[0]
|
125 |
end_seg_id = id_range[1]
|
126 |
|
@@ -140,6 +131,7 @@ class SRT_script():
|
|
140 |
lines = translate.split('\n\n')
|
141 |
if len(lines) < (end_seg_id - start_seg_id + 1):
|
142 |
count = 0
|
|
|
143 |
while count<5 and len(lines) != (end_seg_id - start_seg_id + 1):
|
144 |
|
145 |
count += 1
|
@@ -167,6 +159,13 @@ class SRT_script():
|
|
167 |
if len(lines) < (end_seg_id - start_seg_id + 1):
|
168 |
print("Failed Solving unmatched lines, Manually parse needed")
|
169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
print(lines)
|
171 |
#print(id_range)
|
172 |
#for i, seg in enumerate(self.segments[start_seg_id-1:end_seg_id]):
|
@@ -191,12 +190,8 @@ class SRT_script():
|
|
191 |
#print(lines[i])
|
192 |
pass
|
193 |
|
194 |
-
def split_seg(self, seg, threshold):
|
195 |
-
# evenly split seg to 2 parts and add new seg into self.segments
|
196 |
-
if seg.source_text[:2] == ', ':
|
197 |
-
seg.source_text = seg.source_text[2:]
|
198 |
-
if seg.translation[0] == ',':
|
199 |
-
seg.translation = seg.translation[1:]
|
200 |
source_text = seg.source_text
|
201 |
translation = seg.translation
|
202 |
src_commas = [m.start() for m in re.finditer(',', source_text)]
|
@@ -205,10 +200,7 @@ class SRT_script():
|
|
205 |
src_split_idx = src_commas[len(src_commas)//2] if len(src_commas) % 2 == 1 else src_commas[len(src_commas)//2 - 1]
|
206 |
else:
|
207 |
src_space = [m.start() for m in re.finditer(' ', source_text)]
|
208 |
-
if len(src_space)
|
209 |
-
src_split_idx = src_space[len(src_space)//2] if len(src_space) % 2 == 1 else src_space[len(src_space)//2 - 1]
|
210 |
-
else:
|
211 |
-
src_split_idx = 0
|
212 |
|
213 |
if len(trans_commas) != 0:
|
214 |
trans_split_idx = trans_commas[len(trans_commas)//2] if len(trans_commas) % 2 == 1 else trans_commas[len(trans_commas)//2 - 1]
|
@@ -250,9 +242,8 @@ class SRT_script():
|
|
250 |
return result_list
|
251 |
|
252 |
|
253 |
-
def check_len_and_split(self, threshold=
|
254 |
-
#
|
255 |
-
# if sentence length >= threshold, split this segments to two
|
256 |
segments = []
|
257 |
for seg in self.segments:
|
258 |
if len(seg.translation) > threshold:
|
@@ -266,7 +257,7 @@ class SRT_script():
|
|
266 |
pass
|
267 |
|
268 |
def check_len_and_split_range(self, range, threshold=30):
|
269 |
-
# if sentence length >= threshold, split this segments to two
|
270 |
start_seg_id = range[0]
|
271 |
end_seg_id = range[1]
|
272 |
extra_len = 0
|
|
|
3 |
from datetime import datetime
|
4 |
import re
|
5 |
import openai
|
6 |
+
import os
|
7 |
from collections import deque
|
8 |
|
9 |
class SRT_segment(object):
|
|
|
63 |
def get_bilingual_str(self) -> str:
|
64 |
return f'{self.duration}\n{self.source_text}\n{self.translation}\n\n'
|
65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
class SRT_script():
|
67 |
def __init__(self, segments) -> None:
|
68 |
self.segments = []
|
|
|
111 |
|
112 |
|
113 |
|
114 |
+
def set_translation(self, translate:str, id_range:tuple, model,vid_link=None):
|
115 |
start_seg_id = id_range[0]
|
116 |
end_seg_id = id_range[1]
|
117 |
|
|
|
131 |
lines = translate.split('\n\n')
|
132 |
if len(lines) < (end_seg_id - start_seg_id + 1):
|
133 |
count = 0
|
134 |
+
solved = False
|
135 |
while count<5 and len(lines) != (end_seg_id - start_seg_id + 1):
|
136 |
|
137 |
count += 1
|
|
|
159 |
if len(lines) < (end_seg_id - start_seg_id + 1):
|
160 |
print("Failed Solving unmatched lines, Manually parse needed")
|
161 |
|
162 |
+
log_file = "log.csv"
|
163 |
+
log_exist = os.path.exists(log_file)
|
164 |
+
with open(log_file,"a") as log:
|
165 |
+
if not log_exist:
|
166 |
+
log.write("range_of_text,content_range,iterations_solving,solved,file_length,video_link")
|
167 |
+
log.write(range+','+range+','+count+','+solved+','+len(self.segments)+','+vid_link)
|
168 |
+
|
169 |
print(lines)
|
170 |
#print(id_range)
|
171 |
#for i, seg in enumerate(self.segments[start_seg_id-1:end_seg_id]):
|
|
|
190 |
#print(lines[i])
|
191 |
pass
|
192 |
|
193 |
+
def split_seg(self, seg, threshold=500):
|
194 |
+
# TODO: evenly split seg to 2 parts and add new seg into self.segments
|
|
|
|
|
|
|
|
|
195 |
source_text = seg.source_text
|
196 |
translation = seg.translation
|
197 |
src_commas = [m.start() for m in re.finditer(',', source_text)]
|
|
|
200 |
src_split_idx = src_commas[len(src_commas)//2] if len(src_commas) % 2 == 1 else src_commas[len(src_commas)//2 - 1]
|
201 |
else:
|
202 |
src_space = [m.start() for m in re.finditer(' ', source_text)]
|
203 |
+
src_split_idx = src_space[len(src_space)//2] if len(src_space) % 2 == 1 else src_space[len(src_space)//2 - 1]
|
|
|
|
|
|
|
204 |
|
205 |
if len(trans_commas) != 0:
|
206 |
trans_split_idx = trans_commas[len(trans_commas)//2] if len(trans_commas) % 2 == 1 else trans_commas[len(trans_commas)//2 - 1]
|
|
|
242 |
return result_list
|
243 |
|
244 |
|
245 |
+
def check_len_and_split(self, threshold=30000):
|
246 |
+
# TODO: if sentence length >= threshold, split this segments to two
|
|
|
247 |
segments = []
|
248 |
for seg in self.segments:
|
249 |
if len(seg.translation) > threshold:
|
|
|
257 |
pass
|
258 |
|
259 |
def check_len_and_split_range(self, range, threshold=30):
|
260 |
+
# TODO: if sentence length >= threshold, split this segments to two
|
261 |
start_seg_id = range[0]
|
262 |
end_seg_id = range[1]
|
263 |
extra_len = 0
|