Spaces:
Sleeping
Sleeping
Merge pull request #41 from project-kxkg/Yuhan/merge_sentence_issue
Browse files
Yuhan/merge sentence issue
Former-commit-id: 686d8d3b6a2f67550a86b9861e03d4fdf0550549
- pipeline.py +62 -1
pipeline.py
CHANGED
@@ -186,8 +186,33 @@ def script_split(script_in, chunk_size = 1000):
|
|
186 |
# print('temp_contents')
|
187 |
# print(srt.get_source_only())
|
188 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
|
190 |
def get_response(model_name, sentence):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
if model_name == "gpt-3.5-turbo" or model_name == "gpt-4":
|
192 |
response = openai.ChatCompletion.create(
|
193 |
model=model_name,
|
@@ -208,7 +233,24 @@ def get_response(model_name, sentence):
|
|
208 |
|
209 |
|
210 |
# Translate and save
|
211 |
-
def translate(srt, script_arr, range_arr, model_name, video_name, video_link):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
logging.info("start translating...")
|
213 |
previous_length = 0
|
214 |
for sentence, range in tqdm(zip(script_arr, range_arr)):
|
@@ -223,12 +265,31 @@ def translate(srt, script_arr, range_arr, model_name, video_name, video_link):
|
|
223 |
flag = False
|
224 |
try:
|
225 |
translate = get_response(model_name, sentence)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
226 |
except Exception as e:
|
227 |
logging.debug("An error has occurred during translation:",e)
|
228 |
print("An error has occurred during translation:",e)
|
229 |
print("Retrying... the script will continue after 30 seconds.")
|
230 |
time.sleep(30)
|
231 |
flag = True
|
|
|
232 |
srt.set_translation(translate, range, model_name, video_name, video_link)
|
233 |
|
234 |
|
|
|
186 |
# print('temp_contents')
|
187 |
# print(srt.get_source_only())
|
188 |
|
189 |
+
def check_translation(sentence, translation):
    """
    Check whether a translation kept the same number of segments as its source.

    Segments are delimited by a blank line, i.e. two consecutive newline
    characters. A differing segment count indicates the "merge sentence"
    issue, where the translated text no longer lines up one-to-one with the
    source segments.

    Args:
        sentence (str): The source text, with segments separated by blank lines.
        translation (str): The translated text to compare against the source.

    Returns:
        bool: True when both texts contain the same number of segments,
        False otherwise.
    """
    # Segment count is "separator count + 1" on both sides, so comparing the
    # separator counts directly gives the same answer.
    return sentence.count('\n\n') == translation.count('\n\n')
|
202 |
+
|
203 |
|
204 |
def get_response(model_name, sentence):
|
205 |
+
"""
|
206 |
+
Generates a translated response for a given sentence using a specified OpenAI model.
|
207 |
+
|
208 |
+
Args:
|
209 |
+
model_name (str): The name of the OpenAI model to be used for translation, either "gpt-3.5-turbo" or "gpt-4".
|
210 |
+
sentence (str): The English sentence related to StarCraft 2 videos that needs to be translated into Chinese.
|
211 |
+
|
212 |
+
Returns:
|
213 |
+
str: The translated Chinese sentence, maintaining the original format, meaning, and number of lines.
|
214 |
+
"""
|
215 |
+
|
216 |
if model_name == "gpt-3.5-turbo" or model_name == "gpt-4":
|
217 |
response = openai.ChatCompletion.create(
|
218 |
model=model_name,
|
|
|
233 |
|
234 |
|
235 |
# Translate and save
|
236 |
+
def translate(srt, script_arr, range_arr, model_name, video_name, video_link, attempts_count = 5):
|
237 |
+
"""
|
238 |
+
Translates the given script array into another language using the chatgpt and writes to the SRT file.
|
239 |
+
|
240 |
+
This function takes a script array, a range array, a model name, a video name, and a video link as input. It iterates
|
241 |
+
through sentences and range in the script and range arrays. If the translation check fails for five times, the function
|
242 |
+
will attempt to resolve merge sentence issues and split the sentence into smaller tokens for a better translation.
|
243 |
+
|
244 |
+
Args:
|
245 |
+
srt (Subtitle): An instance of the Subtitle class representing the SRT file.
|
246 |
+
script_arr (list): A list of strings representing the original script sentences to be translated.
|
247 |
+
range_arr (list): A list of tuples representing the start and end positions of sentences in the script.
|
248 |
+
model_name (str): The name of the translation model to be used.
|
249 |
+
video_name (str): The name of the video.
|
250 |
+
video_link (str): The link to the video.
|
251 |
+
attempts_count (int): Number of retry attempts allowed when the translated sentence count does not match the source.
|
252 |
+
"""
|
253 |
+
|
254 |
logging.info("start translating...")
|
255 |
previous_length = 0
|
256 |
for sentence, range in tqdm(zip(script_arr, range_arr)):
|
|
|
265 |
flag = False
|
266 |
try:
|
267 |
translate = get_response(model_name, sentence)
|
268 |
+
# detect merge sentence issue and try to solve for five times:
|
269 |
+
while not check_translation(sentence, translate) and attempts_count > 0:
|
270 |
+
translate = get_response(model_name, sentence)
|
271 |
+
attempts_count -= 1
|
272 |
+
|
273 |
+
# if failure still happen, split into smaller tokens
|
274 |
+
if attempts_count == 0:
|
275 |
+
single_sentences = sentence.split("\n\n")
|
276 |
+
logging.info("merge sentence issue found for range", range)
|
277 |
+
translate = ""
|
278 |
+
for i, single_sentence in enumerate(single_sentences):
|
279 |
+
if i == len(single_sentences) - 1:
|
280 |
+
translate += get_response(model_name, single_sentence)
|
281 |
+
else:
|
282 |
+
translate += get_response(model_name, single_sentence) + "\n\n"
|
283 |
+
# print(single_sentence, translate.split("\n\n")[-2])
|
284 |
+
logging.info("solved by individually translation!")
|
285 |
+
|
286 |
except Exception as e:
|
287 |
logging.debug("An error has occurred during translation:",e)
|
288 |
print("An error has occurred during translation:",e)
|
289 |
print("Retrying... the script will continue after 30 seconds.")
|
290 |
time.sleep(30)
|
291 |
flag = True
|
292 |
+
|
293 |
srt.set_translation(translate, range, model_name, video_name, video_link)
|
294 |
|
295 |
|