yichenl5 committed on
Commit
28616fd
·
2 Parent(s): f76455f 090e123

Merge pull request #41 from project-kxkg/Yuhan/merge_sentence_issue

Browse files

Yuhan/merge sentence issue

Former-commit-id: 686d8d3b6a2f67550a86b9861e03d4fdf0550549

Files changed (1) hide show
  1. pipeline.py +62 -1
pipeline.py CHANGED
@@ -186,8 +186,33 @@ def script_split(script_in, chunk_size = 1000):
186
  # print('temp_contents')
187
  # print(srt.get_source_only())
188
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
  def get_response(model_name, sentence):
 
 
 
 
 
 
 
 
 
 
 
191
  if model_name == "gpt-3.5-turbo" or model_name == "gpt-4":
192
  response = openai.ChatCompletion.create(
193
  model=model_name,
@@ -208,7 +233,24 @@ def get_response(model_name, sentence):
208
 
209
 
210
  # Translate and save
211
- def translate(srt, script_arr, range_arr, model_name, video_name, video_link):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  logging.info("start translating...")
213
  previous_length = 0
214
  for sentence, range in tqdm(zip(script_arr, range_arr)):
@@ -223,12 +265,31 @@ def translate(srt, script_arr, range_arr, model_name, video_name, video_link):
223
  flag = False
224
  try:
225
  translate = get_response(model_name, sentence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  except Exception as e:
227
  logging.debug("An error has occurred during translation:",e)
228
  print("An error has occurred during translation:",e)
229
  print("Retrying... the script will continue after 30 seconds.")
230
  time.sleep(30)
231
  flag = True
 
232
  srt.set_translation(translate, range, model_name, video_name, video_link)
233
 
234
 
 
186
  # print('temp_contents')
187
  # print(srt.get_source_only())
188
 
189
def check_translation(sentence: str, translation: str) -> bool:
    """Detect the "merged sentence" issue in a translation.

    The source text and its translation are expected to contain the same
    number of blocks, where blocks are separated by a blank line (``"\\n\\n"``).
    A differing separator count means blocks were merged or split during
    translation.

    Args:
        sentence: The source text, with blocks separated by ``"\\n\\n"``.
        translation: The translated text to validate against ``sentence``.

    Returns:
        True if both strings contain the same number of ``"\\n\\n"``
        separators (and therefore the same number of blocks), False otherwise.
    """
    # Comparing separator counts is equivalent to comparing block counts
    # (each block count is simply separators + 1).
    return sentence.count("\n\n") == translation.count("\n\n")
202
+
203
 
204
  def get_response(model_name, sentence):
205
+ """
206
+ Generates a translated response for a given sentence using a specified OpenAI model.
207
+
208
+ Args:
209
+ model_name (str): The name of the OpenAI model to be used for translation, either "gpt-3.5-turbo" or "gpt-4".
210
+ sentence (str): The English sentence related to StarCraft 2 videos that needs to be translated into Chinese.
211
+
212
+ Returns:
213
+ str: The translated Chinese sentence, maintaining the original format, meaning, and number of lines.
214
+ """
215
+
216
  if model_name == "gpt-3.5-turbo" or model_name == "gpt-4":
217
  response = openai.ChatCompletion.create(
218
  model=model_name,
 
233
 
234
 
235
  # Translate and save
236
+ def translate(srt, script_arr, range_arr, model_name, video_name, video_link, attempts_count = 5):
237
+ """
238
+ Translates the given script array into another language using the chatgpt and writes to the SRT file.
239
+
240
+ This function takes a script array, a range array, a model name, a video name, and a video link as input. It iterates
241
+ over the sentences and ranges in the script and range arrays. If a translation fails the sentence-count check, it is
242
+ retried while attempts remain; once `attempts_count` reaches zero, each sentence is translated individually and the results are rejoined.
243
+
244
+ Args:
245
+ srt (Subtitle): An instance of the Subtitle class representing the SRT file.
246
+ script_arr (list): A list of strings representing the original script sentences to be translated.
247
+ range_arr (list): A list of tuples representing the start and end positions of sentences in the script.
248
+ model_name (str): The name of the translation model to be used.
249
+ video_name (str): The name of the video.
250
+ video_link (str): The link to the video.
251
+ attempts_count (int): Number of retries allowed when the translation's sentence count does not match the source.
252
+ """
253
+
254
  logging.info("start translating...")
255
  previous_length = 0
256
  for sentence, range in tqdm(zip(script_arr, range_arr)):
 
265
  flag = False
266
  try:
267
  translate = get_response(model_name, sentence)
268
+ # detect merge sentence issue and try to solve for five times:
269
+ while not check_translation(sentence, translate) and attempts_count > 0:
270
+ translate = get_response(model_name, sentence)
271
+ attempts_count -= 1
272
+
273
+ # if failure still happen, split into smaller tokens
274
+ if attempts_count == 0:
275
+ single_sentences = sentence.split("\n\n")
276
+ logging.info("merge sentence issue found for range", range)
277
+ translate = ""
278
+ for i, single_sentence in enumerate(single_sentences):
279
+ if i == len(single_sentences) - 1:
280
+ translate += get_response(model_name, single_sentence)
281
+ else:
282
+ translate += get_response(model_name, single_sentence) + "\n\n"
283
+ # print(single_sentence, translate.split("\n\n")[-2])
284
+ logging.info("solved by individually translation!")
285
+
286
  except Exception as e:
287
  logging.debug("An error has occurred during translation:",e)
288
  print("An error has occurred during translation:",e)
289
  print("Retrying... the script will continue after 30 seconds.")
290
  time.sleep(30)
291
  flag = True
292
+
293
  srt.set_translation(translate, range, model_name, video_name, video_link)
294
 
295