Macrodove committed on
Commit
54a5e67
·
1 Parent(s): 05114c4

translation module implementation

Browse files

Former-commit-id: e6b1d4b73c063aca65f510a23fb4ead6eccea1c1

src/srt_util/srt.py CHANGED
@@ -532,3 +532,27 @@ class SrtScript(object):
532
  f.write(f'{i + idx}\n')
533
  f.write(seg.get_bilingual_str())
534
  pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
532
  f.write(f'{i + idx}\n')
533
  f.write(seg.get_bilingual_str())
534
  pass
535
+
536
def split_script(script_in, chunk_size=1000):
    """
    Split a script into chunks of whole sentences no longer than ``chunk_size``.

    Sentences are the pieces of ``script_in`` separated by a blank line
    (``'\n\n'``). Consecutive sentences are packed into one chunk while the
    accumulated text stays within ``chunk_size`` characters. A single sentence
    longer than ``chunk_size`` becomes a chunk of its own.

    :param script_in: The full script text, sentences separated by ``'\n\n'``.
    :param chunk_size: Soft upper bound on the number of characters per chunk.
    :return: ``(script_arr, range_arr)`` where ``script_arr[i]`` is the text of
        chunk ``i`` and ``range_arr[i]`` is the ``(start, end)`` pair of
        1-based, inclusive sentence indices that chunk covers.
    """
    sentences = script_in.split('\n\n')
    # A script terminated by '\n\n' produces empty trailing pieces; they are
    # not sentences and must not be counted in the last range.
    while sentences and not sentences[-1].strip():
        sentences.pop()

    script_arr = []
    range_arr = []
    chunk = []
    # Length of the chunk as if every sentence were followed by '\n\n'.
    chunk_len = 0
    start = 1
    for index, sentence in enumerate(sentences, start=1):
        # Flush only a non-empty chunk so that an oversized first sentence
        # does not emit an empty chunk with an inverted range.
        if chunk and chunk_len + len(sentence) + 1 > chunk_size:
            script_arr.append('\n\n'.join(chunk).strip())
            range_arr.append((start, index - 1))
            chunk = []
            chunk_len = 0
            start = index
        chunk.append(sentence)
        chunk_len += len(sentence) + 2

    if chunk:
        script_arr.append('\n\n'.join(chunk).strip())
        range_arr.append((start, len(sentences)))

    return script_arr, range_arr
src/task.py CHANGED
@@ -10,6 +10,9 @@ import logging
10
  import subprocess
11
  from src.srt_util.srt import SrtScript
12
  from src.srt_util.srt2ass import srt2ass
 
 
 
13
 
14
  """
15
  Youtube link
@@ -137,7 +140,8 @@ class Task:
137
 
138
  # Module 3: perform srt translation
139
  def translation(self):
140
- time.sleep(5)
 
141
  pass
142
 
143
  # Module 4: perform srt post process steps
 
10
  import subprocess
11
  from src.srt_util.srt import SrtScript
12
  from src.srt_util.srt2ass import srt2ass
13
+ from translation.translation import get_translation, translate
14
+
15
+
16
 
17
  """
18
  Youtube link
 
140
 
141
  # Module 3: perform srt translation
142
  def translation(self):
143
+ logging.info("---------------------Start Translation--------------------")
144
+ get_translation(self.srt,self.model, self.video_name, self.video_link)
145
  pass
146
 
147
  # Module 4: perform srt post process steps
src/translation/LLM_task.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import openai
3
+
4
+
5
def LLM_task(model_name, input, task, temp=0.15):
    """
    Run one chat completion with the requested LLM and return its text.

    ``task`` is sent as the system message and ``input`` as the user message.

    :param model_name: The name of the model to use; only "gpt-3.5-turbo" and
        "gpt-4" are handled.
    :param input: Text to be processed (sent as the user message).
    :param task: Prompt (sent as the system message).
    :param temp: Model temperature.
    :return: The content of the first returned choice, stripped of
        surrounding whitespace.
    :raises NotImplementedError: If ``model_name`` is not a supported model.
    """
    if model_name in ("gpt-3.5-turbo", "gpt-4"):
        response = openai.ChatCompletion.create(
            model=model_name,
            messages=[
                {"role": "system", "content": task},
                {"role": "user", "content": input},
            ],
            temperature=temp,
        )
        return response['choices'][0]['message']['content'].strip()

    # Other LLMs are not implemented; name the model so the failure is
    # diagnosable from the traceback alone.
    raise NotImplementedError(f"LLM model '{model_name}' is not supported")
src/translation/translation.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from os import getenv
2
+ import logging
3
+ from time import sleep
4
+ from tqdm import tqdm
5
+ from src.srt_util.srt import split_script
6
+ from LLM_task import LLM_task
7
+
8
def get_translation(srt, model, video_name, video_link):
    """
    Split the script into chunks and translate every chunk.

    :param srt: Object passed both to ``split_script`` and to ``translate``.
    :param model: The name of the translation model to be used.
    :param video_name: The name of the video.
    :param video_link: The link to the video.
    """
    # NOTE(review): split_script calls .split('\n\n') on its argument, while
    # translate() calls srt.set_translation(...) on the same object - confirm
    # that srt supports both, or that the source text should be extracted first.
    chunks, ranges = split_script(srt)
    translate(srt, chunks, ranges, model, video_name, video_link)
12
+
13
def check_translation(sentence, translation):
    """
    Detect the merged-sentence issue in a translation returned by the model.

    Both texts are treated as sentences separated by a blank line ('\n\n').

    :param sentence: The source text that was sent for translation.
    :param translation: The translated text that came back.
    :return: True when both texts contain the same number of sentences,
        False otherwise.
    """
    separator = '\n\n'
    source_total = sentence.count(separator) + 1
    translated_total = translation.count(separator) + 1
    return source_total == translated_total
26
+
27
+
28
def _translate_chunk(model_name, sentence, task, temp, attempts_count, range_):
    """Translate one chunk, retrying and finally falling back to per-sentence translation on a sentence-count mismatch."""
    # One initial request plus up to `attempts_count` retries, counted per
    # chunk so that a difficult chunk does not use up the retries of later ones.
    for _ in range(max(attempts_count, 0) + 1):
        translation = LLM_task(model_name, sentence, task, temp)
        if check_translation(sentence, translation):
            return translation

    # The model keeps merging sentences: translate every sentence on its own
    # and re-join them with the blank-line separator.
    logging.info("merge sentence issue found for range %s", range_)
    translated_parts = [
        LLM_task(model_name, single_sentence, task, temp)
        for single_sentence in sentence.split("\n\n")
    ]
    logging.info("solved by individually translation!")
    return "\n\n".join(translated_parts)


def translate(srt, script_arr, range_arr, model_name, video_name, video_link, attempts_count=5, task=None, temp=0.15):
    """
    Translate the given script chunks with an LLM and store the results on ``srt``.

    Every chunk of ``script_arr`` is sent to ``LLM_task`` together with its
    range from ``range_arr``. A reply is accepted when ``check_translation``
    reports the same number of sentences as the source chunk. Otherwise the
    request is repeated up to ``attempts_count`` more times; if the mismatch
    persists, the chunk is split on blank lines and each sentence is translated
    individually. If a request raises an error, the error is reported and the
    chunk is retried after a 30 second pause, without a retry limit.

    :param srt: Subtitle object; ``srt.set_translation`` is called once per chunk.
    :param script_arr: A list of strings representing the original script chunks to be translated.
    :param range_arr: A list of tuples representing the start and end positions of sentences in the script.
    :param model_name: The name of the translation model to be used.
    :param video_name: The name of the video.
    :param video_link: The link to the video.
    :param attempts_count: Number of retries per chunk for unmatched sentence counts.
    :param task: Prompt; a default translation prompt is used when None.
    :param temp: Model temperature.
    :raises Exception: If ``script_arr`` or ``range_arr`` is None.
    :raises NotImplementedError: If ``LLM_task`` does not support ``model_name``.
    """
    if script_arr is None or range_arr is None:
        raise Exception("Warning! No Input have passed to LLM!")
    if task is None:
        task = "你是一个翻译助理,你的任务是翻译星际争霸视频,你会被提供一个按行分割的英文段落,你需要在保证句意和行数的情况下输出翻译后的文本。"

    for sentence, range_ in tqdm(zip(script_arr, range_arr)):
        print(f"now translating sentences {range_}")
        while True:
            try:
                translation = _translate_chunk(model_name, sentence, task, temp, attempts_count, range_)
                break
            except NotImplementedError:
                # An unsupported model can never succeed; retrying would loop forever.
                raise
            except Exception as e:
                logging.debug("An error has occurred during translation: %s", e)
                print("An error has occurred during translation:", e)
                print("Retrying... the script will continue after 30 seconds.")
                sleep(30)

        srt.set_translation(translation, range_, model_name, video_name, video_link)