Spaces:

StarPigeon
/

ViDove

Sleeping

@@ -11,42 +11,42 @@ from tqdm import tqdm
 # punctuation dictionary for supported languages
 punctuation_dict = {
     "EN": {
-        "punc_str": ". , ? ! : ; - ( ) [ ] { } ' \"",
         "comma": ", ",
         "sentence_end": [".", "!", "?", ";"]
     },
     "ES": {
-        "punc_str": ". , ? ! : ; - ( ) [ ] { } ' \" ¡ ¿",
         "comma": ", ",
         "sentence_end": [".", "!", "?", ";", "¡", "¿"]
     },
     "FR": {
-        "punc_str": ". , ? ! : ; - ( ) [ ] { } ' \" « » —",
         "comma": ", ",
         "sentence_end": [".", "!", "?", ";"]
     },
     "DE": {
-        "punc_str": ". , ? ! : ; - ( ) [ ] { } ' \" „ “ –",
         "comma": ", ",
         "sentence_end": [".", "!", "?", ";"]
     },
     "RU": {
-        "punc_str": ". , ? ! : ; - ( ) [ ] { } ' \" « » —",
         "comma": ", ",
         "sentence_end": [".", "!", "?", ";"]
     },
     "ZH": {
-        "punc_str": "。 ， ？ ！ ： ； — （ ） &#8203;``【oaicite:1】``&#8203; 《 》 “ ”",
         "comma": "，",
         "sentence_end": ["。", "！", "？"]
     },
     "JA": {
-        "punc_str": "。 、 ？ ！ ： ； ー （ ） &#8203;``【oaicite:0】``&#8203; 「 」 『 』",
         "comma": "、",
         "sentence_end": ["。", "！", "？"]
     },
     "AR": {
-        "punc_str": ". , ? ! : ; - ( ) [ ] { } ، ؛ ؟ « »",
         "comma": "، ",
         "sentence_end": [".", "!", "?", ";", "؟"]
     },
@@ -100,6 +100,7 @@ class SrtSegment(object):
                 self.translation = ""
             else:
                 self.translation = args[0][3]
     def merge_seg(self, seg):
         """
@@ -132,9 +133,11 @@ class SrtSegment(object):
         remove punctuations in translation text
         :return: None
         """
-        punc = punctuation_dict[self.tgt_lang]["punc_str"]
-        translator = str.maketrans(punc, ' ' * len(punc))
-        self.translation = self.translation.translate(translator)
     def __str__(self) -> str:
         return f'{self.duration}\n{self.source_text}\n\n'
@@ -233,19 +236,20 @@ class SrtScript(object):
             src_text += '\n\n'
         def inner_func(target, input_str):
-            # TODO: accommodate different languages
             response = openai.ChatCompletion.create(
                 model="gpt-4",
                 messages=[
                     {"role": "system",
-                     "content": "你的任务是按照要求合并或拆分句子到指定行数，你需要尽可能保证句意，但必要时可以将一句话分为两行输出"},
-                    {"role": "system", "content": "注意：你只需要输出处理过的中文句子，如果你要输出序号，请使用冒号隔开"},
-                    {"role": "user", "content": '请将下面的句子拆分或组合为{}句:\n{}'.format(target, input_str)}
                 ],
                 temperature=0.15
             )
             return response['choices'][0]['message']['content'].strip()
         lines = translate.split('\n\n')
         if len(lines) < (end_seg_id - start_seg_id + 1):
             count = 0
@@ -253,6 +257,7 @@ class SrtScript(object):
             while count < 5 and len(lines) != (end_seg_id - start_seg_id + 1):
                 count += 1
                 print("Solving Unmatched Lines|iteration {}".format(count))
                 flag = True
                 while flag:
@@ -262,13 +267,17 @@ class SrtScript(object):
                     except Exception as e:
                         print("An error has occurred during solving unmatched lines:", e)
                         print("Retrying...")
                         flag = True
                 lines = translate.split('\n')
             if len(lines) < (end_seg_id - start_seg_id + 1):
                 solved = False
                 print("Failed Solving unmatched lines, Manually parse needed")
             if not os.path.exists("./logs"):
                 os.mkdir("./logs")
             if video_link:
@@ -287,7 +296,7 @@ class SrtScript(object):
                         log.write("range_of_text,iterations_solving,solved,file_length,video_name" + "\n")
                     log.write(str(id_range) + ',' + str(count) + ',' + str(solved) + ',' + str(
                         len(self.segments)) + ',' + video_name + "\n")
-            print(lines)
         for i, seg in enumerate(self.segments[start_seg_id - 1:end_seg_id]):
             # naive way to deal with the merge translation problem
@@ -337,19 +346,13 @@ class SrtScript(object):
             trans_split_idx = trans_commas[len(trans_commas) // 2] if len(trans_commas) % 2 == 1 else trans_commas[
                 len(trans_commas) // 2 - 1]
         else:
-            # split the text based on spaces
-            trans_space = [m.start() for m in re.finditer(' ', translation)]
-            if len(trans_space) > 0:
-                trans_split_idx = trans_space[len(trans_space) // 2] if len(trans_space) % 2 == 1 else trans_space[
-                    len(trans_space) // 2 - 1]
-            else:
-                trans_split_idx = len(translation) // 2
-                # to avoid splitting an English word
-                for i in range(trans_split_idx, len(translation)):
-                    if not translation[i].encode('utf-8').isalpha():
-                        trans_split_idx = i
-                        break
         # split the time duration based on text length
         time_split_ratio = trans_split_idx / (len(seg.translation) - 1)
@@ -405,8 +408,6 @@ class SrtScript(object):
         self.segments = segments
         logging.info("check_len_and_split finished")
-        pass
     def check_len_and_split_range(self, range, text_threshold=30, time_threshold=1.0):
         # DEPRECATED
         # if sentence length >= text_threshold, split this segment into two

 # punctuation dictionary for supported languages
 punctuation_dict = {
     "EN": {
+        "punc_str": ". , ? ! : ; - ( ) [ ] { }",
         "comma": ", ",
         "sentence_end": [".", "!", "?", ";"]
     },
     "ES": {
+        "punc_str": ". , ? ! : ; - ( ) [ ] { } ¡ ¿",
         "comma": ", ",
         "sentence_end": [".", "!", "?", ";", "¡", "¿"]
     },
     "FR": {
+        "punc_str": ".,?!:;«»—",
         "comma": ", ",
         "sentence_end": [".", "!", "?", ";"]
     },
     "DE": {
+        "punc_str": ".,?!:;„“–",
         "comma": ", ",
         "sentence_end": [".", "!", "?", ";"]
     },
     "RU": {
+        "punc_str": ".,?!:;-«»—",
         "comma": ", ",
         "sentence_end": [".", "!", "?", ";"]
     },
     "ZH": {
+        "punc_str": "。，？！：；（）",
         "comma": "，",
         "sentence_end": ["。", "！", "？"]
     },
     "JA": {
+        "punc_str": "。、？！：；（）",
         "comma": "、",
         "sentence_end": ["。", "！", "？"]
     },
     "AR": {
+        "punc_str": ".,?!:;-()[]،؛ ؟ «»",
         "comma": "، ",
         "sentence_end": [".", "!", "?", ";", "؟"]
     },
                 self.translation = ""
             else:
                 self.translation = args[0][3]
     def merge_seg(self, seg):
         """
         remove punctuations in translation text
         :return: None
         """
+        punc_str = punctuation_dict[self.tgt_lang]["punc_str"]
+        for punc in punc_str:
+            self.translation = self.translation.replace(punc, ' ')
+        # translator = str.maketrans(punc, ' ' * len(punc))
+        # self.translation = self.translation.translate(translator)
     def __str__(self) -> str:
         return f'{self.duration}\n{self.source_text}\n\n'
             src_text += '\n\n'
         def inner_func(target, input_str):
+            # handling merge sentences issue.
             response = openai.ChatCompletion.create(
                 model="gpt-4",
                 messages=[
                     {"role": "system",
+                     "content": "Your task is to merge or split sentences into a specified number of lines as required. You need to ensure the meaning of the sentences as much as possible, but when necessary, a sentence can be divided into two lines for output"},
+                    {"role": "system", "content": "Note: You only need to output the processed {} sentences. If you need to output a sequence number, please separate it with a colon.".format(self.tgt_lang)},
+                    {"role": "user", "content": 'Please split or combine the following sentences into {} sentences:\n{}'.format(target, input_str)}
                 ],
                 temperature=0.15
             )
             return response['choices'][0]['message']['content'].strip()
+        # handling merge sentences issue.
         lines = translate.split('\n\n')
         if len(lines) < (end_seg_id - start_seg_id + 1):
             count = 0
             while count < 5 and len(lines) != (end_seg_id - start_seg_id + 1):
                 count += 1
                 print("Solving Unmatched Lines|iteration {}".format(count))
+                logging.error("Solving Unmatched Lines|iteration {}".format(count))
                 flag = True
                 while flag:
                     except Exception as e:
                         print("An error has occurred during solving unmatched lines:", e)
                         print("Retrying...")
+                        logging.error("An error has occurred during solving unmatched lines:", e)
+                        logging.error("Retrying...")
                         flag = True
                 lines = translate.split('\n')
             if len(lines) < (end_seg_id - start_seg_id + 1):
                 solved = False
                 print("Failed Solving unmatched lines, Manually parse needed")
+                logging.error("Failed Solving unmatched lines, Manually parse needed")
+            # FIXME: put the error log in our log file
             if not os.path.exists("./logs"):
                 os.mkdir("./logs")
             if video_link:
                         log.write("range_of_text,iterations_solving,solved,file_length,video_name" + "\n")
                     log.write(str(id_range) + ',' + str(count) + ',' + str(solved) + ',' + str(
                         len(self.segments)) + ',' + video_name + "\n")
+            # print(lines)
         for i, seg in enumerate(self.segments[start_seg_id - 1:end_seg_id]):
             # naive way to deal with the merge translation problem
             trans_split_idx = trans_commas[len(trans_commas) // 2] if len(trans_commas) % 2 == 1 else trans_commas[
                 len(trans_commas) // 2 - 1]
         else:
+            trans_split_idx = len(translation) // 2
+            # to avoid splitting an English word
+            for i in range(trans_split_idx, len(translation)):
+                if not translation[i].encode('utf-8').isalpha():
+                    trans_split_idx = i
+                    break
         # split the time duration based on text length
         time_split_ratio = trans_split_idx / (len(seg.translation) - 1)
         self.segments = segments
         logging.info("check_len_and_split finished")
     def check_len_and_split_range(self, range, text_threshold=30, time_threshold=1.0):
         # DEPRECATED
         # if sentence length >= text_threshold, split this segment into two

tests/test_remove_punc.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import sys
+sys.path.append('./src')
+from srt_util.srt import SrtScript, SrtSegment
+zh_test1 = "再次，如果你对一些福利感兴趣，你也可以。"
+zh_en_test1 = "GG。Classic在我今年解说的最奇葩的系列赛中获得了胜利。"
+def form_srt_class(src_lang, tgt_lang, source_text="", translation="", duration="00:00:00,740 --> 00:00:08,779"):
+    segment = [0, duration, source_text, translation, ""]
+    return SrtScript(src_lang, tgt_lang, [segment])
+def test_zh():
+    srt = form_srt_class(src_lang="EN", tgt_lang="ZH", translation=zh_test1)
+    srt.remove_trans_punctuation()
+    assert srt.segments[0].translation == "再次 如果你对一些福利感兴趣 你也可以 "
+def test_zh_en():
+    srt = form_srt_class(src_lang="EN", tgt_lang="ZH", translation=zh_en_test1)
+    srt.remove_trans_punctuation()
+    assert srt.segments[0].translation == "GG Classic在我今年解说的最奇葩的系列赛中获得了胜利 "