Spaces:

StarPigeon
/

ViDove

Sleeping

App Files Files Community

yichenl5 committed on Apr 16, 2023

Commit

28616fd

2 Parent(s): f76455f 090e123

Merge pull request #41 from project-kxkg/Yuhan/merge_sentence_issue

Browse files

Yuhan/merge sentence issue

Former-commit-id: 686d8d3b6a2f67550a86b9861e03d4fdf0550549

Files changed (1) hide show

pipeline.py +62 -1

pipeline.py CHANGED Viewed

@@ -186,8 +186,33 @@ def script_split(script_in, chunk_size = 1000):
 #     print('temp_contents')
 #     print(srt.get_source_only())
 def get_response(model_name, sentence):
     if model_name == "gpt-3.5-turbo" or model_name == "gpt-4":
         response = openai.ChatCompletion.create(
             model=model_name,
@@ -208,7 +233,24 @@ def get_response(model_name, sentence):
 # Translate and save
-def translate(srt, script_arr, range_arr, model_name, video_name, video_link):
     logging.info("start translating...")
     previous_length = 0
     for sentence, range in tqdm(zip(script_arr, range_arr)):
@@ -223,12 +265,31 @@ def translate(srt, script_arr, range_arr, model_name, video_name, video_link):
             flag = False
             try:
                 translate = get_response(model_name, sentence)
             except Exception as e:
                 logging.debug("An error has occurred during translation:",e)
                 print("An error has occurred during translation:",e)
                 print("Retrying... the script will continue after 30 seconds.")
                 time.sleep(30)
                 flag = True
         srt.set_translation(translate, range, model_name, video_name, video_link)

 #     print('temp_contents')
 #     print(srt.get_source_only())
+def check_translation(sentence, translation):
+    """
+    Check whether a translation kept the same number of segments as its source text.
+    Segments are counted as the number of double-newline separators plus one, so a
+    mismatch indicates that the translation merged or split sentences.
+    Args:
+    sentence (str): The source text, with segments separated by a double newline.
+    translation (str): The translated text returned for `sentence`.
+    Returns:
+    bool: True if both texts have the same segment count, False otherwise.
+    """
+    sentence_count = sentence.count('\n\n') + 1
+    translation_count = translation.count('\n\n') + 1
+    if sentence_count != translation_count:
+        # print("sentence length: ", len(sentence), sentence_count)
+        # print("translation length: ",  len(translation), translation_count)
+        return False
+    else:
+        return True
 def get_response(model_name, sentence):
+    """
+    Generates a translated response for a given sentence using a specified OpenAI model.
+    Args:
+    model_name (str): The name of the OpenAI model to be used for translation, either "gpt-3.5-turbo" or "gpt-4".
+    sentence (str): The English sentence related to StarCraft 2 videos that needs to be translated into Chinese.
+    Returns:
+    str: The translated Chinese sentence, maintaining the original format, meaning, and number of lines.
+    """
     if model_name == "gpt-3.5-turbo" or model_name == "gpt-4":
         response = openai.ChatCompletion.create(
             model=model_name,
 # Translate and save
+def translate(srt, script_arr, range_arr, model_name, video_name, video_link, attempts_count = 5):
+    """
+    Translates the given script array into another language using ChatGPT and writes the result to the SRT file.
+    This function takes a script array, a range array, a model name, a video name, and a video link as input. It iterates
+    through the sentences and ranges in the script and range arrays. If the translation check still fails after
+    `attempts_count` retries, the function resolves the merge sentence issue by splitting the text into single sentences
+    and translating each one individually.
+    Args:
+    srt (Subtitle): An instance of the Subtitle class representing the SRT file.
+    script_arr (list): A list of strings representing the original script sentences to be translated.
+    range_arr (list): A list of tuples representing the start and end positions of sentences in the script.
+    model_name (str): The name of the translation model to be used.
+    video_name (str): The name of the video.
+    video_link (str): The link to the video.
+    attempts_count (int): Maximum number of retries when the translated sentence count does not match the source.
+    """
     logging.info("start translating...")
     previous_length = 0
     for sentence, range in tqdm(zip(script_arr, range_arr)):
             flag = False
             try:
                 translate = get_response(model_name, sentence)
+                # detect merge sentence issue and try to solve for five times:
+                while not check_translation(sentence, translate) and attempts_count > 0:
+                    translate = get_response(model_name, sentence)
+                    attempts_count -= 1
+                # if failure still happen, split into smaller tokens
+                if attempts_count == 0:
+                    single_sentences = sentence.split("\n\n")
+                    logging.info("merge sentence issue found for range", range)
+                    translate = ""
+                    for i, single_sentence in enumerate(single_sentences):
+                        if i == len(single_sentences) - 1:
+                            translate += get_response(model_name, single_sentence)
+                        else:
+                            translate += get_response(model_name, single_sentence) + "\n\n"
+                            # print(single_sentence, translate.split("\n\n")[-2])
+                    logging.info("solved by individually translation!")
             except Exception as e:
                 logging.debug("An error has occurred during translation:",e)
                 print("An error has occurred during translation:",e)
                 print("Retrying... the script will continue after 30 seconds.")
                 time.sleep(30)
                 flag = True
         srt.set_translation(translate, range, model_name, video_name, video_link)