diff --git "a/app_rvc.py" "b/app_rvc.py"
--- "a/app_rvc.py"
+++ "b/app_rvc.py"
@@ -1,2924 +1,2864 @@
-import gradio as gr
-import os
-os.system("pip install -q piper-tts==1.2.0")
-os.system("pip install -q -r requirements_xtts.txt")
-os.system("pip install -q TTS==0.21.1 --no-deps")
-import spaces
-import torch
-if os.environ.get("ZERO_GPU") != "TRUE" and torch.cuda.is_available():
- # onnxruntime GPU
- os.system("pip install ort-nightly-gpu --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ort-cuda-12-nightly/pypi/simple/")
-import librosa
-from soni_translate.logging_setup import (
- logger,
- set_logging_level,
- configure_logging_libs,
-); configure_logging_libs() # noqa
-import whisperx
-from soni_translate.audio_segments import create_translated_audio
-from soni_translate.text_to_speech import (
- audio_segmentation_to_voice,
- edge_tts_voices_list,
- coqui_xtts_voices_list,
- piper_tts_voices_list,
- create_wav_file_vc,
- accelerate_segments,
-)
-from soni_translate.translate_segments import (
- translate_text,
- TRANSLATION_PROCESS_OPTIONS,
- DOCS_TRANSLATION_PROCESS_OPTIONS
-)
-from soni_translate.preprocessor import (
- audio_video_preprocessor,
- audio_preprocessor,
-)
-from soni_translate.postprocessor import (
- OUTPUT_TYPE_OPTIONS,
- DOCS_OUTPUT_TYPE_OPTIONS,
- sound_separate,
- get_no_ext_filename,
- media_out,
- get_subtitle_speaker,
-)
-from soni_translate.language_configuration import (
- LANGUAGES,
- UNIDIRECTIONAL_L_LIST,
- LANGUAGES_LIST,
- BARK_VOICES_LIST,
- VITS_VOICES_LIST,
- OPENAI_TTS_MODELS,
-)
-from soni_translate.utils import (
- remove_files,
- download_list,
- upload_model_list,
- download_manager,
- run_command,
- is_audio_file,
- is_subtitle_file,
- copy_files,
- get_valid_files,
- get_link_list,
- remove_directory_contents,
-)
-from soni_translate.mdx_net import (
- UVR_MODELS,
- MDX_DOWNLOAD_LINK,
- mdxnet_models_dir,
-)
-from soni_translate.speech_segmentation import (
- ASR_MODEL_OPTIONS,
- COMPUTE_TYPE_GPU,
- COMPUTE_TYPE_CPU,
- find_whisper_models,
- transcribe_speech,
- align_speech,
- diarize_speech,
- diarization_models,
-)
-from soni_translate.text_multiformat_processor import (
- BORDER_COLORS,
- srt_file_to_segments,
- document_preprocessor,
- determine_chunk_size,
- plain_text_to_segments,
- segments_to_plain_text,
- process_subtitles,
- linguistic_level_segments,
- break_aling_segments,
- doc_to_txtximg_pages,
- page_data_to_segments,
- update_page_data,
- fix_timestamps_docs,
- create_video_from_images,
- merge_video_and_audio,
-)
-from soni_translate.languages_gui import language_data, news
-import copy
-import logging
-import json
-from pydub import AudioSegment
-from voice_main import ClassVoices
-import argparse
-import time
-import hashlib
-import sys
-
-directories = [
- "downloads",
- "logs",
- "weights",
- "clean_song_output",
- "_XTTS_",
- f"audio2{os.sep}audio",
- "audio",
- "outputs",
-]
-for directory in directories:
-    os.makedirs(directory, exist_ok=True)
-
-
-class TTS_Info:
- def __init__(self, piper_enabled, xtts_enabled):
- self.list_edge = edge_tts_voices_list()
- self.list_bark = list(BARK_VOICES_LIST.keys())
- self.list_vits = list(VITS_VOICES_LIST.keys())
- self.list_openai_tts = OPENAI_TTS_MODELS
- self.piper_enabled = piper_enabled
- self.list_vits_onnx = (
- piper_tts_voices_list() if self.piper_enabled else []
- )
- self.xtts_enabled = xtts_enabled
-
- def tts_list(self):
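-        # XTTS voices are listed first; all other engines are sorted together.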
- self.list_coqui_xtts = (
- coqui_xtts_voices_list() if self.xtts_enabled else []
- )
- list_tts = self.list_coqui_xtts + sorted(
- self.list_edge
- + (self.list_bark if os.environ.get("ZERO_GPU") != "TRUE" else [])
- + self.list_vits
- + self.list_openai_tts
- + self.list_vits_onnx
- )
- return list_tts
-
-
-def prog_disp(msg, percent, is_gui, progress=None):
- logger.info(msg)
- if is_gui:
- progress(percent, desc=msg)
-
-
-def warn_disp(wrn_lang, is_gui):
- logger.warning(wrn_lang)
- if is_gui:
- gr.Warning(wrn_lang)
-
-
-class SoniTrCache:
- def __init__(self):
- self.cache = {
- 'media': [[]],
- 'refine_vocals': [],
- 'transcript_align': [],
- 'break_align': [],
- 'diarize': [],
- 'translate': [],
- 'subs_and_edit': [],
- 'tts': [],
- 'acc_and_vc': [],
- 'mix_aud': [],
- 'output': []
- }
-
- self.cache_data = {
- 'media': [],
- 'refine_vocals': [],
- 'transcript_align': [],
- 'break_align': [],
- 'diarize': [],
- 'translate': [],
- 'subs_and_edit': [],
- 'tts': [],
- 'acc_and_vc': [],
- 'mix_aud': [],
- 'output': []
- }
-
- self.cache_keys = list(self.cache.keys())
- self.first_task = self.cache_keys[0]
- self.last_task = self.cache_keys[-1]
-
- self.pre_step = None
- self.pre_params = []
-
- def set_variable(self, variable_name, value):
- setattr(self, variable_name, value)
-
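-    # A step "hits" the cache when its parameter list matches the previous
-    # run; on a miss, the step and every later step are invalidated.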
- def task_in_cache(self, step: str, params: list, previous_step_data: dict):
-
- self.pre_step_cache = None
-
- if step == self.first_task:
- self.pre_step = None
-
- if self.pre_step:
- self.cache[self.pre_step] = self.pre_params
-
- # Fill data in cache
- self.cache_data[self.pre_step] = copy.deepcopy(previous_step_data)
-
- self.pre_params = params
- # logger.debug(f"Step: {str(step)}, Cache params: {str(self.cache)}")
- if params == self.cache[step]:
- logger.debug(f"In cache: {str(step)}")
-
- # Set the var needed for next step
- # Recovery from cache_data the current step
- for key, value in self.cache_data[step].items():
- self.set_variable(key, copy.deepcopy(value))
- logger.debug(
-                    f"Cache load: {str(key)}"
- )
-
- self.pre_step = step
- return True
-
- else:
- logger.debug(f"Flush next and caching {str(step)}")
- selected_index = self.cache_keys.index(step)
-
- for idx, key in enumerate(self.cache.keys()):
- if idx >= selected_index:
- self.cache[key] = []
- self.cache_data[key] = {}
-
- # The last is now previous
- self.pre_step = step
- return False
-
- def clear_cache(self, media, force=False):
-
- self.cache["media"] = (
- self.cache["media"] if len(self.cache["media"]) else [[]]
- )
-
- if media != self.cache["media"][0] or force:
-
- # Clear cache
- self.cache = {key: [] for key in self.cache}
- self.cache["media"] = [[]]
-
- logger.info("Cache flushed")
-
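-# Usage sketch (illustrative only; mirrors the real call sites below):
-#
-#   cache = SoniTrCache()
-#   if not cache.task_in_cache("media", [media_hash, preview], {}):
-#       ...  # parameters changed: recompute this step and the later ones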
-
-def get_hash(filepath):
- with open(filepath, 'rb') as f:
- file_hash = hashlib.blake2b()
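-        # Read in 8 KiB chunks so large media files are hashed without
-        # loading them fully into memory.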
- while chunk := f.read(8192):
- file_hash.update(chunk)
-
- return file_hash.hexdigest()[:18]
-
-
-def check_openai_api_key():
- if not os.environ.get("OPENAI_API_KEY"):
- raise ValueError(
- "To use GPT for translation, please set up your OpenAI API key "
- "as an environment variable in Linux as follows: "
- "export OPENAI_API_KEY='your-api-key-here'. Or change the "
- "translation process in Advanced settings."
- )
-
-
-class SoniTranslate(SoniTrCache):
- def __init__(self, cpu_mode=False):
- super().__init__()
- if cpu_mode:
- os.environ["SONITR_DEVICE"] = "cpu"
- else:
- os.environ["SONITR_DEVICE"] = (
- "cuda" if torch.cuda.is_available() else "cpu"
- )
-
- self.device = os.environ.get("SONITR_DEVICE")
- self.device = self.device if os.environ.get("ZERO_GPU") != "TRUE" else "cuda"
- self.result_diarize = None
- self.align_language = None
- self.result_source_lang = None
- self.edit_subs_complete = False
- self.voiceless_id = None
- self.burn_subs_id = None
-
- self.vci = ClassVoices(only_cpu=cpu_mode)
-
- self.tts_voices = self.get_tts_voice_list()
-
- logger.info(f"Working in: {self.device}")
-
- def get_tts_voice_list(self):
- try:
- from piper import PiperVoice # noqa
-
- piper_enabled = True
- logger.info("PIPER TTS enabled")
- except Exception as error:
- logger.debug(str(error))
- piper_enabled = False
- logger.info("PIPER TTS disabled")
- try:
- from TTS.api import TTS # noqa
-
- xtts_enabled = True
- logger.info("Coqui XTTS enabled")
- logger.info(
- "In this app, by using Coqui TTS (text-to-speech), you "
- "acknowledge and agree to the license.\n"
- "You confirm that you have read, understood, and agreed "
- "to the Terms and Conditions specified at the following "
- "link:\nhttps://coqui.ai/cpml.txt."
- )
- os.environ["COQUI_TOS_AGREED"] = "1"
- except Exception as error:
- logger.debug(str(error))
- xtts_enabled = False
- logger.info("Coqui XTTS disabled")
-
- self.tts_info = TTS_Info(piper_enabled, xtts_enabled)
-
- return self.tts_info.tts_list()
-
-    def batch_multilingual_media_conversion(self, *args):
-        # logger.debug(str(args))
-
-        media_file_arg = args[0] if args[0] is not None else []
-
-        link_media_arg = args[1]
-        link_media_arg = [x.strip() for x in link_media_arg.split(',')]
-        link_media_arg = get_link_list(link_media_arg)
-
-        path_arg = args[2]
-        path_arg = [x.strip() for x in path_arg.split(',')]
-        path_arg = get_valid_files(path_arg)
-
-        edit_text_arg = args[31]
-        get_text_arg = args[32]
-
-        is_gui_arg = args[-1]
-
-        args = args[3:]
-
-        media_batch = media_file_arg + link_media_arg + path_arg
-        media_batch = list(filter(lambda x: x != "", media_batch))
-        media_batch = media_batch if media_batch else [None]
-        logger.debug(str(media_batch))
-
-        remove_directory_contents("outputs")
-
-        if edit_text_arg or get_text_arg:
-            return self.multilingual_media_conversion(
-                media_batch[0], "", "", *args
-            )
-
-        if "SET_LIMIT" == os.getenv("DEMO") or "TRUE" == os.getenv("ZERO_GPU"):
-            media_batch = [media_batch[0]]
-
-        result = []
-        for media in media_batch:
-            # Run the full conversion pipeline for this media item
-            output_file = self.multilingual_media_conversion(
-                media, "", "", *args
-            )
-
-            if isinstance(output_file, str):
-                output_file = [output_file]
-            result.extend(output_file)
-
-            if is_gui_arg and len(media_batch) > 1:
-                gr.Info(f"Done: {os.path.basename(output_file[0])}")
-
-        return result
-
- def multilingual_media_conversion(
- self,
- media_file=None,
- link_media="",
- directory_input="",
- YOUR_HF_TOKEN="",
- preview=False,
- transcriber_model="large-v3",
- batch_size=4,
- compute_type="auto",
- origin_language="Automatic detection",
- target_language="English (en)",
- min_speakers=1,
- max_speakers=1,
- tts_voice00="en-US-EmmaMultilingualNeural-Female",
- tts_voice01="en-US-AndrewMultilingualNeural-Male",
- tts_voice02="en-US-AvaMultilingualNeural-Female",
- tts_voice03="en-US-BrianMultilingualNeural-Male",
- tts_voice04="de-DE-SeraphinaMultilingualNeural-Female",
- tts_voice05="de-DE-FlorianMultilingualNeural-Male",
- tts_voice06="fr-FR-VivienneMultilingualNeural-Female",
- tts_voice07="fr-FR-RemyMultilingualNeural-Male",
- tts_voice08="en-US-EmmaMultilingualNeural-Female",
- tts_voice09="en-US-AndrewMultilingualNeural-Male",
- tts_voice10="en-US-EmmaMultilingualNeural-Female",
- tts_voice11="en-US-AndrewMultilingualNeural-Male",
- video_output_name="",
- mix_method_audio="Adjusting volumes and mixing audio",
- max_accelerate_audio=2.1,
- acceleration_rate_regulation=False,
- volume_original_audio=0.25,
- volume_translated_audio=1.80,
- output_format_subtitle="srt",
- get_translated_text=False,
- get_video_from_text_json=False,
- text_json="{}",
- avoid_overlap=False,
- vocal_refinement=False,
- literalize_numbers=True,
- segment_duration_limit=15,
- diarization_model="pyannote_2.1",
- translate_process="google_translator_batch",
- subtitle_file=None,
- output_type="video (mp4)",
- voiceless_track=False,
- voice_imitation=False,
- voice_imitation_max_segments=3,
- voice_imitation_vocals_dereverb=False,
- voice_imitation_remove_previous=True,
- voice_imitation_method="freevc",
- dereverb_automatic_xtts=True,
- text_segmentation_scale="sentence",
- divide_text_segments_by="",
- soft_subtitles_to_video=True,
- burn_subtitles_to_video=False,
- enable_cache=True,
- custom_voices=False,
- custom_voices_workers=1,
- is_gui=False,
- progress=gr.Progress(),
- ):
- if not YOUR_HF_TOKEN:
- YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")
- if diarization_model == "disable" or max_speakers == 1:
- if YOUR_HF_TOKEN is None:
- YOUR_HF_TOKEN = ""
- elif not YOUR_HF_TOKEN:
- raise ValueError("No valid Hugging Face token")
- else:
- os.environ["YOUR_HF_TOKEN"] = YOUR_HF_TOKEN
-
- if (
- "gpt" in translate_process
- or transcriber_model == "OpenAI_API_Whisper"
- or "OpenAI-TTS" in tts_voice00
- ):
- check_openai_api_key()
-
- if media_file is None:
- media_file = (
- directory_input
- if os.path.exists(directory_input)
- else link_media
- )
- media_file = (
- media_file if isinstance(media_file, str) else media_file.name
- )
-
- if is_subtitle_file(media_file):
- subtitle_file = media_file
- media_file = ""
-
- if media_file is None:
- media_file = ""
-
- if not origin_language:
- origin_language = "Automatic detection"
-
- if origin_language in UNIDIRECTIONAL_L_LIST and not subtitle_file:
- raise ValueError(
- f"The language '{origin_language}' "
- "is not supported for transcription (ASR)."
- )
-
- if get_translated_text:
- self.edit_subs_complete = False
-        if get_video_from_text_json and not self.edit_subs_complete:
-            raise ValueError("Generate the transcription first.")
-
- if (
- ("sound" in output_type or output_type == "raw media")
- and (get_translated_text or get_video_from_text_json)
- ):
- raise ValueError(
- "Please disable 'edit generate subtitles' "
- f"first to acquire the {output_type}."
- )
-
- TRANSLATE_AUDIO_TO = LANGUAGES[target_language]
- SOURCE_LANGUAGE = LANGUAGES[origin_language]
-
- if (
- transcriber_model == "OpenAI_API_Whisper"
- and SOURCE_LANGUAGE == "zh-TW"
- ):
- logger.warning(
- "OpenAI API Whisper only supports Chinese (Simplified)."
- )
- SOURCE_LANGUAGE = "zh"
-
- if (
- text_segmentation_scale in ["word", "character"]
- and "subtitle" not in output_type
- ):
- wrn_lang = (
- "Text segmentation by words or characters is typically"
- " used for generating subtitles. If subtitles are not the"
- " intended output, consider selecting 'sentence' "
- "segmentation method to ensure optimal results."
-
- )
- warn_disp(wrn_lang, is_gui)
-
- if tts_voice00[:2].lower() != TRANSLATE_AUDIO_TO[:2].lower():
- wrn_lang = (
- "Make sure to select a 'TTS Speaker' suitable for"
- " the translation language to avoid errors with the TTS."
- )
- warn_disp(wrn_lang, is_gui)
-
- if "_XTTS_" in tts_voice00 and voice_imitation:
- wrn_lang = (
- "When you select XTTS, it is advisable "
- "to disable Voice Imitation."
- )
- warn_disp(wrn_lang, is_gui)
-
- if custom_voices and voice_imitation:
- wrn_lang = (
- "When you use R.V.C. models, it is advisable"
- " to disable Voice Imitation."
- )
- warn_disp(wrn_lang, is_gui)
-
- if not media_file and not subtitle_file:
- raise ValueError(
-                "Specify a media or SRT file in advanced settings"
- )
-
- if subtitle_file:
- subtitle_file = (
- subtitle_file
- if isinstance(subtitle_file, str)
- else subtitle_file.name
- )
-
- if subtitle_file and SOURCE_LANGUAGE == "Automatic detection":
- raise Exception(
- "To use an SRT file, you need to specify its "
- "original language (Source language)"
- )
-
- if not media_file and subtitle_file:
- diarization_model = "disable"
- media_file = "audio_support.wav"
- if not get_video_from_text_json:
- remove_files(media_file)
- srt_data = srt_file_to_segments(subtitle_file)
- total_duration = srt_data["segments"][-1]["end"] + 30.
- support_audio = AudioSegment.silent(
- duration=int(total_duration * 1000)
- )
- support_audio.export(
- media_file, format="wav"
- )
-                logger.info("Created supporting audio for the SRT file.")
-
- if "SET_LIMIT" == os.getenv("DEMO"):
- preview = True
- mix_method_audio = "Adjusting volumes and mixing audio"
- transcriber_model = "medium"
- logger.info(
- "DEMO; set preview=True; Generation is limited to "
- "10 seconds to prevent CPU errors. No limitations with GPU.\n"
- "DEMO; set Adjusting volumes and mixing audio\n"
- "DEMO; set whisper model to medium"
- )
-
- # Check GPU
- if self.device == "cpu" and compute_type not in COMPUTE_TYPE_CPU:
- logger.info("Compute type changed to float32")
- compute_type = "float32"
-
- base_video_file = "Video.mp4"
- base_audio_wav = "audio.wav"
- dub_audio_file = "audio_dub_solo.ogg"
- vocals_audio_file = "audio_Vocals_DeReverb.wav"
- voiceless_audio_file = "audio_Voiceless.wav"
- mix_audio_file = "audio_mix.mp3"
- vid_subs = "video_subs_file.mp4"
- video_output_file = "video_dub.mp4"
-
- if os.path.exists(media_file):
- media_base_hash = get_hash(media_file)
- else:
- media_base_hash = media_file
- self.clear_cache(media_base_hash, force=(not enable_cache))
-
- if not get_video_from_text_json:
- self.result_diarize = (
- self.align_language
- ) = self.result_source_lang = None
- if not self.task_in_cache("media", [media_base_hash, preview], {}):
- if is_audio_file(media_file):
- prog_disp(
- "Processing audio...", 0.15, is_gui, progress=progress
- )
- audio_preprocessor(preview, media_file, base_audio_wav)
- else:
- prog_disp(
- "Processing video...", 0.15, is_gui, progress=progress
- )
- audio_video_preprocessor(
- preview, media_file, base_video_file, base_audio_wav
- )
- logger.debug("Set file complete.")
-
- if "sound" in output_type:
- prog_disp(
- "Separating sounds in the file...",
- 0.50,
- is_gui,
- progress=progress
- )
- separate_out = sound_separate(base_audio_wav, output_type)
- final_outputs = []
- for out in separate_out:
- final_name = media_out(
- media_file,
- f"{get_no_ext_filename(out)}",
- video_output_name,
- "wav",
- file_obj=out,
- )
- final_outputs.append(final_name)
- logger.info(f"Done: {str(final_outputs)}")
- return final_outputs
-
- if output_type == "raw media":
- output = media_out(
- media_file,
- "raw_media",
- video_output_name,
- "wav" if is_audio_file(media_file) else "mp4",
- file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file,
- )
- logger.info(f"Done: {output}")
- return output
-
- if os.environ.get("IS_DEMO") == "TRUE":
- duration_verify = librosa.get_duration(filename=base_audio_wav)
- logger.info(f"Duration: {duration_verify} seconds")
- if duration_verify > 1500:
- raise RuntimeError(
- "The audio is too long to process in this demo. Alternatively, you"
- " can install the app locally or use the Colab notebook available "
- "in the SoniTranslate repository."
- )
- elif duration_verify > 300:
- tts_voices_list = [
- tts_voice00, tts_voice01, tts_voice02, tts_voice03, tts_voice04,
- tts_voice05, tts_voice06, tts_voice07, tts_voice08, tts_voice09,
- tts_voice10, tts_voice11
- ]
-
- for tts_voice_ in tts_voices_list:
- if "_XTTS_" in tts_voice_:
- raise RuntimeError(
- "XTTS is too slow to be used for audio longer than 5 "
- "minutes in this demo. Alternatively, you can install "
- "the app locally or use the Colab notebook available in"
- " the SoniTranslate repository."
- )
-
- if not self.task_in_cache("refine_vocals", [vocal_refinement], {}):
- self.vocals = None
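-            # Optional UVR pass: isolate and dereverb the vocals so that
-            # transcription and diarization run on cleaner speech.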
- if vocal_refinement:
- try:
- from soni_translate.mdx_net import process_uvr_task
- _, _, _, _, file_vocals = process_uvr_task(
- orig_song_path=base_audio_wav,
- main_vocals=False,
- dereverb=True,
- remove_files_output_dir=True,
- )
- remove_files(vocals_audio_file)
- copy_files(file_vocals, ".")
- self.vocals = vocals_audio_file
- except Exception as error:
- logger.error(str(error))
-
- if not self.task_in_cache("transcript_align", [
- subtitle_file,
- SOURCE_LANGUAGE,
- transcriber_model,
- compute_type,
- batch_size,
- literalize_numbers,
- segment_duration_limit,
- (
- "l_unit"
- if text_segmentation_scale in ["word", "character"]
- and subtitle_file
- else "sentence"
- )
- ], {"vocals": self.vocals}):
- if subtitle_file:
- prog_disp(
- "From SRT file...", 0.30, is_gui, progress=progress
- )
- audio = whisperx.load_audio(
- base_audio_wav if not self.vocals else self.vocals
- )
- self.result = srt_file_to_segments(subtitle_file)
- self.result["language"] = SOURCE_LANGUAGE
- else:
- prog_disp(
- "Transcribing...", 0.30, is_gui, progress=progress
- )
- SOURCE_LANGUAGE = (
- None
- if SOURCE_LANGUAGE == "Automatic detection"
- else SOURCE_LANGUAGE
- )
- audio, self.result = transcribe_speech(
- base_audio_wav if not self.vocals else self.vocals,
- transcriber_model,
- compute_type,
- batch_size,
- SOURCE_LANGUAGE,
- literalize_numbers,
- segment_duration_limit,
- )
- logger.debug(
- "Transcript complete, "
- f"segments count {len(self.result['segments'])}"
- )
-
- self.align_language = self.result["language"]
- if (
- not subtitle_file
- or text_segmentation_scale in ["word", "character"]
- ):
- prog_disp("Aligning...", 0.45, is_gui, progress=progress)
- try:
- if self.align_language in ["vi"]:
- logger.info(
- "Deficient alignment for the "
- f"{self.align_language} language, skipping the"
- " process. It is suggested to reduce the "
- "duration of the segments as an alternative."
- )
- else:
- self.result = align_speech(audio, self.result)
- logger.debug(
- "Align complete, "
- f"segments count {len(self.result['segments'])}"
- )
- except Exception as error:
- logger.error(str(error))
-
-        if not self.result["segments"]:
- raise ValueError("No active speech found in audio")
-
- if not self.task_in_cache("break_align", [
- divide_text_segments_by,
- text_segmentation_scale,
- self.align_language
- ], {
- "result": self.result,
- "align_language": self.align_language
- }):
- if self.align_language in ["ja", "zh", "zh-TW"]:
- divide_text_segments_by += "|!|?|...|。"
- if text_segmentation_scale in ["word", "character"]:
- self.result = linguistic_level_segments(
- self.result,
- text_segmentation_scale,
- )
- elif divide_text_segments_by:
- try:
- self.result = break_aling_segments(
- self.result,
- break_characters=divide_text_segments_by,
- )
- except Exception as error:
- logger.error(str(error))
-
- if not self.task_in_cache("diarize", [
- min_speakers,
- max_speakers,
- YOUR_HF_TOKEN[:len(YOUR_HF_TOKEN)//2],
- diarization_model
- ], {
- "result": self.result
- }):
- prog_disp("Diarizing...", 0.60, is_gui, progress=progress)
- diarize_model_select = diarization_models[diarization_model]
- self.result_diarize = diarize_speech(
- base_audio_wav if not self.vocals else self.vocals,
- self.result,
- min_speakers,
- max_speakers,
- YOUR_HF_TOKEN,
- diarize_model_select,
- )
- logger.debug("Diarize complete")
- self.result_source_lang = copy.deepcopy(self.result_diarize)
-
- if not self.task_in_cache("translate", [
- TRANSLATE_AUDIO_TO,
- translate_process
- ], {
- "result_diarize": self.result_diarize
- }):
- prog_disp("Translating...", 0.70, is_gui, progress=progress)
- lang_source = (
- self.align_language
- if self.align_language
- else SOURCE_LANGUAGE
- )
- self.result_diarize["segments"] = translate_text(
- self.result_diarize["segments"],
- TRANSLATE_AUDIO_TO,
- translate_process,
- chunk_size=1800,
- source=lang_source,
- )
- logger.debug("Translation complete")
- logger.debug(self.result_diarize)
-
- if get_translated_text:
-
- json_data = []
- for segment in self.result_diarize["segments"]:
- start = segment["start"]
- text = segment["text"]
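-                # "SPEAKER_XX" -> 1-based speaker number for the editor JSON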
- speaker = int(segment.get("speaker", "SPEAKER_00")[-2:]) + 1
- json_data.append(
- {"start": start, "text": text, "speaker": speaker}
- )
-
- # Convert list of dictionaries to a JSON string with indentation
- json_string = json.dumps(json_data, indent=2)
- logger.info("Done")
- self.edit_subs_complete = True
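-            # Decode \uXXXX escapes so non-Latin characters display readably
-            # in the subtitle editor textbox.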
- return json_string.encode().decode("unicode_escape")
-
- if get_video_from_text_json:
-
- if self.result_diarize is None:
- raise ValueError("Generate the transcription first.")
- # with open('text_json.json', 'r') as file:
- text_json_loaded = json.loads(text_json)
- for i, segment in enumerate(self.result_diarize["segments"]):
- segment["text"] = text_json_loaded[i]["text"]
- segment["speaker"] = "SPEAKER_{:02d}".format(
- int(text_json_loaded[i]["speaker"]) - 1
- )
-
- # Write subtitle
- if not self.task_in_cache("subs_and_edit", [
- copy.deepcopy(self.result_diarize),
- output_format_subtitle,
- TRANSLATE_AUDIO_TO
- ], {
- "result_diarize": self.result_diarize
- }):
- if output_format_subtitle == "disable":
- self.sub_file = "sub_tra.srt"
- elif output_format_subtitle != "ass":
- self.sub_file = process_subtitles(
- self.result_source_lang,
- self.align_language,
- self.result_diarize,
- output_format_subtitle,
- TRANSLATE_AUDIO_TO,
- )
-
-            # A plain SRT is always needed by later tasks (the "ass"
-            # conversion and burned-in subtitles), so generate it as well
- if output_format_subtitle != "srt":
- _ = process_subtitles(
- self.result_source_lang,
- self.align_language,
- self.result_diarize,
- "srt",
- TRANSLATE_AUDIO_TO,
- )
-
- if output_format_subtitle == "ass":
- convert_ori = "ffmpeg -i sub_ori.srt sub_ori.ass -y"
- convert_tra = "ffmpeg -i sub_tra.srt sub_tra.ass -y"
- self.sub_file = "sub_tra.ass"
- run_command(convert_ori)
- run_command(convert_tra)
-
- format_sub = (
- output_format_subtitle
- if output_format_subtitle != "disable"
- else "srt"
- )
-
- if output_type == "subtitle":
-
- out_subs = []
- tra_subs = media_out(
- media_file,
- TRANSLATE_AUDIO_TO,
- video_output_name,
- format_sub,
- file_obj=self.sub_file,
- )
- out_subs.append(tra_subs)
-
- ori_subs = media_out(
- media_file,
- self.align_language,
- video_output_name,
- format_sub,
- file_obj=f"sub_ori.{format_sub}",
- )
- out_subs.append(ori_subs)
- logger.info(f"Done: {out_subs}")
- return out_subs
-
- if output_type == "subtitle [by speaker]":
- output = get_subtitle_speaker(
- media_file,
- result=self.result_diarize,
- language=TRANSLATE_AUDIO_TO,
- extension=format_sub,
- base_name=video_output_name,
- )
- logger.info(f"Done: {str(output)}")
- return output
-
- if "video [subtitled]" in output_type:
- output = media_out(
- media_file,
- TRANSLATE_AUDIO_TO + "_subtitled",
- video_output_name,
- "wav" if is_audio_file(media_file) else (
- "mkv" if "mkv" in output_type else "mp4"
- ),
- file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file,
- soft_subtitles=False if is_audio_file(media_file) else True,
- subtitle_files=output_format_subtitle,
- )
- msg_out = output[0] if isinstance(output, list) else output
- logger.info(f"Done: {msg_out}")
- return output
-
- if not self.task_in_cache("tts", [
- TRANSLATE_AUDIO_TO,
- tts_voice00,
- tts_voice01,
- tts_voice02,
- tts_voice03,
- tts_voice04,
- tts_voice05,
- tts_voice06,
- tts_voice07,
- tts_voice08,
- tts_voice09,
- tts_voice10,
- tts_voice11,
- dereverb_automatic_xtts
- ], {
- "sub_file": self.sub_file
- }):
- prog_disp("Text to speech...", 0.80, is_gui, progress=progress)
- self.valid_speakers = audio_segmentation_to_voice(
- self.result_diarize,
- TRANSLATE_AUDIO_TO,
- is_gui,
- tts_voice00,
- tts_voice01,
- tts_voice02,
- tts_voice03,
- tts_voice04,
- tts_voice05,
- tts_voice06,
- tts_voice07,
- tts_voice08,
- tts_voice09,
- tts_voice10,
- tts_voice11,
- dereverb_automatic_xtts,
- )
-
- if not self.task_in_cache("acc_and_vc", [
- max_accelerate_audio,
- acceleration_rate_regulation,
- voice_imitation,
- voice_imitation_max_segments,
- voice_imitation_remove_previous,
- voice_imitation_vocals_dereverb,
- voice_imitation_method,
- custom_voices,
- custom_voices_workers,
- copy.deepcopy(self.vci.model_config),
- avoid_overlap
- ], {
- "valid_speakers": self.valid_speakers
- }):
- audio_files, speakers_list = accelerate_segments(
- self.result_diarize,
- max_accelerate_audio,
- self.valid_speakers,
- acceleration_rate_regulation,
- )
-
- # Voice Imitation (Tone color converter)
- if voice_imitation:
- prog_disp(
- "Voice Imitation...", 0.85, is_gui, progress=progress
- )
- from soni_translate.text_to_speech import toneconverter
-
- try:
- toneconverter(
- copy.deepcopy(self.result_diarize),
- voice_imitation_max_segments,
- voice_imitation_remove_previous,
- voice_imitation_vocals_dereverb,
- voice_imitation_method,
- )
- except Exception as error:
- logger.error(str(error))
-
- # custom voice
- if custom_voices:
- prog_disp(
- "Applying customized voices...",
- 0.90,
- is_gui,
- progress=progress,
- )
-
- try:
- self.vci(
- audio_files,
- speakers_list,
- overwrite=True,
- parallel_workers=custom_voices_workers,
- )
- self.vci.unload_models()
- except Exception as error:
- logger.error(str(error))
-
- prog_disp(
- "Creating final translated video...",
- 0.95,
- is_gui,
- progress=progress,
- )
- remove_files(dub_audio_file)
- create_translated_audio(
- self.result_diarize,
- audio_files,
- dub_audio_file,
- False,
- avoid_overlap,
- )
-
-        # Voiceless track: replace the original audio with its instrumental
- hash_base_audio_wav = get_hash(base_audio_wav)
- if voiceless_track:
- if self.voiceless_id != hash_base_audio_wav:
- from soni_translate.mdx_net import process_uvr_task
-
- try:
- # voiceless_audio_file_dir = "clean_song_output/voiceless"
- remove_files(voiceless_audio_file)
- uvr_voiceless_audio_wav, _ = process_uvr_task(
- orig_song_path=base_audio_wav,
- song_id="voiceless",
- only_voiceless=True,
- remove_files_output_dir=False,
- )
- copy_files(uvr_voiceless_audio_wav, ".")
- base_audio_wav = voiceless_audio_file
- self.voiceless_id = hash_base_audio_wav
-
- except Exception as error:
- logger.error(str(error))
- else:
- base_audio_wav = voiceless_audio_file
-
- if not self.task_in_cache("mix_aud", [
- mix_method_audio,
- volume_original_audio,
- volume_translated_audio,
- voiceless_track
- ], {}):
- # TYPE MIX AUDIO
- remove_files(mix_audio_file)
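-            # Volume mix: scale both tracks and amix them into one stream.
-            # Background mix: duck the original audio under the dub using
-            # sidechain compression keyed on the translated track.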
- command_volume_mix = f'ffmpeg -y -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[0:0]volume={volume_original_audio}[a];[1:0]volume={volume_translated_audio}[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio_file}'
- command_background_mix = f'ffmpeg -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio_file}'
- if mix_method_audio == "Adjusting volumes and mixing audio":
- # volume mix
- run_command(command_volume_mix)
- else:
- try:
- # background mix
- run_command(command_background_mix)
- except Exception as error_mix:
- # volume mix except
- logger.error(str(error_mix))
- run_command(command_volume_mix)
-
- if "audio" in output_type or is_audio_file(media_file):
- output = media_out(
- media_file,
- TRANSLATE_AUDIO_TO,
- video_output_name,
- "wav" if "wav" in output_type else (
- "ogg" if "ogg" in output_type else "mp3"
- ),
- file_obj=mix_audio_file,
- subtitle_files=output_format_subtitle,
- )
- msg_out = output[0] if isinstance(output, list) else output
- logger.info(f"Done: {msg_out}")
- return output
-
- hash_base_video_file = get_hash(base_video_file)
-
- if burn_subtitles_to_video:
- hashvideo_text = [
- hash_base_video_file,
- [seg["text"] for seg in self.result_diarize["segments"]]
- ]
- if self.burn_subs_id != hashvideo_text:
- try:
- logger.info("Burn subtitles")
- remove_files(vid_subs)
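-                    # Hard-burn sub_tra.srt into the video stream (re-encodes)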
- command = f"ffmpeg -i {base_video_file} -y -vf subtitles=sub_tra.srt -max_muxing_queue_size 9999 {vid_subs}"
- run_command(command)
- base_video_file = vid_subs
- self.burn_subs_id = hashvideo_text
- except Exception as error:
- logger.error(str(error))
- else:
- base_video_file = vid_subs
-
- if not self.task_in_cache("output", [
- hash_base_video_file,
- hash_base_audio_wav,
- burn_subtitles_to_video
- ], {}):
- # Merge new audio + video
- remove_files(video_output_file)
- run_command(
- f"ffmpeg -i {base_video_file} -i {mix_audio_file} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output_file}"
- )
-
- output = media_out(
- media_file,
- TRANSLATE_AUDIO_TO,
- video_output_name,
- "mkv" if "mkv" in output_type else "mp4",
- file_obj=video_output_file,
- soft_subtitles=soft_subtitles_to_video,
- subtitle_files=output_format_subtitle,
- )
- msg_out = output[0] if isinstance(output, list) else output
- logger.info(f"Done: {msg_out}")
-
- return output
-
- def hook_beta_processor(
- self,
- document,
- tgt_lang,
- translate_process,
- ori_lang,
- tts,
- name_final_file,
- custom_voices,
- custom_voices_workers,
- output_type,
- chunk_size,
- width,
- height,
- start_page,
- end_page,
- bcolor,
- is_gui,
- progress
- ):
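-        # Beta flow: render document pages to image + text, translate the
-        # text, narrate it with TTS, and merge pages and audio into a video.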
- prog_disp("Processing pages...", 0.10, is_gui, progress=progress)
- doc_data = doc_to_txtximg_pages(document, width, height, start_page, end_page, bcolor)
- result_diarize = page_data_to_segments(doc_data, 1700)
-
- prog_disp("Translating...", 0.20, is_gui, progress=progress)
- result_diarize["segments"] = translate_text(
- result_diarize["segments"],
- tgt_lang,
- translate_process,
- chunk_size=0,
- source=ori_lang,
- )
- chunk_size = (
- chunk_size if chunk_size else determine_chunk_size(tts)
- )
- doc_data = update_page_data(result_diarize, doc_data)
-
- prog_disp("Text to speech...", 0.30, is_gui, progress=progress)
- result_diarize = page_data_to_segments(doc_data, chunk_size)
- valid_speakers = audio_segmentation_to_voice(
- result_diarize,
- tgt_lang,
- is_gui,
- tts,
- )
-
- # fix format and set folder output
- audio_files, speakers_list = accelerate_segments(
- result_diarize,
- 1.0,
- valid_speakers,
- )
-
- # custom voice
- if custom_voices:
- prog_disp(
- "Applying customized voices...",
- 0.60,
- is_gui,
- progress=progress,
- )
- self.vci(
- audio_files,
- speakers_list,
- overwrite=True,
- parallel_workers=custom_voices_workers,
- )
- self.vci.unload_models()
-
-        # Update segment timestamps; keep audio files separate (no concat)
- result_diarize = fix_timestamps_docs(result_diarize, audio_files)
- final_wav_file = "audio_book.wav"
- remove_files(final_wav_file)
-
- prog_disp("Creating audio file...", 0.70, is_gui, progress=progress)
- create_translated_audio(
- result_diarize, audio_files, final_wav_file, False
- )
-
- prog_disp("Creating video file...", 0.80, is_gui, progress=progress)
- video_doc = create_video_from_images(
- doc_data,
- result_diarize
- )
-
- # Merge video and audio
- prog_disp("Merging...", 0.90, is_gui, progress=progress)
- vid_out = merge_video_and_audio(video_doc, final_wav_file)
-
- # End
- output = media_out(
- document,
- tgt_lang,
- name_final_file,
- "mkv" if "mkv" in output_type else "mp4",
- file_obj=vid_out,
- )
- logger.info(f"Done: {output}")
- return output
-
- def multilingual_docs_conversion(
- self,
- string_text="", # string
- document=None, # doc path gui
- directory_input="", # doc path
- origin_language="English (en)",
- target_language="English (en)",
- tts_voice00="en-US-EmmaMultilingualNeural-Female",
- name_final_file="",
- translate_process="google_translator",
- output_type="audio",
- chunk_size=None,
- custom_voices=False,
- custom_voices_workers=1,
- start_page=1,
- end_page=99999,
- width=1280,
- height=720,
- bcolor="dynamic",
- is_gui=False,
- progress=gr.Progress(),
- ):
- if "gpt" in translate_process:
- check_openai_api_key()
-
- SOURCE_LANGUAGE = LANGUAGES[origin_language]
- if translate_process != "disable_translation":
- TRANSLATE_AUDIO_TO = LANGUAGES[target_language]
- else:
- TRANSLATE_AUDIO_TO = SOURCE_LANGUAGE
- logger.info("No translation")
- if tts_voice00[:2].lower() != TRANSLATE_AUDIO_TO[:2].lower():
- logger.debug(
- "Make sure to select a 'TTS Speaker' suitable for the "
- "translation language to avoid errors with the TTS."
- )
-
- self.clear_cache(string_text, force=True)
-
- is_string = False
- if document is None:
- if os.path.exists(directory_input):
- document = directory_input
- else:
- document = string_text
- is_string = True
- document = document if isinstance(document, str) else document.name
- if not document:
- raise Exception("No data found")
-
- if os.environ.get("IS_DEMO") == "TRUE" and not is_string:
- raise RuntimeError(
- "This option is disabled in this demo. "
- "Alternatively, you can install "
- "the app locally or use the Colab notebook available in"
- " the SoniTranslate repository."
- )
-
- if "videobook" in output_type:
- if not document.lower().endswith(".pdf"):
- raise ValueError(
- "Videobooks are only compatible with PDF files."
- )
-
- return self.hook_beta_processor(
- document,
- TRANSLATE_AUDIO_TO,
- translate_process,
- SOURCE_LANGUAGE,
- tts_voice00,
- name_final_file,
- custom_voices,
- custom_voices_workers,
- output_type,
- chunk_size,
- width,
- height,
- start_page,
- end_page,
- bcolor,
- is_gui,
- progress
- )
-
- # audio_wav = "audio.wav"
- final_wav_file = "audio_book.wav"
-
- prog_disp("Processing text...", 0.15, is_gui, progress=progress)
- result_file_path, result_text = document_preprocessor(
- document, is_string, start_page, end_page
- )
-
- if (
- output_type == "book (txt)"
- and translate_process == "disable_translation"
- ):
- return result_file_path
-
- if "SET_LIMIT" == os.getenv("DEMO"):
- result_text = result_text[:50]
- logger.info(
- "DEMO; Generation is limited to 50 characters to prevent "
- "CPU errors. No limitations with GPU.\n"
- )
-
- if translate_process != "disable_translation":
- # chunks text for translation
- result_diarize = plain_text_to_segments(result_text, 1700)
- prog_disp("Translating...", 0.30, is_gui, progress=progress)
-            # segments already capped at 1700 chars; translate without
-            # further chunking (chunk_size=0)
- result_diarize["segments"] = translate_text(
- result_diarize["segments"],
- TRANSLATE_AUDIO_TO,
- translate_process,
- chunk_size=0,
- source=SOURCE_LANGUAGE,
- )
-
- txt_file_path, result_text = segments_to_plain_text(result_diarize)
-
- if output_type == "book (txt)":
- return media_out(
- result_file_path if is_string else document,
- TRANSLATE_AUDIO_TO,
- name_final_file,
- "txt",
- file_obj=txt_file_path,
- )
-
- # (TTS limits) plain text to result_diarize
- chunk_size = (
- chunk_size if chunk_size else determine_chunk_size(tts_voice00)
- )
- result_diarize = plain_text_to_segments(result_text, chunk_size)
- logger.debug(result_diarize)
-
- prog_disp("Text to speech...", 0.45, is_gui, progress=progress)
- valid_speakers = audio_segmentation_to_voice(
- result_diarize,
- TRANSLATE_AUDIO_TO,
- is_gui,
- tts_voice00,
- )
-
- # fix format and set folder output
- audio_files, speakers_list = accelerate_segments(
- result_diarize,
- 1.0,
- valid_speakers,
- )
-
- # custom voice
- if custom_voices:
- prog_disp(
- "Applying customized voices...",
- 0.80,
- is_gui,
- progress=progress,
- )
- self.vci(
- audio_files,
- speakers_list,
- overwrite=True,
- parallel_workers=custom_voices_workers,
- )
- self.vci.unload_models()
-
- prog_disp(
- "Creating final audio file...", 0.90, is_gui, progress=progress
- )
- remove_files(final_wav_file)
- create_translated_audio(
- result_diarize, audio_files, final_wav_file, True
- )
-
- output = media_out(
- result_file_path if is_string else document,
- TRANSLATE_AUDIO_TO,
- name_final_file,
- "mp3" if "mp3" in output_type else (
- "ogg" if "ogg" in output_type else "wav"
- ),
- file_obj=final_wav_file,
- )
-
- logger.info(f"Done: {output}")
-
- return output
-
-
-title = "📽️ SoniTranslate 🈷️"
-
-
-def create_gui(theme, logs_in_gui=False):
- with gr.Blocks(theme=theme) as app:
- gr.Markdown(title)
- gr.Markdown(lg_conf["description"])
-
-        if os.environ.get("ZERO_GPU") == "TRUE":
-            gr.Markdown(
-                """
-⚠️ **Important** ⚠️
-
-- 🚀 This demo uses ZeroGPU only for the transcription and diarization steps; everything else runs on the CPU. Videos no longer than 15 minutes are recommended. ⏳
-- ❗ If you see `queue` while using this, another user is currently running a task, and you need to wait until they are finished.
-- 🔒 Some functions are disabled, but if you duplicate this Space with a GPU and set the secret "ZERO_GPU" to FALSE, you can use the app with full GPU acceleration. ⚡
-                """
-            )
-
- with gr.Tab(lg_conf["tab_translate"]):
- with gr.Row():
- with gr.Column():
- input_data_type = gr.Dropdown(
- ["SUBMIT VIDEO", "URL", "Find Video Path"],
- value="SUBMIT VIDEO",
- label=lg_conf["video_source"],
- )
-
- def swap_visibility(data_type):
- if data_type == "URL":
- return (
- gr.update(visible=False, value=None),
- gr.update(visible=True, value=""),
- gr.update(visible=False, value=""),
- )
- elif data_type == "SUBMIT VIDEO":
- return (
- gr.update(visible=True, value=None),
- gr.update(visible=False, value=""),
- gr.update(visible=False, value=""),
- )
- elif data_type == "Find Video Path":
- return (
- gr.update(visible=False, value=None),
- gr.update(visible=False, value=""),
- gr.update(visible=True, value=""),
- )
-
- video_input = gr.File(
- label="VIDEO",
- file_count="multiple",
- type="filepath",
- )
- blink_input = gr.Textbox(
- visible=False,
- label=lg_conf["link_label"],
- info=lg_conf["link_info"],
- placeholder=lg_conf["link_ph"],
- )
- directory_input = gr.Textbox(
- visible=False,
- label=lg_conf["dir_label"],
- info=lg_conf["dir_info"],
- placeholder=lg_conf["dir_ph"],
- )
- input_data_type.change(
- fn=swap_visibility,
- inputs=input_data_type,
- outputs=[video_input, blink_input, directory_input],
- )
-
- gr.HTML()
-
- SOURCE_LANGUAGE = gr.Dropdown(
- LANGUAGES_LIST,
- value=LANGUAGES_LIST[0],
- label=lg_conf["sl_label"],
- info=lg_conf["sl_info"],
- )
- TRANSLATE_AUDIO_TO = gr.Dropdown(
- LANGUAGES_LIST[1:],
- value="English (en)",
- label=lg_conf["tat_label"],
- info=lg_conf["tat_info"],
- )
-
-                    gr.HTML("<hr>")
-
- gr.Markdown(lg_conf["num_speakers"])
- MAX_TTS = 12
- min_speakers = gr.Slider(
- 1,
- MAX_TTS,
- value=1,
- label=lg_conf["min_sk"],
- step=1,
- visible=False,
- )
- max_speakers = gr.Slider(
- 1,
- MAX_TTS,
- value=1,
- step=1,
- label=lg_conf["max_sk"],
- )
- gr.Markdown(lg_conf["tts_select"])
-
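-                    # Show only the first `value` TTS speaker dropdowns.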
- def submit(value):
- visibility_dict = {
- f"tts_voice{i:02d}": gr.update(visible=i < value)
- for i in range(MAX_TTS)
- }
-                        return list(visibility_dict.values())
-
- tts_voice00 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="en-US-EmmaMultilingualNeural-Female",
- label=lg_conf["sk1"],
- visible=True,
- interactive=True,
- )
- tts_voice01 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="en-US-AndrewMultilingualNeural-Male",
- label=lg_conf["sk2"],
- visible=False,
- interactive=True,
- )
- tts_voice02 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="en-US-AvaMultilingualNeural-Female",
- label=lg_conf["sk3"],
- visible=False,
- interactive=True,
- )
- tts_voice03 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="en-US-BrianMultilingualNeural-Male",
- label=lg_conf["sk4"],
- visible=False,
- interactive=True,
- )
- tts_voice04 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="de-DE-SeraphinaMultilingualNeural-Female",
-                        label=lg_conf["sk5"],
- visible=False,
- interactive=True,
- )
- tts_voice05 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="de-DE-FlorianMultilingualNeural-Male",
- label=lg_conf["sk6"],
- visible=False,
- interactive=True,
- )
- tts_voice06 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="fr-FR-VivienneMultilingualNeural-Female",
- label=lg_conf["sk7"],
- visible=False,
- interactive=True,
- )
- tts_voice07 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="fr-FR-RemyMultilingualNeural-Male",
- label=lg_conf["sk8"],
- visible=False,
- interactive=True,
- )
- tts_voice08 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="en-US-EmmaMultilingualNeural-Female",
- label=lg_conf["sk9"],
- visible=False,
- interactive=True,
- )
- tts_voice09 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="en-US-AndrewMultilingualNeural-Male",
- label=lg_conf["sk10"],
- visible=False,
- interactive=True,
- )
- tts_voice10 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="en-US-EmmaMultilingualNeural-Female",
- label=lg_conf["sk11"],
- visible=False,
- interactive=True,
- )
- tts_voice11 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="en-US-AndrewMultilingualNeural-Male",
- label=lg_conf["sk12"],
- visible=False,
- interactive=True,
- )
- max_speakers.change(
- submit,
- max_speakers,
- [
- tts_voice00,
- tts_voice01,
- tts_voice02,
- tts_voice03,
- tts_voice04,
- tts_voice05,
- tts_voice06,
- tts_voice07,
- tts_voice08,
- tts_voice09,
- tts_voice10,
- tts_voice11,
- ],
- )
-
- with gr.Column():
- with gr.Accordion(
- lg_conf["vc_title"],
- open=False,
- ):
- gr.Markdown(lg_conf["vc_subtitle"])
- voice_imitation_gui = gr.Checkbox(
- False,
- label=lg_conf["vc_active_label"],
- info=lg_conf["vc_active_info"],
- )
- openvoice_models = ["openvoice", "openvoice_v2"]
- voice_imitation_method_options = (
- ["freevc"] + openvoice_models
- if SoniTr.tts_info.xtts_enabled
- else openvoice_models
- )
- voice_imitation_method_gui = gr.Dropdown(
- voice_imitation_method_options,
- value=voice_imitation_method_options[-1],
- label=lg_conf["vc_method_label"],
- info=lg_conf["vc_method_info"],
- )
- voice_imitation_max_segments_gui = gr.Slider(
- label=lg_conf["vc_segments_label"],
- info=lg_conf["vc_segments_info"],
- value=3,
- step=1,
- minimum=1,
- maximum=10,
- visible=True,
- interactive=True,
- )
- voice_imitation_vocals_dereverb_gui = gr.Checkbox(
- False,
- label=lg_conf["vc_dereverb_label"],
- info=lg_conf["vc_dereverb_info"],
- )
- voice_imitation_remove_previous_gui = gr.Checkbox(
- True,
- label=lg_conf["vc_remove_label"],
- info=lg_conf["vc_remove_info"],
- )
-
- if SoniTr.tts_info.xtts_enabled:
- with gr.Column():
- with gr.Accordion(
- lg_conf["xtts_title"],
- open=False,
- ):
- gr.Markdown(lg_conf["xtts_subtitle"])
- wav_speaker_file = gr.File(
- label=lg_conf["xtts_file_label"]
- )
- wav_speaker_name = gr.Textbox(
- label=lg_conf["xtts_name_label"],
- value="",
- info=lg_conf["xtts_name_info"],
- placeholder="default_name",
- lines=1,
- )
- wav_speaker_start = gr.Number(
-                            label="Audio start time",
- value=0,
- visible=False,
- )
- wav_speaker_end = gr.Number(
-                            label="Audio end time",
- value=0,
- visible=False,
- )
- wav_speaker_dir = gr.Textbox(
-                            label="Save directory",
- value="_XTTS_",
- visible=False,
- )
- wav_speaker_dereverb = gr.Checkbox(
- True,
- label=lg_conf["xtts_dereverb_label"],
- info=lg_conf["xtts_dereverb_info"]
- )
- wav_speaker_output = gr.HTML()
- create_xtts_wav = gr.Button(
- lg_conf["xtts_button"]
- )
- gr.Markdown(lg_conf["xtts_footer"])
- else:
- wav_speaker_dereverb = gr.Checkbox(
- False,
- label=lg_conf["xtts_dereverb_label"],
- info=lg_conf["xtts_dereverb_info"],
- visible=False
- )
-
- with gr.Column():
- with gr.Accordion(
- lg_conf["extra_setting"], open=False
- ):
- audio_accelerate = gr.Slider(
- label=lg_conf["acc_max_label"],
- value=1.9,
- step=0.1,
- minimum=1.0,
- maximum=2.5,
- visible=True,
- interactive=True,
- info=lg_conf["acc_max_info"],
- )
- acceleration_rate_regulation_gui = gr.Checkbox(
- False,
- label=lg_conf["acc_rate_label"],
- info=lg_conf["acc_rate_info"],
- )
- avoid_overlap_gui = gr.Checkbox(
- False,
- label=lg_conf["or_label"],
- info=lg_conf["or_info"],
- )
-
-                        gr.HTML("<hr>")
-
- audio_mix_options = [
- "Mixing audio with sidechain compression",
- "Adjusting volumes and mixing audio",
- ]
- AUDIO_MIX = gr.Dropdown(
- audio_mix_options,
- value=audio_mix_options[1],
- label=lg_conf["aud_mix_label"],
- info=lg_conf["aud_mix_info"],
- )
- volume_original_mix = gr.Slider(
- label=lg_conf["vol_ori"],
- info="for Adjusting volumes and mixing audio",
- value=0.25,
- step=0.05,
- minimum=0.0,
- maximum=2.50,
- visible=True,
- interactive=True,
- )
- volume_translated_mix = gr.Slider(
- label=lg_conf["vol_tra"],
- info="for Adjusting volumes and mixing audio",
- value=1.80,
- step=0.05,
- minimum=0.0,
- maximum=2.50,
- visible=True,
- interactive=True,
- )
- main_voiceless_track = gr.Checkbox(
- label=lg_conf["voiceless_tk_label"],
- info=lg_conf["voiceless_tk_info"],
- )
-
-                        gr.HTML("<hr>")
- sub_type_options = [
- "disable",
- "srt",
- "vtt",
- "ass",
- "txt",
- "tsv",
- "json",
- "aud",
- ]
-
- sub_type_output = gr.Dropdown(
- sub_type_options,
- value=sub_type_options[1],
- label=lg_conf["sub_type"],
- )
- soft_subtitles_to_video_gui = gr.Checkbox(
- label=lg_conf["soft_subs_label"],
- info=lg_conf["soft_subs_info"],
- )
- burn_subtitles_to_video_gui = gr.Checkbox(
- label=lg_conf["burn_subs_label"],
- info=lg_conf["burn_subs_info"],
- )
-
-                        gr.HTML("<hr>")
- gr.Markdown(lg_conf["whisper_title"])
- literalize_numbers_gui = gr.Checkbox(
- True,
- label=lg_conf["lnum_label"],
- info=lg_conf["lnum_info"],
- )
- vocal_refinement_gui = gr.Checkbox(
- False,
- label=lg_conf["scle_label"],
- info=lg_conf["scle_info"],
- )
- segment_duration_limit_gui = gr.Slider(
- label=lg_conf["sd_limit_label"],
- info=lg_conf["sd_limit_info"],
- value=15,
- step=1,
- minimum=1,
- maximum=30,
- )
- whisper_model_default = (
- "large-v3"
- if SoniTr.device == "cuda"
- else "medium"
- )
-
- WHISPER_MODEL_SIZE = gr.Dropdown(
- ASR_MODEL_OPTIONS + find_whisper_models(),
- value=whisper_model_default,
- label="Whisper ASR model",
- info=lg_conf["asr_model_info"],
- allow_custom_value=True,
- )
- com_t_opt, com_t_default = (
- [COMPUTE_TYPE_GPU, "float16"]
- if SoniTr.device == "cuda"
- else [COMPUTE_TYPE_CPU, "float32"]
- )
- compute_type = gr.Dropdown(
- com_t_opt,
- value=com_t_default,
- label=lg_conf["ctype_label"],
- info=lg_conf["ctype_info"],
- )
- batch_size_value = 8 if os.environ.get("ZERO_GPU") != "TRUE" else 32
- batch_size = gr.Slider(
- minimum=1,
- maximum=32,
- value=batch_size_value,
- label=lg_conf["batchz_label"],
- info=lg_conf["batchz_info"],
- step=1,
- )
- input_srt = gr.File(
- label=lg_conf["srt_file_label"],
- file_types=[".srt", ".ass", ".vtt"],
- height=130,
- )
-
-                        gr.HTML("<hr>")
- text_segmentation_options = [
- "sentence",
- "word",
- "character"
- ]
- text_segmentation_scale_gui = gr.Dropdown(
- text_segmentation_options,
- value=text_segmentation_options[0],
- label=lg_conf["tsscale_label"],
- info=lg_conf["tsscale_info"],
- )
- divide_text_segments_by_gui = gr.Textbox(
- label=lg_conf["divide_text_label"],
- value="",
- info=lg_conf["divide_text_info"],
- )
-
-                        gr.HTML("<hr>")
- pyannote_models_list = list(
- diarization_models.keys()
- )
- diarization_process_dropdown = gr.Dropdown(
- pyannote_models_list,
- value=pyannote_models_list[1],
- label=lg_conf["diarization_label"],
- )
- translate_process_dropdown = gr.Dropdown(
- TRANSLATION_PROCESS_OPTIONS,
- value=TRANSLATION_PROCESS_OPTIONS[0],
- label=lg_conf["tr_process_label"],
- )
-
-                        gr.HTML("<hr>")
- main_output_type = gr.Dropdown(
- OUTPUT_TYPE_OPTIONS,
- value=OUTPUT_TYPE_OPTIONS[0],
- label=lg_conf["out_type_label"],
- )
- VIDEO_OUTPUT_NAME = gr.Textbox(
- label=lg_conf["out_name_label"],
- value="",
- info=lg_conf["out_name_info"],
- )
- play_sound_gui = gr.Checkbox(
- True,
- label=lg_conf["task_sound_label"],
- info=lg_conf["task_sound_info"],
- )
- enable_cache_gui = gr.Checkbox(
- True,
- label=lg_conf["cache_label"],
- info=lg_conf["cache_info"],
- )
- PREVIEW = gr.Checkbox(
- label="Preview", info=lg_conf["preview_info"]
- )
- is_gui_dummy_check = gr.Checkbox(
- True, visible=False
- )
-
- with gr.Column(variant="compact"):
- edit_sub_check = gr.Checkbox(
- label=lg_conf["edit_sub_label"],
- info=lg_conf["edit_sub_info"],
-                    interactive=(os.environ.get("IS_DEMO") != "TRUE"),
- )
- dummy_false_check = gr.Checkbox(
- False,
- visible=False,
- )
-
-                def visible_component_subs(input_bool):
-                    return (
-                        gr.update(visible=input_bool),
-                        gr.update(visible=input_bool),
-                    )
-
- subs_button = gr.Button(
- lg_conf["button_subs"],
- variant="primary",
- visible=False,
- )
- subs_edit_space = gr.Textbox(
- visible=False,
- lines=10,
- label=lg_conf["editor_sub_label"],
- info=lg_conf["editor_sub_info"],
- placeholder=lg_conf["editor_sub_ph"],
- )
- edit_sub_check.change(
- visible_component_subs,
- [edit_sub_check],
- [subs_button, subs_edit_space],
- )
-
- with gr.Row():
- video_button = gr.Button(
- lg_conf["button_translate"],
- variant="primary",
- )
- with gr.Row():
-                video_output = gr.File(
-                    label=lg_conf["output_result_label"],
-                    file_count="multiple",
-                    interactive=False,
-                )  # gr.Video()
-
-            gr.HTML("<hr>")
-
-            HFKEY = gr.Textbox(
-                visible=not os.getenv("YOUR_HF_TOKEN"),
-                label="HF Token",
-                info=lg_conf["ht_token_info"],
-                placeholder=lg_conf["ht_token_ph"],
-            )
-
- gr.Examples(
- examples=[
- [
- ["./assets/Video_main.mp4"],
- "",
- "",
- "",
- False,
- whisper_model_default,
- batch_size_value,
- com_t_default,
- "Spanish (es)",
- "English (en)",
- 1,
- 2,
- "en-US-EmmaMultilingualNeural-Female",
- "en-US-AndrewMultilingualNeural-Male",
- ],
- ], # no update
- fn=SoniTr.batch_multilingual_media_conversion,
- inputs=[
- video_input,
- blink_input,
- directory_input,
- HFKEY,
- PREVIEW,
- WHISPER_MODEL_SIZE,
- batch_size,
- compute_type,
- SOURCE_LANGUAGE,
- TRANSLATE_AUDIO_TO,
- min_speakers,
- max_speakers,
- tts_voice00,
- tts_voice01,
- ],
- outputs=[video_output],
- cache_examples=False,
- )
-
- with gr.Tab(lg_conf["tab_docs"]):
- with gr.Column():
- with gr.Accordion("Docs", open=True):
- with gr.Column(variant="compact"):
- with gr.Column():
- input_doc_type = gr.Dropdown(
- [
- "WRITE TEXT",
- "SUBMIT DOCUMENT",
- "Find Document Path",
- ],
- value="SUBMIT DOCUMENT",
- label=lg_conf["docs_input_label"],
- info=lg_conf["docs_input_info"],
- )
-
- def swap_visibility(data_type):
- if data_type == "WRITE TEXT":
- return (
- gr.update(visible=True, value=""),
- gr.update(visible=False, value=None),
- gr.update(visible=False, value=""),
- )
- elif data_type == "SUBMIT DOCUMENT":
- return (
- gr.update(visible=False, value=""),
- gr.update(visible=True, value=None),
- gr.update(visible=False, value=""),
- )
- elif data_type == "Find Document Path":
- return (
- gr.update(visible=False, value=""),
- gr.update(visible=False, value=None),
- gr.update(visible=True, value=""),
- )
-
- text_docs = gr.Textbox(
- label="Text",
- value="This is an example",
- info="Write a text",
- placeholder="...",
- lines=5,
- visible=False,
- )
- input_docs = gr.File(
- label="Document", visible=True
- )
- directory_input_docs = gr.Textbox(
- visible=False,
- label="Document Path",
- info="Example: /home/my_doc.pdf",
- placeholder="Path goes here...",
- )
- input_doc_type.change(
- fn=swap_visibility,
- inputs=input_doc_type,
- outputs=[
- text_docs,
- input_docs,
- directory_input_docs,
- ],
- )
-
- gr.HTML()
-
- tts_documents = gr.Dropdown(
- list(
- filter(
- lambda x: x != "_XTTS_/AUTOMATIC.wav",
- SoniTr.tts_info.tts_list(),
- )
- ),
- value="en-US-EmmaMultilingualNeural-Female",
- label="TTS",
- visible=True,
- interactive=True,
- )
-
- gr.HTML()
-
- docs_SOURCE_LANGUAGE = gr.Dropdown(
- LANGUAGES_LIST[1:],
- value="English (en)",
- label=lg_conf["sl_label"],
- info=lg_conf["docs_source_info"],
- )
- docs_TRANSLATE_TO = gr.Dropdown(
- LANGUAGES_LIST[1:],
- value="English (en)",
- label=lg_conf["tat_label"],
- info=lg_conf["tat_info"],
- )
-
- with gr.Column():
- with gr.Accordion(
- lg_conf["extra_setting"], open=False
- ):
- docs_translate_process_dropdown = gr.Dropdown(
- DOCS_TRANSLATION_PROCESS_OPTIONS,
- value=DOCS_TRANSLATION_PROCESS_OPTIONS[
- 0
- ],
- label="Translation process",
- )
-
-                                gr.HTML("<hr>")
-
- docs_output_type = gr.Dropdown(
- DOCS_OUTPUT_TYPE_OPTIONS,
- value=DOCS_OUTPUT_TYPE_OPTIONS[2],
- label="Output type",
- )
- docs_OUTPUT_NAME = gr.Textbox(
- label="Final file name",
- value="",
- info=lg_conf["out_name_info"],
- )
- docs_chunk_size = gr.Number(
- label=lg_conf["chunk_size_label"],
- value=0,
- visible=True,
- interactive=True,
- info=lg_conf["chunk_size_info"],
- )
-                                gr.HTML("<hr>")
- start_page_gui = gr.Number(
- step=1,
- value=1,
- minimum=1,
- maximum=99999,
- label="Start page",
- )
- end_page_gui = gr.Number(
- step=1,
- value=99999,
- minimum=1,
- maximum=99999,
- label="End page",
- )
-                                gr.HTML("<hr>Videobook config")
- videobook_width_gui = gr.Number(
- step=1,
- value=1280,
- minimum=100,
- maximum=4096,
- label="Width",
- )
- videobook_height_gui = gr.Number(
- step=1,
- value=720,
- minimum=100,
- maximum=4096,
- label="Height",
- )
- videobook_bcolor_gui = gr.Dropdown(
- BORDER_COLORS,
- value=BORDER_COLORS[0],
- label="Border color",
- )
- docs_dummy_check = gr.Checkbox(
- True, visible=False
- )
-
- with gr.Row():
- docs_button = gr.Button(
- lg_conf["docs_button"],
- variant="primary",
- )
- with gr.Row():
- docs_output = gr.File(
- label="Result",
- interactive=False,
- )
-
- with gr.Tab("Custom voice R.V.C. (Optional)"):
-
- with gr.Column():
- with gr.Accordion("Get the R.V.C. Models", open=True):
- url_links = gr.Textbox(
- label="URLs",
- value="",
- info=lg_conf["cv_url_info"],
- placeholder="urls here...",
- lines=1,
- )
- download_finish = gr.HTML()
- download_button = gr.Button("DOWNLOAD MODELS")
-
- def update_models():
- models_path, index_path = upload_model_list()
-
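-                        # MAX_TTS voice configs plus one extra pair for the
-                        # "Test R.V.C." tab's model/index dropdowns.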
- dict_models = {
- f"fmodel{i:02d}": gr.update(
- choices=models_path
- )
- for i in range(MAX_TTS+1)
- }
- dict_index = {
- f"findex{i:02d}": gr.update(
- choices=index_path, value=None
- )
- for i in range(MAX_TTS+1)
- }
- dict_changes = {**dict_models, **dict_index}
-                        return list(dict_changes.values())
-
- with gr.Column():
- with gr.Accordion(lg_conf["replace_title"], open=False):
- with gr.Column(variant="compact"):
- with gr.Column():
- gr.Markdown(lg_conf["sec1_title"])
- enable_custom_voice = gr.Checkbox(
- False,
- label="ENABLE",
- info=lg_conf["enable_replace"]
- )
- workers_custom_voice = gr.Number(
- step=1,
- value=1,
- minimum=1,
- maximum=50,
- label="workers",
- visible=False,
- )
-
- gr.Markdown(lg_conf["sec2_title"])
- gr.Markdown(lg_conf["sec2_subtitle"])
-
- PITCH_ALGO_OPT = [
- "pm",
- "harvest",
- "crepe",
- "rmvpe",
- "rmvpe+",
- ]
-
- def model_conf():
- return gr.Dropdown(
- models_path,
- # value="",
- label="Model",
- visible=True,
- interactive=True,
- )
-
- def pitch_algo_conf():
- return gr.Dropdown(
- PITCH_ALGO_OPT,
- value=PITCH_ALGO_OPT[3],
- label="Pitch algorithm",
- visible=True,
- interactive=True,
- )
-
- def pitch_lvl_conf():
- return gr.Slider(
- label="Pitch level",
- minimum=-24,
- maximum=24,
- step=1,
- value=0,
- visible=True,
- interactive=True,
- )
-
- def index_conf():
- return gr.Dropdown(
- index_path,
- value=None,
- label="Index",
- visible=True,
- interactive=True,
- )
-
- def index_inf_conf():
- return gr.Slider(
- minimum=0,
- maximum=1,
- label="Index influence",
- value=0.75,
- )
-
- def respiration_filter_conf():
- return gr.Slider(
- minimum=0,
- maximum=7,
- label="Respiration median filtering",
- value=3,
- step=1,
- interactive=True,
- )
-
- def envelope_ratio_conf():
- return gr.Slider(
- minimum=0,
- maximum=1,
- label="Envelope ratio",
- value=0.25,
- interactive=True,
- )
-
- def consonant_protec_conf():
- return gr.Slider(
- minimum=0,
- maximum=0.5,
- label="Consonant breath protection",
- value=0.5,
- interactive=True,
- )
-
- def button_conf(tts_name):
- return gr.Button(
- lg_conf["cv_button_apply"]+" "+tts_name,
- variant="primary",
- )
-
- TTS_TABS = [
- 'TTS Speaker {:02d}'.format(i) for i in range(1, MAX_TTS+1)
- ]
-
- CV_SUBTITLES = [
- lg_conf["cv_tts1"],
- lg_conf["cv_tts2"],
- lg_conf["cv_tts3"],
- lg_conf["cv_tts4"],
- lg_conf["cv_tts5"],
- lg_conf["cv_tts6"],
- lg_conf["cv_tts7"],
- lg_conf["cv_tts8"],
- lg_conf["cv_tts9"],
- lg_conf["cv_tts10"],
- lg_conf["cv_tts11"],
- lg_conf["cv_tts12"],
- ]
-
- configs_storage = []
-
- for i in range(MAX_TTS): # Loop from 00 to 11
- with gr.Accordion(CV_SUBTITLES[i], open=False):
- gr.Markdown(TTS_TABS[i])
- with gr.Column():
- tag_gui = gr.Textbox(
- value=TTS_TABS[i], visible=False
- )
- model_gui = model_conf()
- pitch_algo_gui = pitch_algo_conf()
- pitch_lvl_gui = pitch_lvl_conf()
- index_gui = index_conf()
- index_inf_gui = index_inf_conf()
- rmf_gui = respiration_filter_conf()
- er_gui = envelope_ratio_conf()
- cbp_gui = consonant_protec_conf()
-
- with gr.Row(variant="compact"):
- button_config = button_conf(
- TTS_TABS[i]
- )
-
- confirm_conf = gr.HTML()
-
- button_config.click(
- SoniTr.vci.apply_conf,
- inputs=[
- tag_gui,
- model_gui,
- pitch_algo_gui,
- pitch_lvl_gui,
- index_gui,
- index_inf_gui,
- rmf_gui,
- er_gui,
- cbp_gui,
- ],
- outputs=[confirm_conf],
- )
-
- configs_storage.append({
- "tag": tag_gui,
- "model": model_gui,
- "index": index_gui,
- })
-
- with gr.Column():
- with gr.Accordion("Test R.V.C.", open=False):
- with gr.Row(variant="compact"):
- text_test = gr.Textbox(
- label="Text",
- value="This is an example",
- info="write a text",
- placeholder="...",
- lines=5,
- )
- with gr.Column():
- tts_test = gr.Dropdown(
- sorted(SoniTr.tts_info.list_edge),
- value="en-GB-ThomasNeural-Male",
- label="TTS",
- visible=True,
- interactive=True,
- )
- model_test = model_conf()
- index_test = index_conf()
- pitch_test = pitch_lvl_conf()
- pitch_alg_test = pitch_algo_conf()
- with gr.Row(variant="compact"):
- button_test = gr.Button("Test audio")
-
- with gr.Column():
- with gr.Row():
- original_ttsvoice = gr.Audio()
- ttsvoice = gr.Audio()
-
- button_test.click(
- SoniTr.vci.make_test,
- inputs=[
- text_test,
- tts_test,
- model_test,
- index_test,
- pitch_test,
- pitch_alg_test,
- ],
- outputs=[ttsvoice, original_ttsvoice],
- )
-
- download_button.click(
- download_list,
- [url_links],
- [download_finish],
- queue=False
- ).then(
- update_models,
- [],
- [
- elem["model"] for elem in configs_storage
- ] + [model_test] + [
- elem["index"] for elem in configs_storage
- ] + [index_test],
- )
-
- with gr.Tab(lg_conf["tab_help"]):
- gr.Markdown(lg_conf["tutorial"])
- gr.Markdown(news)
-
- def play_sound_alert(play_sound):
-
- if not play_sound:
- return None
-
- # silent_sound = "assets/empty_audio.mp3"
- sound_alert = "assets/sound_alert.mp3"
-
- time.sleep(0.25)
- # yield silent_sound
- yield None
-
- time.sleep(0.25)
- yield sound_alert
-
- sound_alert_notification = gr.Audio(
- value=None,
- type="filepath",
- format="mp3",
- autoplay=True,
- visible=False,
- )
-
- if logs_in_gui:
- logger.info("Logs in gui need public url")
-
- class Logger:
- def __init__(self, filename):
- self.terminal = sys.stdout
- self.log = open(filename, "w")
-
- def write(self, message):
- self.terminal.write(message)
- self.log.write(message)
-
- def flush(self):
- self.terminal.flush()
- self.log.flush()
-
- def isatty(self):
- return False
-
- sys.stdout = Logger("output.log")
-
- def read_logs():
- sys.stdout.flush()
- with open("output.log", "r") as f:
- return f.read()
-
- with gr.Accordion("Logs", open=False):
- logs = gr.Textbox(label=">>>")
- app.load(read_logs, None, logs, every=1)
-
- if SoniTr.tts_info.xtts_enabled:
- # Update tts list
- def update_tts_list():
- update_dict = {
- f"tts_voice{i:02d}": gr.update(choices=SoniTr.tts_info.tts_list())
- for i in range(MAX_TTS)
- }
- update_dict["tts_documents"] = gr.update(
- choices=list(
- filter(
- lambda x: x != "_XTTS_/AUTOMATIC.wav",
- SoniTr.tts_info.tts_list(),
- )
- )
- )
- return [value for value in update_dict.values()]
-
- create_xtts_wav.click(
- create_wav_file_vc,
- inputs=[
- wav_speaker_name,
- wav_speaker_file,
- wav_speaker_start,
- wav_speaker_end,
- wav_speaker_dir,
- wav_speaker_dereverb,
- ],
- outputs=[wav_speaker_output],
- ).then(
- update_tts_list,
- None,
- [
- tts_voice00,
- tts_voice01,
- tts_voice02,
- tts_voice03,
- tts_voice04,
- tts_voice05,
- tts_voice06,
- tts_voice07,
- tts_voice08,
- tts_voice09,
- tts_voice10,
- tts_voice11,
- tts_documents,
- ],
- )
-
- # Run translate text
- subs_button.click(
- SoniTr.batch_multilingual_media_conversion,
- inputs=[
- video_input,
- blink_input,
- directory_input,
- HFKEY,
- PREVIEW,
- WHISPER_MODEL_SIZE,
- batch_size,
- compute_type,
- SOURCE_LANGUAGE,
- TRANSLATE_AUDIO_TO,
- min_speakers,
- max_speakers,
- tts_voice00,
- tts_voice01,
- tts_voice02,
- tts_voice03,
- tts_voice04,
- tts_voice05,
- tts_voice06,
- tts_voice07,
- tts_voice08,
- tts_voice09,
- tts_voice10,
- tts_voice11,
- VIDEO_OUTPUT_NAME,
- AUDIO_MIX,
- audio_accelerate,
- acceleration_rate_regulation_gui,
- volume_original_mix,
- volume_translated_mix,
- sub_type_output,
- edit_sub_check, # TRUE BY DEFAULT
- dummy_false_check, # dummy false
- subs_edit_space,
- avoid_overlap_gui,
- vocal_refinement_gui,
- literalize_numbers_gui,
- segment_duration_limit_gui,
- diarization_process_dropdown,
- translate_process_dropdown,
- input_srt,
- main_output_type,
- main_voiceless_track,
- voice_imitation_gui,
- voice_imitation_max_segments_gui,
- voice_imitation_vocals_dereverb_gui,
- voice_imitation_remove_previous_gui,
- voice_imitation_method_gui,
- wav_speaker_dereverb,
- text_segmentation_scale_gui,
- divide_text_segments_by_gui,
- soft_subtitles_to_video_gui,
- burn_subtitles_to_video_gui,
- enable_cache_gui,
- enable_custom_voice,
- workers_custom_voice,
- is_gui_dummy_check,
- ],
- outputs=subs_edit_space,
- ).then(
- play_sound_alert, [play_sound_gui], [sound_alert_notification]
- )
-
- # Run translate tts and complete
- video_button.click(
- SoniTr.batch_multilingual_media_conversion,
- inputs=[
- video_input,
- blink_input,
- directory_input,
- HFKEY,
- PREVIEW,
- WHISPER_MODEL_SIZE,
- batch_size,
- compute_type,
- SOURCE_LANGUAGE,
- TRANSLATE_AUDIO_TO,
- min_speakers,
- max_speakers,
- tts_voice00,
- tts_voice01,
- tts_voice02,
- tts_voice03,
- tts_voice04,
- tts_voice05,
- tts_voice06,
- tts_voice07,
- tts_voice08,
- tts_voice09,
- tts_voice10,
- tts_voice11,
- VIDEO_OUTPUT_NAME,
- AUDIO_MIX,
- audio_accelerate,
- acceleration_rate_regulation_gui,
- volume_original_mix,
- volume_translated_mix,
- sub_type_output,
- dummy_false_check,
- edit_sub_check,
- subs_edit_space,
- avoid_overlap_gui,
- vocal_refinement_gui,
- literalize_numbers_gui,
- segment_duration_limit_gui,
- diarization_process_dropdown,
- translate_process_dropdown,
- input_srt,
- main_output_type,
- main_voiceless_track,
- voice_imitation_gui,
- voice_imitation_max_segments_gui,
- voice_imitation_vocals_dereverb_gui,
- voice_imitation_remove_previous_gui,
- voice_imitation_method_gui,
- wav_speaker_dereverb,
- text_segmentation_scale_gui,
- divide_text_segments_by_gui,
- soft_subtitles_to_video_gui,
- burn_subtitles_to_video_gui,
- enable_cache_gui,
- enable_custom_voice,
- workers_custom_voice,
- is_gui_dummy_check,
- ],
- outputs=video_output,
- trigger_mode="multiple",
- ).then(
- play_sound_alert, [play_sound_gui], [sound_alert_notification]
- )
-
- # Run docs process
- docs_button.click(
- SoniTr.multilingual_docs_conversion,
- inputs=[
- text_docs,
- input_docs,
- directory_input_docs,
- docs_SOURCE_LANGUAGE,
- docs_TRANSLATE_TO,
- tts_documents,
- docs_OUTPUT_NAME,
- docs_translate_process_dropdown,
- docs_output_type,
- docs_chunk_size,
- enable_custom_voice,
- workers_custom_voice,
- start_page_gui,
- end_page_gui,
- videobook_width_gui,
- videobook_height_gui,
- videobook_bcolor_gui,
- docs_dummy_check,
- ],
- outputs=docs_output,
- trigger_mode="multiple",
- ).then(
- play_sound_alert, [play_sound_gui], [sound_alert_notification]
- )
-
- return app
-
-
-def get_language_config(language_data, language=None, base_key="english"):
- base_lang = language_data.get(base_key)
-
- if language not in language_data:
- logger.error(
- f"Language {language} not found, defaulting to {base_key}"
- )
- return base_lang
-
- lg_conf = language_data.get(language, {})
- lg_conf.update((k, v) for k, v in base_lang.items() if k not in lg_conf)
-
- return lg_conf
-
-
-def create_parser():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
- parser.add_argument(
- "--theme",
- type=str,
- default="Taithrah/Minimal",
- help=(
- "Specify the theme; find themes in "
- "https://huggingface.co/spaces/gradio/theme-gallery;"
- " Example: --theme aliabid94/new-theme"
- ),
- )
- parser.add_argument(
- "--public_url",
- action="store_true",
- default=False,
- help="Enable public link",
- )
- parser.add_argument(
- "--logs_in_gui",
- action="store_true",
- default=False,
- help="Displays the operations performed in Logs",
- )
- parser.add_argument(
- "--verbosity_level",
- type=str,
- default="info",
- help=(
- "Set logger verbosity level: "
- "debug, info, warning, error, or critical"
- ),
- )
- parser.add_argument(
- "--language",
- type=str,
- default="english",
- help=" Select the language of the interface: english, spanish",
- )
- parser.add_argument(
- "--cpu_mode",
- action="store_true",
- default=False,
- help="Enable CPU mode to run the program without utilizing GPU acceleration.",
- )
- return parser
-
-
-if __name__ == "__main__":
-
- parser = create_parser()
-
- args = parser.parse_args()
- # Simulating command-line arguments
- # args_list = "--theme aliabid94/new-theme --public_url".split()
- # args = parser.parse_args(args_list)
-
- set_logging_level(args.verbosity_level)
-
- for id_model in UVR_MODELS:
- download_manager(
- os.path.join(MDX_DOWNLOAD_LINK, id_model), mdxnet_models_dir
- )
-
- models_path, index_path = upload_model_list()
-
- SoniTr = SoniTranslate(cpu_mode=args.cpu_mode if os.environ.get("ZERO_GPU") != "TRUE" else "cpu")
-
- lg_conf = get_language_config(language_data, language=args.language)
-
- app = create_gui(args.theme, logs_in_gui=args.logs_in_gui)
-
- app.queue()
-
- app.launch(
- max_threads=1,
- share=args.public_url,
- show_error=True,
- quiet=False,
- debug=(True if logger.isEnabledFor(logging.DEBUG) else False),
- )
+import gradio as gr
+from soni_translate.logging_setup import (
+ logger,
+ set_logging_level,
+ configure_logging_libs,
+); configure_logging_libs() # noqa
+import whisperx
+import torch
+import os
+from soni_translate.audio_segments import create_translated_audio
+from soni_translate.text_to_speech import (
+ audio_segmentation_to_voice,
+ edge_tts_voices_list,
+ coqui_xtts_voices_list,
+ piper_tts_voices_list,
+ create_wav_file_vc,
+ accelerate_segments,
+)
+from soni_translate.translate_segments import (
+ translate_text,
+ TRANSLATION_PROCESS_OPTIONS,
+ DOCS_TRANSLATION_PROCESS_OPTIONS
+)
+from soni_translate.preprocessor import (
+ audio_video_preprocessor,
+ audio_preprocessor,
+)
+from soni_translate.postprocessor import (
+ OUTPUT_TYPE_OPTIONS,
+ DOCS_OUTPUT_TYPE_OPTIONS,
+ sound_separate,
+ get_no_ext_filename,
+ media_out,
+ get_subtitle_speaker,
+)
+from soni_translate.language_configuration import (
+ LANGUAGES,
+ UNIDIRECTIONAL_L_LIST,
+ LANGUAGES_LIST,
+ BARK_VOICES_LIST,
+ VITS_VOICES_LIST,
+ OPENAI_TTS_MODELS,
+)
+from soni_translate.utils import (
+ remove_files,
+ download_list,
+ upload_model_list,
+ download_manager,
+ run_command,
+ is_audio_file,
+ is_subtitle_file,
+ copy_files,
+ get_valid_files,
+ get_link_list,
+ remove_directory_contents,
+)
+from soni_translate.mdx_net import (
+ UVR_MODELS,
+ MDX_DOWNLOAD_LINK,
+ mdxnet_models_dir,
+)
+from soni_translate.speech_segmentation import (
+ ASR_MODEL_OPTIONS,
+ COMPUTE_TYPE_GPU,
+ COMPUTE_TYPE_CPU,
+ find_whisper_models,
+ transcribe_speech,
+ align_speech,
+ diarize_speech,
+ diarization_models,
+)
+from soni_translate.text_multiformat_processor import (
+ BORDER_COLORS,
+ srt_file_to_segments,
+ document_preprocessor,
+ determine_chunk_size,
+ plain_text_to_segments,
+ segments_to_plain_text,
+ process_subtitles,
+ linguistic_level_segments,
+ break_aling_segments,
+ doc_to_txtximg_pages,
+ page_data_to_segments,
+ update_page_data,
+ fix_timestamps_docs,
+ create_video_from_images,
+ merge_video_and_audio,
+)
+from soni_translate.languages_gui import language_data, news
+import copy
+import logging
+import json
+from pydub import AudioSegment
+from voice_main import ClassVoices
+import argparse
+import time
+import hashlib
+import sys
+
+directories = [
+ "downloads",
+ "logs",
+ "weights",
+ "clean_song_output",
+ "_XTTS_",
+ f"audio2{os.sep}audio",
+ "audio",
+ "outputs",
+]
+for directory in directories:
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+
+class TTS_Info:
+ def __init__(self, piper_enabled, xtts_enabled):
+ self.list_edge = edge_tts_voices_list()
+ self.list_bark = list(BARK_VOICES_LIST.keys())
+ self.list_vits = list(VITS_VOICES_LIST.keys())
+ self.list_openai_tts = OPENAI_TTS_MODELS
+ self.piper_enabled = piper_enabled
+ self.list_vits_onnx = (
+ piper_tts_voices_list() if self.piper_enabled else []
+ )
+ self.xtts_enabled = xtts_enabled
+
+ def tts_list(self):
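+        """Return every enabled TTS voice, with Coqui XTTS entries first."""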
+ self.list_coqui_xtts = (
+ coqui_xtts_voices_list() if self.xtts_enabled else []
+ )
+ list_tts = self.list_coqui_xtts + sorted(
+ self.list_edge
+ + self.list_bark
+ + self.list_vits
+ + self.list_openai_tts
+ + self.list_vits_onnx
+ )
+ return list_tts
+
+
+def prog_disp(msg, percent, is_gui, progress=None):
+ logger.info(msg)
+ if is_gui:
+ progress(percent, desc=msg)
+
+
+def warn_disp(wrn_lang, is_gui):
+ logger.warning(wrn_lang)
+ if is_gui:
+ gr.Warning(wrn_lang)
+
+
+class SoniTrCache:
+ def __init__(self):
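+        # Keys are the ordered pipeline stages; invalidating one stage
+        # also flushes every stage after it (see task_in_cache).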
+ self.cache = {
+ 'media': [[]],
+ 'refine_vocals': [],
+ 'transcript_align': [],
+ 'break_align': [],
+ 'diarize': [],
+ 'translate': [],
+ 'subs_and_edit': [],
+ 'tts': [],
+ 'acc_and_vc': [],
+ 'mix_aud': [],
+ 'output': []
+ }
+
+ self.cache_data = {
+ 'media': [],
+ 'refine_vocals': [],
+ 'transcript_align': [],
+ 'break_align': [],
+ 'diarize': [],
+ 'translate': [],
+ 'subs_and_edit': [],
+ 'tts': [],
+ 'acc_and_vc': [],
+ 'mix_aud': [],
+ 'output': []
+ }
+
+ self.cache_keys = list(self.cache.keys())
+ self.first_task = self.cache_keys[0]
+ self.last_task = self.cache_keys[-1]
+
+ self.pre_step = None
+ self.pre_params = []
+
+ def set_variable(self, variable_name, value):
+ setattr(self, variable_name, value)
+
+ def task_in_cache(self, step: str, params: list, previous_step_data: dict):
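+        """Commit the previous step's results, then return True and restore
+        this step's cached outputs when `params` match the cached entry;
+        otherwise flush this step and all later steps and return False."""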
+
+ self.pre_step_cache = None
+
+ if step == self.first_task:
+ self.pre_step = None
+
+ if self.pre_step:
+ self.cache[self.pre_step] = self.pre_params
+
+ # Fill data in cache
+ self.cache_data[self.pre_step] = copy.deepcopy(previous_step_data)
+
+ self.pre_params = params
+ # logger.debug(f"Step: {str(step)}, Cache params: {str(self.cache)}")
+ if params == self.cache[step]:
+ logger.debug(f"In cache: {str(step)}")
+
+            # Restore this step's stored outputs from cache_data so the
+            # step itself can be skipped.
+ for key, value in self.cache_data[step].items():
+ self.set_variable(key, copy.deepcopy(value))
+ logger.debug(
+                    f"Cache load: {str(key)}"
+ )
+
+ self.pre_step = step
+ return True
+
+ else:
+            logger.debug(f"Flushing later steps and caching {str(step)}")
+ selected_index = self.cache_keys.index(step)
+
+ for idx, key in enumerate(self.cache.keys()):
+ if idx >= selected_index:
+ self.cache[key] = []
+ self.cache_data[key] = {}
+
+            # The current step becomes the previous one for the next call
+ self.pre_step = step
+ return False
+
+ def clear_cache(self, media, force=False):
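+        """Flush all cached steps when `media` differs from the cached
+        media entry or when `force` is set."""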
+
+ self.cache["media"] = (
+ self.cache["media"] if len(self.cache["media"]) else [[]]
+ )
+
+ if media != self.cache["media"][0] or force:
+
+ # Clear cache
+ self.cache = {key: [] for key in self.cache}
+ self.cache["media"] = [[]]
+
+ logger.info("Cache flushed")
+
+
+def get_hash(filepath):
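+    """Hash the file in 8 KiB chunks with BLAKE2b and return the first
+    18 hex digits of the digest, used as a lightweight cache key."""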
+ with open(filepath, 'rb') as f:
+ file_hash = hashlib.blake2b()
+ while chunk := f.read(8192):
+ file_hash.update(chunk)
+
+ return file_hash.hexdigest()[:18]
+
+
+def check_openai_api_key():
+ if not os.environ.get("OPENAI_API_KEY"):
+ raise ValueError(
+            "To use GPT for translation, set the OPENAI_API_KEY "
+            "environment variable (e.g. on Linux: "
+            "export OPENAI_API_KEY='your-api-key-here'), or choose a "
+            "different translation process in Advanced settings."
+ )
+
+
+class SoniTranslate(SoniTrCache):
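+    """Top-level translation and dubbing pipeline, with step-level
+    caching inherited from SoniTrCache."""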
+ def __init__(self, cpu_mode=False):
+ super().__init__()
+ if cpu_mode:
+ os.environ["SONITR_DEVICE"] = "cpu"
+ else:
+ os.environ["SONITR_DEVICE"] = (
+ "cuda" if torch.cuda.is_available() else "cpu"
+ )
+
+ self.device = os.environ.get("SONITR_DEVICE")
+ self.result_diarize = None
+ self.align_language = None
+ self.result_source_lang = None
+ self.edit_subs_complete = False
+ self.voiceless_id = None
+ self.burn_subs_id = None
+
+ self.vci = ClassVoices(only_cpu=cpu_mode)
+
+ self.tts_voices = self.get_tts_voice_list()
+
+ logger.info(f"Working in: {self.device}")
+
+ def get_tts_voice_list(self):
+ try:
+ from piper import PiperVoice # noqa
+
+ piper_enabled = True
+ logger.info("PIPER TTS enabled")
+ except Exception as error:
+ logger.debug(str(error))
+ piper_enabled = False
+ logger.info("PIPER TTS disabled")
+ try:
+ from TTS.api import TTS # noqa
+
+ xtts_enabled = True
+ logger.info("Coqui XTTS enabled")
+ logger.info(
+ "In this app, by using Coqui TTS (text-to-speech), you "
+ "acknowledge and agree to the license.\n"
+ "You confirm that you have read, understood, and agreed "
+ "to the Terms and Conditions specified at the following "
+ "link:\nhttps://coqui.ai/cpml.txt."
+ )
+ os.environ["COQUI_TOS_AGREED"] = "1"
+ except Exception as error:
+ logger.debug(str(error))
+ xtts_enabled = False
+ logger.info("Coqui XTTS disabled")
+
+ self.tts_info = TTS_Info(piper_enabled, xtts_enabled)
+
+ return self.tts_info.tts_list()
+
+ def batch_multilingual_media_conversion(self, *kwargs):
+ # logger.debug(str(kwargs))
+
+ media_file_arg = kwargs[0] if kwargs[0] is not None else []
+
+ link_media_arg = kwargs[1]
+ link_media_arg = [x.strip() for x in link_media_arg.split(',')]
+ link_media_arg = get_link_list(link_media_arg)
+
+ path_arg = kwargs[2]
+ path_arg = [x.strip() for x in path_arg.split(',')]
+ path_arg = get_valid_files(path_arg)
+
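+        # Positional indices mirror the GUI input list wired to this
+        # callback: 31 and 32 carry the get_translated_text and
+        # get_video_from_text_json flags of multilingual_media_conversion.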
+ edit_text_arg = kwargs[31]
+ get_text_arg = kwargs[32]
+
+ is_gui_arg = kwargs[-1]
+
+ kwargs = kwargs[3:]
+
+ media_batch = media_file_arg + link_media_arg + path_arg
+ media_batch = list(filter(lambda x: x != "", media_batch))
+ media_batch = media_batch if media_batch else [None]
+ logger.debug(str(media_batch))
+
+ remove_directory_contents("outputs")
+
+ if edit_text_arg or get_text_arg:
+ return self.multilingual_media_conversion(
+ media_batch[0], "", "", *kwargs
+ )
+
+ if "SET_LIMIT" == os.getenv("DEMO"):
+ media_batch = [media_batch[0]]
+
+ result = []
+ for media in media_batch:
+ # Call the nested function with the parameters
+ output_file = self.multilingual_media_conversion(
+ media, "", "", *kwargs
+ )
+
+ if isinstance(output_file, str):
+ output_file = [output_file]
+ result.extend(output_file)
+
+ if is_gui_arg and len(media_batch) > 1:
+ gr.Info(f"Done: {os.path.basename(output_file[0])}")
+
+ return result
+
+ def multilingual_media_conversion(
+ self,
+ media_file=None,
+ link_media="",
+ directory_input="",
+ YOUR_HF_TOKEN="",
+ preview=False,
+ transcriber_model="large-v3",
+ batch_size=4,
+ compute_type="auto",
+ origin_language="Automatic detection",
+ target_language="English (en)",
+ min_speakers=1,
+ max_speakers=1,
+ tts_voice00="en-US-EmmaMultilingualNeural-Female",
+ tts_voice01="en-US-AndrewMultilingualNeural-Male",
+ tts_voice02="en-US-AvaMultilingualNeural-Female",
+ tts_voice03="en-US-BrianMultilingualNeural-Male",
+ tts_voice04="de-DE-SeraphinaMultilingualNeural-Female",
+ tts_voice05="de-DE-FlorianMultilingualNeural-Male",
+ tts_voice06="fr-FR-VivienneMultilingualNeural-Female",
+ tts_voice07="fr-FR-RemyMultilingualNeural-Male",
+ tts_voice08="en-US-EmmaMultilingualNeural-Female",
+ tts_voice09="en-US-AndrewMultilingualNeural-Male",
+ tts_voice10="en-US-EmmaMultilingualNeural-Female",
+ tts_voice11="en-US-AndrewMultilingualNeural-Male",
+ video_output_name="",
+ mix_method_audio="Adjusting volumes and mixing audio",
+ max_accelerate_audio=2.1,
+ acceleration_rate_regulation=False,
+ volume_original_audio=0.25,
+ volume_translated_audio=1.80,
+ output_format_subtitle="srt",
+ get_translated_text=False,
+ get_video_from_text_json=False,
+ text_json="{}",
+ avoid_overlap=False,
+ vocal_refinement=False,
+ literalize_numbers=True,
+ segment_duration_limit=15,
+ diarization_model="pyannote_2.1",
+ translate_process="google_translator_batch",
+ subtitle_file=None,
+ output_type="video (mp4)",
+ voiceless_track=False,
+ voice_imitation=False,
+ voice_imitation_max_segments=3,
+ voice_imitation_vocals_dereverb=False,
+ voice_imitation_remove_previous=True,
+ voice_imitation_method="freevc",
+ dereverb_automatic_xtts=True,
+ text_segmentation_scale="sentence",
+ divide_text_segments_by="",
+ soft_subtitles_to_video=True,
+ burn_subtitles_to_video=False,
+ enable_cache=True,
+ custom_voices=False,
+ custom_voices_workers=1,
+ is_gui=False,
+ progress=gr.Progress(),
+ ):
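+        """Run the full dubbing pipeline for one media item: preprocess,
+        transcribe, align, diarize, translate, synthesize speech, mix,
+        and mux. Returns the requested output path(s), or the editable
+        JSON string when `get_translated_text` is set."""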
+ if not YOUR_HF_TOKEN:
+ YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")
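+        # A Hugging Face token is only required when diarization runs
+        # for more than one speaker.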
+ if diarization_model == "disable" or max_speakers == 1:
+ if YOUR_HF_TOKEN is None:
+ YOUR_HF_TOKEN = ""
+ elif not YOUR_HF_TOKEN:
+ raise ValueError("No valid Hugging Face token")
+ else:
+ os.environ["YOUR_HF_TOKEN"] = YOUR_HF_TOKEN
+
+ if (
+ "gpt" in translate_process
+ or transcriber_model == "OpenAI_API_Whisper"
+ or "OpenAI-TTS" in tts_voice00
+ ):
+ check_openai_api_key()
+
+ if media_file is None:
+ media_file = (
+ directory_input
+ if os.path.exists(directory_input)
+ else link_media
+ )
+ media_file = (
+ media_file if isinstance(media_file, str) else media_file.name
+ )
+
+ if is_subtitle_file(media_file):
+ subtitle_file = media_file
+ media_file = ""
+
+ if media_file is None:
+ media_file = ""
+
+ if not origin_language:
+ origin_language = "Automatic detection"
+
+ if origin_language in UNIDIRECTIONAL_L_LIST and not subtitle_file:
+ raise ValueError(
+ f"The language '{origin_language}' "
+ "is not supported for transcription (ASR)."
+ )
+
+ if get_translated_text:
+ self.edit_subs_complete = False
+ if get_video_from_text_json:
+ if not self.edit_subs_complete:
+ raise ValueError("Generate the transcription first.")
+
+ if (
+ ("sound" in output_type or output_type == "raw media")
+ and (get_translated_text or get_video_from_text_json)
+ ):
+ raise ValueError(
+ "Please disable 'edit generate subtitles' "
+ f"first to acquire the {output_type}."
+ )
+
+ TRANSLATE_AUDIO_TO = LANGUAGES[target_language]
+ SOURCE_LANGUAGE = LANGUAGES[origin_language]
+
+ if (
+ transcriber_model == "OpenAI_API_Whisper"
+ and SOURCE_LANGUAGE == "zh-TW"
+ ):
+ logger.warning(
+ "OpenAI API Whisper only supports Chinese (Simplified)."
+ )
+ SOURCE_LANGUAGE = "zh"
+
+ if (
+ text_segmentation_scale in ["word", "character"]
+ and "subtitle" not in output_type
+ ):
+ wrn_lang = (
+ "Text segmentation by words or characters is typically"
+ " used for generating subtitles. If subtitles are not the"
+                " intended output, consider selecting the 'sentence'"
+                " segmentation method to ensure optimal results."
+            )
+ warn_disp(wrn_lang, is_gui)
+
+ if tts_voice00[:2].lower() != TRANSLATE_AUDIO_TO[:2].lower():
+ wrn_lang = (
+ "Make sure to select a 'TTS Speaker' suitable for"
+ " the translation language to avoid errors with the TTS."
+ )
+ warn_disp(wrn_lang, is_gui)
+
+ if "_XTTS_" in tts_voice00 and voice_imitation:
+ wrn_lang = (
+ "When you select XTTS, it is advisable "
+ "to disable Voice Imitation."
+ )
+ warn_disp(wrn_lang, is_gui)
+
+ if custom_voices and voice_imitation:
+ wrn_lang = (
+ "When you use R.V.C. models, it is advisable"
+ " to disable Voice Imitation."
+ )
+ warn_disp(wrn_lang, is_gui)
+
+ if not media_file and not subtitle_file:
+ raise ValueError(
+                "Specify a media or SRT file in advanced settings"
+ )
+
+ if subtitle_file:
+ subtitle_file = (
+ subtitle_file
+ if isinstance(subtitle_file, str)
+ else subtitle_file.name
+ )
+
+ if subtitle_file and SOURCE_LANGUAGE == "Automatic detection":
+ raise Exception(
+ "To use an SRT file, you need to specify its "
+ "original language (Source language)"
+ )
+
+ if not media_file and subtitle_file:
+ diarization_model = "disable"
+ media_file = "audio_support.wav"
+ if not get_video_from_text_json:
+ remove_files(media_file)
+ srt_data = srt_file_to_segments(subtitle_file)
+ total_duration = srt_data["segments"][-1]["end"] + 30.
+ support_audio = AudioSegment.silent(
+ duration=int(total_duration * 1000)
+ )
+ support_audio.export(
+ media_file, format="wav"
+ )
+                logger.info("Created supporting audio for the SRT file.")
+
+ if "SET_LIMIT" == os.getenv("DEMO"):
+ preview = True
+ mix_method_audio = "Adjusting volumes and mixing audio"
+ transcriber_model = "medium"
+ logger.info(
+ "DEMO; set preview=True; Generation is limited to "
+ "10 seconds to prevent CPU errors. No limitations with GPU.\n"
+ "DEMO; set Adjusting volumes and mixing audio\n"
+ "DEMO; set whisper model to medium"
+ )
+
+        # On CPU, fall back to a supported compute type
+ if self.device == "cpu" and compute_type not in COMPUTE_TYPE_CPU:
+ logger.info("Compute type changed to float32")
+ compute_type = "float32"
+
+ base_video_file = "Video.mp4"
+ base_audio_wav = "audio.wav"
+ dub_audio_file = "audio_dub_solo.ogg"
+ vocals_audio_file = "audio_Vocals_DeReverb.wav"
+ voiceless_audio_file = "audio_Voiceless.wav"
+ mix_audio_file = "audio_mix.mp3"
+ vid_subs = "video_subs_file.mp4"
+ video_output_file = "video_dub.mp4"
+
+ if os.path.exists(media_file):
+ media_base_hash = get_hash(media_file)
+ else:
+ media_base_hash = media_file
+ self.clear_cache(media_base_hash, force=(not enable_cache))
+
+ if not get_video_from_text_json:
+ self.result_diarize = (
+ self.align_language
+ ) = self.result_source_lang = None
+ if not self.task_in_cache("media", [media_base_hash, preview], {}):
+ if is_audio_file(media_file):
+ prog_disp(
+ "Processing audio...", 0.15, is_gui, progress=progress
+ )
+ audio_preprocessor(preview, media_file, base_audio_wav)
+ else:
+ prog_disp(
+ "Processing video...", 0.15, is_gui, progress=progress
+ )
+ audio_video_preprocessor(
+ preview, media_file, base_video_file, base_audio_wav
+ )
+ logger.debug("Set file complete.")
+
+ if "sound" in output_type:
+ prog_disp(
+ "Separating sounds in the file...",
+ 0.50,
+ is_gui,
+ progress=progress
+ )
+ separate_out = sound_separate(base_audio_wav, output_type)
+ final_outputs = []
+ for out in separate_out:
+ final_name = media_out(
+ media_file,
+ f"{get_no_ext_filename(out)}",
+ video_output_name,
+ "wav",
+ file_obj=out,
+ )
+ final_outputs.append(final_name)
+ logger.info(f"Done: {str(final_outputs)}")
+ return final_outputs
+
+ if output_type == "raw media":
+ output = media_out(
+ media_file,
+ "raw_media",
+ video_output_name,
+ "wav" if is_audio_file(media_file) else "mp4",
+ file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file,
+ )
+ logger.info(f"Done: {output}")
+ return output
+
+ if not self.task_in_cache("refine_vocals", [vocal_refinement], {}):
+ self.vocals = None
+ if vocal_refinement:
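+                # Extract dereverbed vocals with a UVR model so transcription
+                # and diarization run on cleaner speech.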
+ try:
+ from soni_translate.mdx_net import process_uvr_task
+ _, _, _, _, file_vocals = process_uvr_task(
+ orig_song_path=base_audio_wav,
+ main_vocals=False,
+ dereverb=True,
+ remove_files_output_dir=True,
+ )
+ remove_files(vocals_audio_file)
+ copy_files(file_vocals, ".")
+ self.vocals = vocals_audio_file
+ except Exception as error:
+ logger.error(str(error))
+
+ if not self.task_in_cache("transcript_align", [
+ subtitle_file,
+ SOURCE_LANGUAGE,
+ transcriber_model,
+ compute_type,
+ batch_size,
+ literalize_numbers,
+ segment_duration_limit,
+ (
+ "l_unit"
+ if text_segmentation_scale in ["word", "character"]
+ and subtitle_file
+ else "sentence"
+ )
+ ], {"vocals": self.vocals}):
+ if subtitle_file:
+ prog_disp(
+ "From SRT file...", 0.30, is_gui, progress=progress
+ )
+ audio = whisperx.load_audio(
+ base_audio_wav if not self.vocals else self.vocals
+ )
+ self.result = srt_file_to_segments(subtitle_file)
+ self.result["language"] = SOURCE_LANGUAGE
+ else:
+ prog_disp(
+ "Transcribing...", 0.30, is_gui, progress=progress
+ )
+ SOURCE_LANGUAGE = (
+ None
+ if SOURCE_LANGUAGE == "Automatic detection"
+ else SOURCE_LANGUAGE
+ )
+ audio, self.result = transcribe_speech(
+ base_audio_wav if not self.vocals else self.vocals,
+ transcriber_model,
+ compute_type,
+ batch_size,
+ SOURCE_LANGUAGE,
+ literalize_numbers,
+ segment_duration_limit,
+ )
+ logger.debug(
+ "Transcript complete, "
+ f"segments count {len(self.result['segments'])}"
+ )
+
+ self.align_language = self.result["language"]
+ if (
+ not subtitle_file
+ or text_segmentation_scale in ["word", "character"]
+ ):
+ prog_disp("Aligning...", 0.45, is_gui, progress=progress)
+ try:
+ if self.align_language in ["vi"]:
+ logger.info(
+ "Deficient alignment for the "
+ f"{self.align_language} language, skipping the"
+ " process. It is suggested to reduce the "
+ "duration of the segments as an alternative."
+ )
+ else:
+ self.result = align_speech(audio, self.result)
+ logger.debug(
+ "Align complete, "
+ f"segments count {len(self.result['segments'])}"
+ )
+ except Exception as error:
+ logger.error(str(error))
+
+ if self.result["segments"] == []:
+ raise ValueError("No active speech found in audio")
+
+ if not self.task_in_cache("break_align", [
+ divide_text_segments_by,
+ text_segmentation_scale,
+ self.align_language
+ ], {
+ "result": self.result,
+ "align_language": self.align_language
+ }):
+ if self.align_language in ["ja", "zh", "zh-TW"]:
+ divide_text_segments_by += "|!|?|...|。"
+ if text_segmentation_scale in ["word", "character"]:
+ self.result = linguistic_level_segments(
+ self.result,
+ text_segmentation_scale,
+ )
+ elif divide_text_segments_by:
+ try:
+ self.result = break_aling_segments(
+ self.result,
+ break_characters=divide_text_segments_by,
+ )
+ except Exception as error:
+ logger.error(str(error))
+
+ if not self.task_in_cache("diarize", [
+ min_speakers,
+ max_speakers,
+ YOUR_HF_TOKEN[:len(YOUR_HF_TOKEN)//2],
+ diarization_model
+ ], {
+ "result": self.result
+ }):
+ prog_disp("Diarizing...", 0.60, is_gui, progress=progress)
+ diarize_model_select = diarization_models[diarization_model]
+ self.result_diarize = diarize_speech(
+ base_audio_wav if not self.vocals else self.vocals,
+ self.result,
+ min_speakers,
+ max_speakers,
+ YOUR_HF_TOKEN,
+ diarize_model_select,
+ )
+ logger.debug("Diarize complete")
+ self.result_source_lang = copy.deepcopy(self.result_diarize)
+
+ if not self.task_in_cache("translate", [
+ TRANSLATE_AUDIO_TO,
+ translate_process
+ ], {
+ "result_diarize": self.result_diarize
+ }):
+ prog_disp("Translating...", 0.70, is_gui, progress=progress)
+ lang_source = (
+ self.align_language
+ if self.align_language
+ else SOURCE_LANGUAGE
+ )
+ self.result_diarize["segments"] = translate_text(
+ self.result_diarize["segments"],
+ TRANSLATE_AUDIO_TO,
+ translate_process,
+ chunk_size=1800,
+ source=lang_source,
+ )
+ logger.debug("Translation complete")
+ logger.debug(self.result_diarize)
+
+ if get_translated_text:
+
+ json_data = []
+ for segment in self.result_diarize["segments"]:
+ start = segment["start"]
+ text = segment["text"]
+ speaker = int(segment.get("speaker", "SPEAKER_00")[-2:]) + 1
+ json_data.append(
+ {"start": start, "text": text, "speaker": speaker}
+ )
+
+ # Convert list of dictionaries to a JSON string with indentation
+ json_string = json.dumps(json_data, indent=2)
+ logger.info("Done")
+ self.edit_subs_complete = True
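+            # json.dumps escapes non-ASCII as \uXXXX; round-tripping through
+            # unicode_escape makes the text readable in the GUI editor.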
+ return json_string.encode().decode("unicode_escape")
+
+ if get_video_from_text_json:
+
+ if self.result_diarize is None:
+ raise ValueError("Generate the transcription first.")
+ # with open('text_json.json', 'r') as file:
+ text_json_loaded = json.loads(text_json)
+ for i, segment in enumerate(self.result_diarize["segments"]):
+ segment["text"] = text_json_loaded[i]["text"]
+ segment["speaker"] = "SPEAKER_{:02d}".format(
+ int(text_json_loaded[i]["speaker"]) - 1
+ )
+
+ # Write subtitle
+ if not self.task_in_cache("subs_and_edit", [
+ copy.deepcopy(self.result_diarize),
+ output_format_subtitle,
+ TRANSLATE_AUDIO_TO
+ ], {
+ "result_diarize": self.result_diarize
+ }):
+ if output_format_subtitle == "disable":
+ self.sub_file = "sub_tra.srt"
+ elif output_format_subtitle != "ass":
+ self.sub_file = process_subtitles(
+ self.result_source_lang,
+ self.align_language,
+ self.result_diarize,
+ output_format_subtitle,
+ TRANSLATE_AUDIO_TO,
+ )
+
+            # An SRT version is also needed by later steps (ass conversion
+            # and subtitle burning read sub_tra.srt)
+ if output_format_subtitle != "srt":
+ _ = process_subtitles(
+ self.result_source_lang,
+ self.align_language,
+ self.result_diarize,
+ "srt",
+ TRANSLATE_AUDIO_TO,
+ )
+
+ if output_format_subtitle == "ass":
+ convert_ori = "ffmpeg -i sub_ori.srt sub_ori.ass -y"
+ convert_tra = "ffmpeg -i sub_tra.srt sub_tra.ass -y"
+ self.sub_file = "sub_tra.ass"
+ run_command(convert_ori)
+ run_command(convert_tra)
+
+ format_sub = (
+ output_format_subtitle
+ if output_format_subtitle != "disable"
+ else "srt"
+ )
+
+ if output_type == "subtitle":
+
+ out_subs = []
+ tra_subs = media_out(
+ media_file,
+ TRANSLATE_AUDIO_TO,
+ video_output_name,
+ format_sub,
+ file_obj=self.sub_file,
+ )
+ out_subs.append(tra_subs)
+
+ ori_subs = media_out(
+ media_file,
+ self.align_language,
+ video_output_name,
+ format_sub,
+ file_obj=f"sub_ori.{format_sub}",
+ )
+ out_subs.append(ori_subs)
+ logger.info(f"Done: {out_subs}")
+ return out_subs
+
+ if output_type == "subtitle [by speaker]":
+ output = get_subtitle_speaker(
+ media_file,
+ result=self.result_diarize,
+ language=TRANSLATE_AUDIO_TO,
+ extension=format_sub,
+ base_name=video_output_name,
+ )
+ logger.info(f"Done: {str(output)}")
+ return output
+
+ if "video [subtitled]" in output_type:
+ output = media_out(
+ media_file,
+ TRANSLATE_AUDIO_TO + "_subtitled",
+ video_output_name,
+ "wav" if is_audio_file(media_file) else (
+ "mkv" if "mkv" in output_type else "mp4"
+ ),
+ file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file,
+ soft_subtitles=False if is_audio_file(media_file) else True,
+ subtitle_files=output_format_subtitle,
+ )
+ msg_out = output[0] if isinstance(output, list) else output
+ logger.info(f"Done: {msg_out}")
+ return output
+
+ if not self.task_in_cache("tts", [
+ TRANSLATE_AUDIO_TO,
+ tts_voice00,
+ tts_voice01,
+ tts_voice02,
+ tts_voice03,
+ tts_voice04,
+ tts_voice05,
+ tts_voice06,
+ tts_voice07,
+ tts_voice08,
+ tts_voice09,
+ tts_voice10,
+ tts_voice11,
+ dereverb_automatic_xtts
+ ], {
+ "sub_file": self.sub_file
+ }):
+ prog_disp("Text to speech...", 0.80, is_gui, progress=progress)
+ self.valid_speakers = audio_segmentation_to_voice(
+ self.result_diarize,
+ TRANSLATE_AUDIO_TO,
+ is_gui,
+ tts_voice00,
+ tts_voice01,
+ tts_voice02,
+ tts_voice03,
+ tts_voice04,
+ tts_voice05,
+ tts_voice06,
+ tts_voice07,
+ tts_voice08,
+ tts_voice09,
+ tts_voice10,
+ tts_voice11,
+ dereverb_automatic_xtts,
+ )
+
+ if not self.task_in_cache("acc_and_vc", [
+ max_accelerate_audio,
+ acceleration_rate_regulation,
+ voice_imitation,
+ voice_imitation_max_segments,
+ voice_imitation_remove_previous,
+ voice_imitation_vocals_dereverb,
+ voice_imitation_method,
+ custom_voices,
+ custom_voices_workers,
+ copy.deepcopy(self.vci.model_config),
+ avoid_overlap
+ ], {
+ "valid_speakers": self.valid_speakers
+ }):
+ audio_files, speakers_list = accelerate_segments(
+ self.result_diarize,
+ max_accelerate_audio,
+ self.valid_speakers,
+ acceleration_rate_regulation,
+ )
+
+ # Voice Imitation (Tone color converter)
+ if voice_imitation:
+ prog_disp(
+ "Voice Imitation...", 0.85, is_gui, progress=progress
+ )
+ from soni_translate.text_to_speech import toneconverter
+
+ try:
+ toneconverter(
+ copy.deepcopy(self.result_diarize),
+ voice_imitation_max_segments,
+ voice_imitation_remove_previous,
+ voice_imitation_vocals_dereverb,
+ voice_imitation_method,
+ )
+ except Exception as error:
+ logger.error(str(error))
+
+ # custom voice
+ if custom_voices:
+ prog_disp(
+ "Applying customized voices...",
+ 0.90,
+ is_gui,
+ progress=progress,
+ )
+
+ try:
+ self.vci(
+ audio_files,
+ speakers_list,
+ overwrite=True,
+ parallel_workers=custom_voices_workers,
+ )
+ self.vci.unload_models()
+ except Exception as error:
+ logger.error(str(error))
+
+ prog_disp(
+ "Creating final translated video...",
+ 0.95,
+ is_gui,
+ progress=progress,
+ )
+ remove_files(dub_audio_file)
+ create_translated_audio(
+ self.result_diarize,
+ audio_files,
+ dub_audio_file,
+ False,
+ avoid_overlap,
+ )
+
+        # Voiceless track: swap the base audio for a vocals-removed version
+ hash_base_audio_wav = get_hash(base_audio_wav)
+ if voiceless_track:
+ if self.voiceless_id != hash_base_audio_wav:
+ from soni_translate.mdx_net import process_uvr_task
+
+ try:
+ # voiceless_audio_file_dir = "clean_song_output/voiceless"
+ remove_files(voiceless_audio_file)
+ uvr_voiceless_audio_wav, _ = process_uvr_task(
+ orig_song_path=base_audio_wav,
+ song_id="voiceless",
+ only_voiceless=True,
+ remove_files_output_dir=False,
+ )
+ copy_files(uvr_voiceless_audio_wav, ".")
+ base_audio_wav = voiceless_audio_file
+ self.voiceless_id = hash_base_audio_wav
+
+ except Exception as error:
+ logger.error(str(error))
+ else:
+ base_audio_wav = voiceless_audio_file
+
+ if not self.task_in_cache("mix_aud", [
+ mix_method_audio,
+ volume_original_audio,
+ volume_translated_audio,
+ voiceless_track
+ ], {}):
+            # Mix the original and dubbed audio tracks
+ remove_files(mix_audio_file)
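+            # Two strategies: scale each track and blend them with amix, or
+            # duck the original audio under the dub via sidechaincompress.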
+ command_volume_mix = f'ffmpeg -y -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[0:0]volume={volume_original_audio}[a];[1:0]volume={volume_translated_audio}[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio_file}'
+ command_background_mix = f'ffmpeg -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio_file}'
+ if mix_method_audio == "Adjusting volumes and mixing audio":
+ # volume mix
+ run_command(command_volume_mix)
+ else:
+ try:
+ # background mix
+ run_command(command_background_mix)
+ except Exception as error_mix:
+ # volume mix except
+ logger.error(str(error_mix))
+ run_command(command_volume_mix)
+
+ if "audio" in output_type or is_audio_file(media_file):
+ output = media_out(
+ media_file,
+ TRANSLATE_AUDIO_TO,
+ video_output_name,
+ "wav" if "wav" in output_type else (
+ "ogg" if "ogg" in output_type else "mp3"
+ ),
+ file_obj=mix_audio_file,
+ subtitle_files=output_format_subtitle,
+ )
+ msg_out = output[0] if isinstance(output, list) else output
+ logger.info(f"Done: {msg_out}")
+ return output
+
+ hash_base_video_file = get_hash(base_video_file)
+
+ if burn_subtitles_to_video:
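+            # Burn again only if the video hash or the subtitle text changed
+            # since the last run.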
+ hashvideo_text = [
+ hash_base_video_file,
+ [seg["text"] for seg in self.result_diarize["segments"]]
+ ]
+ if self.burn_subs_id != hashvideo_text:
+ try:
+ logger.info("Burn subtitles")
+ remove_files(vid_subs)
+ command = f"ffmpeg -i {base_video_file} -y -vf subtitles=sub_tra.srt -max_muxing_queue_size 9999 {vid_subs}"
+ run_command(command)
+ base_video_file = vid_subs
+ self.burn_subs_id = hashvideo_text
+ except Exception as error:
+ logger.error(str(error))
+ else:
+ base_video_file = vid_subs
+
+ if not self.task_in_cache("output", [
+ hash_base_video_file,
+ hash_base_audio_wav,
+ burn_subtitles_to_video
+ ], {}):
+ # Merge new audio + video
+ remove_files(video_output_file)
+ run_command(
+ f"ffmpeg -i {base_video_file} -i {mix_audio_file} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output_file}"
+ )
+
+ output = media_out(
+ media_file,
+ TRANSLATE_AUDIO_TO,
+ video_output_name,
+ "mkv" if "mkv" in output_type else "mp4",
+ file_obj=video_output_file,
+ soft_subtitles=soft_subtitles_to_video,
+ subtitle_files=output_format_subtitle,
+ )
+ msg_out = output[0] if isinstance(output, list) else output
+ logger.info(f"Done: {msg_out}")
+
+ return output
+
+ def hook_beta_processor(
+ self,
+ document,
+ tgt_lang,
+ translate_process,
+ ori_lang,
+ tts,
+ name_final_file,
+ custom_voices,
+ custom_voices_workers,
+ output_type,
+ chunk_size,
+ width,
+ height,
+ start_page,
+ end_page,
+ bcolor,
+ is_gui,
+ progress
+ ):
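+        """Build a narrated 'videobook' from a PDF: render pages to images,
+        translate the extracted text, synthesize speech, and merge the
+        page video with the audio track."""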
+ prog_disp("Processing pages...", 0.10, is_gui, progress=progress)
+ doc_data = doc_to_txtximg_pages(document, width, height, start_page, end_page, bcolor)
+ result_diarize = page_data_to_segments(doc_data, 1700)
+
+ prog_disp("Translating...", 0.20, is_gui, progress=progress)
+ result_diarize["segments"] = translate_text(
+ result_diarize["segments"],
+ tgt_lang,
+ translate_process,
+ chunk_size=0,
+ source=ori_lang,
+ )
+ chunk_size = (
+ chunk_size if chunk_size else determine_chunk_size(tts)
+ )
+ doc_data = update_page_data(result_diarize, doc_data)
+
+ prog_disp("Text to speech...", 0.30, is_gui, progress=progress)
+ result_diarize = page_data_to_segments(doc_data, chunk_size)
+ valid_speakers = audio_segmentation_to_voice(
+ result_diarize,
+ tgt_lang,
+ is_gui,
+ tts,
+ )
+
+ # fix format and set folder output
+ audio_files, speakers_list = accelerate_segments(
+ result_diarize,
+ 1.0,
+ valid_speakers,
+ )
+
+ # custom voice
+ if custom_voices:
+ prog_disp(
+ "Applying customized voices...",
+ 0.60,
+ is_gui,
+ progress=progress,
+ )
+ self.vci(
+ audio_files,
+ speakers_list,
+ overwrite=True,
+ parallel_workers=custom_voices_workers,
+ )
+ self.vci.unload_models()
+
+        # Update the time segments without concatenating the audio
+ result_diarize = fix_timestamps_docs(result_diarize, audio_files)
+ final_wav_file = "audio_book.wav"
+ remove_files(final_wav_file)
+
+ prog_disp("Creating audio file...", 0.70, is_gui, progress=progress)
+ create_translated_audio(
+ result_diarize, audio_files, final_wav_file, False
+ )
+
+ prog_disp("Creating video file...", 0.80, is_gui, progress=progress)
+ video_doc = create_video_from_images(
+ doc_data,
+ result_diarize
+ )
+
+ # Merge video and audio
+ prog_disp("Merging...", 0.90, is_gui, progress=progress)
+ vid_out = merge_video_and_audio(video_doc, final_wav_file)
+
+ # End
+ output = media_out(
+ document,
+ tgt_lang,
+ name_final_file,
+ "mkv" if "mkv" in output_type else "mp4",
+ file_obj=vid_out,
+ )
+ logger.info(f"Done: {output}")
+ return output
+
+ def multilingual_docs_conversion(
+ self,
+ string_text="", # string
+ document=None, # doc path gui
+ directory_input="", # doc path
+ origin_language="English (en)",
+ target_language="English (en)",
+ tts_voice00="en-US-EmmaMultilingualNeural-Female",
+ name_final_file="",
+ translate_process="google_translator",
+ output_type="audio",
+ chunk_size=None,
+ custom_voices=False,
+ custom_voices_workers=1,
+ start_page=1,
+ end_page=99999,
+ width=1280,
+ height=720,
+ bcolor="dynamic",
+ is_gui=False,
+ progress=gr.Progress(),
+ ):
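+        """Translate a document or plain text and synthesize it as an
+        audiobook; 'videobook' output types are delegated to
+        hook_beta_processor."""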
+ if "gpt" in translate_process:
+ check_openai_api_key()
+
+ SOURCE_LANGUAGE = LANGUAGES[origin_language]
+ if translate_process != "disable_translation":
+ TRANSLATE_AUDIO_TO = LANGUAGES[target_language]
+ else:
+ TRANSLATE_AUDIO_TO = SOURCE_LANGUAGE
+ logger.info("No translation")
+ if tts_voice00[:2].lower() != TRANSLATE_AUDIO_TO[:2].lower():
+ logger.debug(
+ "Make sure to select a 'TTS Speaker' suitable for the "
+ "translation language to avoid errors with the TTS."
+ )
+
+ self.clear_cache(string_text, force=True)
+
+ is_string = False
+ if document is None:
+ if os.path.exists(directory_input):
+ document = directory_input
+ else:
+ document = string_text
+ is_string = True
+ document = document if isinstance(document, str) else document.name
+ if not document:
+ raise Exception("No data found")
+
+ if "videobook" in output_type:
+ if not document.lower().endswith(".pdf"):
+ raise ValueError(
+ "Videobooks are only compatible with PDF files."
+ )
+
+ return self.hook_beta_processor(
+ document,
+ TRANSLATE_AUDIO_TO,
+ translate_process,
+ SOURCE_LANGUAGE,
+ tts_voice00,
+ name_final_file,
+ custom_voices,
+ custom_voices_workers,
+ output_type,
+ chunk_size,
+ width,
+ height,
+ start_page,
+ end_page,
+ bcolor,
+ is_gui,
+ progress
+ )
+
+ # audio_wav = "audio.wav"
+ final_wav_file = "audio_book.wav"
+
+ prog_disp("Processing text...", 0.15, is_gui, progress=progress)
+ result_file_path, result_text = document_preprocessor(
+ document, is_string, start_page, end_page
+ )
+
+ if (
+ output_type == "book (txt)"
+ and translate_process == "disable_translation"
+ ):
+ return result_file_path
+
+ if "SET_LIMIT" == os.getenv("DEMO"):
+ result_text = result_text[:50]
+ logger.info(
+ "DEMO; Generation is limited to 50 characters to prevent "
+ "CPU errors. No limitations with GPU.\n"
+ )
+
+ if translate_process != "disable_translation":
+ # chunks text for translation
+ result_diarize = plain_text_to_segments(result_text, 1700)
+ prog_disp("Translating...", 0.30, is_gui, progress=progress)
+            # segments are already ~1700 chars; translate_text gets chunk_size=0
+ result_diarize["segments"] = translate_text(
+ result_diarize["segments"],
+ TRANSLATE_AUDIO_TO,
+ translate_process,
+ chunk_size=0,
+ source=SOURCE_LANGUAGE,
+ )
+
+ txt_file_path, result_text = segments_to_plain_text(result_diarize)
+
+ if output_type == "book (txt)":
+ return media_out(
+ result_file_path if is_string else document,
+ TRANSLATE_AUDIO_TO,
+ name_final_file,
+ "txt",
+ file_obj=txt_file_path,
+ )
+
+ # (TTS limits) plain text to result_diarize
+ chunk_size = (
+ chunk_size if chunk_size else determine_chunk_size(tts_voice00)
+ )
+ result_diarize = plain_text_to_segments(result_text, chunk_size)
+ logger.debug(result_diarize)
+
+ prog_disp("Text to speech...", 0.45, is_gui, progress=progress)
+ valid_speakers = audio_segmentation_to_voice(
+ result_diarize,
+ TRANSLATE_AUDIO_TO,
+ is_gui,
+ tts_voice00,
+ )
+
+ # fix format and set folder output
+ audio_files, speakers_list = accelerate_segments(
+ result_diarize,
+ 1.0,
+ valid_speakers,
+ )
+
+ # custom voice
+ if custom_voices:
+ prog_disp(
+ "Applying customized voices...",
+ 0.80,
+ is_gui,
+ progress=progress,
+ )
+ self.vci(
+ audio_files,
+ speakers_list,
+ overwrite=True,
+ parallel_workers=custom_voices_workers,
+ )
+ self.vci.unload_models()
+
+ prog_disp(
+ "Creating final audio file...", 0.90, is_gui, progress=progress
+ )
+ remove_files(final_wav_file)
+ create_translated_audio(
+ result_diarize, audio_files, final_wav_file, True
+ )
+
+ output = media_out(
+ result_file_path if is_string else document,
+ TRANSLATE_AUDIO_TO,
+ name_final_file,
+ "mp3" if "mp3" in output_type else (
+ "ogg" if "ogg" in output_type else "wav"
+ ),
+ file_obj=final_wav_file,
+ )
+
+ logger.info(f"Done: {output}")
+
+ return output
+
+
+title = "📽️ SoniTranslate 🈷️"
+
+
+def create_gui(theme, logs_in_gui=False):
+ with gr.Blocks(theme=theme) as app:
+ gr.Markdown(title)
+ gr.Markdown(lg_conf["description"])
+
+ with gr.Tab(lg_conf["tab_translate"]):
+ with gr.Row():
+ with gr.Column():
+ input_data_type = gr.Dropdown(
+ ["SUBMIT VIDEO", "URL", "Find Video Path"],
+ value="SUBMIT VIDEO",
+ label=lg_conf["video_source"],
+ )
+
+ def swap_visibility(data_type):
+ if data_type == "URL":
+ return (
+ gr.update(visible=False, value=None),
+ gr.update(visible=True, value=""),
+ gr.update(visible=False, value=""),
+ )
+ elif data_type == "SUBMIT VIDEO":
+ return (
+ gr.update(visible=True, value=None),
+ gr.update(visible=False, value=""),
+ gr.update(visible=False, value=""),
+ )
+ elif data_type == "Find Video Path":
+ return (
+ gr.update(visible=False, value=None),
+ gr.update(visible=False, value=""),
+ gr.update(visible=True, value=""),
+ )
+
+ video_input = gr.File(
+ label="VIDEO",
+ file_count="multiple",
+ type="filepath",
+ )
+ blink_input = gr.Textbox(
+ visible=False,
+ label=lg_conf["link_label"],
+ info=lg_conf["link_info"],
+ placeholder=lg_conf["link_ph"],
+ )
+ directory_input = gr.Textbox(
+ visible=False,
+ label=lg_conf["dir_label"],
+ info=lg_conf["dir_info"],
+ placeholder=lg_conf["dir_ph"],
+ )
+ input_data_type.change(
+ fn=swap_visibility,
+ inputs=input_data_type,
+ outputs=[video_input, blink_input, directory_input],
+ )
+
+ gr.HTML()
+
+ SOURCE_LANGUAGE = gr.Dropdown(
+ LANGUAGES_LIST,
+ value=LANGUAGES_LIST[0],
+ label=lg_conf["sl_label"],
+ info=lg_conf["sl_info"],
+ )
+ TRANSLATE_AUDIO_TO = gr.Dropdown(
+ LANGUAGES_LIST[1:],
+ value="English (en)",
+ label=lg_conf["tat_label"],
+ info=lg_conf["tat_info"],
+ )
+
+                    gr.HTML("<hr></hr>")
+
+ gr.Markdown(lg_conf["num_speakers"])
+ MAX_TTS = 12
+ min_speakers = gr.Slider(
+ 1,
+ MAX_TTS,
+ value=1,
+ label=lg_conf["min_sk"],
+ step=1,
+ visible=False,
+ )
+ max_speakers = gr.Slider(
+ 1,
+ MAX_TTS,
+ value=2,
+ step=1,
+ label=lg_conf["max_sk"],
+ )
+ gr.Markdown(lg_conf["tts_select"])
+
+ def submit(value):
+ visibility_dict = {
+ f"tts_voice{i:02d}": gr.update(visible=i < value)
+ for i in range(MAX_TTS)
+ }
+ return [value for value in visibility_dict.values()]
+
+ tts_voice00 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="en-US-EmmaMultilingualNeural-Female",
+ label=lg_conf["sk1"],
+ visible=True,
+ interactive=True,
+ )
+ tts_voice01 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="en-US-AndrewMultilingualNeural-Male",
+ label=lg_conf["sk2"],
+ visible=True,
+ interactive=True,
+ )
+ tts_voice02 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="en-US-AvaMultilingualNeural-Female",
+ label=lg_conf["sk3"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice03 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="en-US-BrianMultilingualNeural-Male",
+ label=lg_conf["sk4"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice04 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="de-DE-SeraphinaMultilingualNeural-Female",
+                        label=lg_conf["sk5"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice05 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="de-DE-FlorianMultilingualNeural-Male",
+ label=lg_conf["sk6"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice06 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="fr-FR-VivienneMultilingualNeural-Female",
+ label=lg_conf["sk7"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice07 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="fr-FR-RemyMultilingualNeural-Male",
+ label=lg_conf["sk8"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice08 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="en-US-EmmaMultilingualNeural-Female",
+ label=lg_conf["sk9"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice09 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="en-US-AndrewMultilingualNeural-Male",
+ label=lg_conf["sk10"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice10 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="en-US-EmmaMultilingualNeural-Female",
+ label=lg_conf["sk11"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice11 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="en-US-AndrewMultilingualNeural-Male",
+ label=lg_conf["sk12"],
+ visible=False,
+ interactive=True,
+ )
+ max_speakers.change(
+ submit,
+ max_speakers,
+ [
+ tts_voice00,
+ tts_voice01,
+ tts_voice02,
+ tts_voice03,
+ tts_voice04,
+ tts_voice05,
+ tts_voice06,
+ tts_voice07,
+ tts_voice08,
+ tts_voice09,
+ tts_voice10,
+ tts_voice11,
+ ],
+ )
+
+ with gr.Column():
+ with gr.Accordion(
+ lg_conf["vc_title"],
+ open=False,
+ ):
+ gr.Markdown(lg_conf["vc_subtitle"])
+ voice_imitation_gui = gr.Checkbox(
+ False,
+ label=lg_conf["vc_active_label"],
+ info=lg_conf["vc_active_info"],
+ )
+ openvoice_models = ["openvoice", "openvoice_v2"]
+ voice_imitation_method_options = (
+ ["freevc"] + openvoice_models
+ if SoniTr.tts_info.xtts_enabled
+ else openvoice_models
+ )
+ voice_imitation_method_gui = gr.Dropdown(
+ voice_imitation_method_options,
+ value=voice_imitation_method_options[0],
+ label=lg_conf["vc_method_label"],
+ info=lg_conf["vc_method_info"],
+ )
+ voice_imitation_max_segments_gui = gr.Slider(
+ label=lg_conf["vc_segments_label"],
+ info=lg_conf["vc_segments_info"],
+ value=3,
+ step=1,
+ minimum=1,
+ maximum=10,
+ visible=True,
+ interactive=True,
+ )
+ voice_imitation_vocals_dereverb_gui = gr.Checkbox(
+ False,
+ label=lg_conf["vc_dereverb_label"],
+ info=lg_conf["vc_dereverb_info"],
+ )
+ voice_imitation_remove_previous_gui = gr.Checkbox(
+ True,
+ label=lg_conf["vc_remove_label"],
+ info=lg_conf["vc_remove_info"],
+ )
+
+ if SoniTr.tts_info.xtts_enabled:
+ with gr.Column():
+ with gr.Accordion(
+ lg_conf["xtts_title"],
+ open=False,
+ ):
+ gr.Markdown(lg_conf["xtts_subtitle"])
+ wav_speaker_file = gr.File(
+ label=lg_conf["xtts_file_label"]
+ )
+ wav_speaker_name = gr.Textbox(
+ label=lg_conf["xtts_name_label"],
+ value="",
+ info=lg_conf["xtts_name_info"],
+ placeholder="default_name",
+ lines=1,
+ )
+ wav_speaker_start = gr.Number(
+ label="Time audio start",
+ value=0,
+ visible=False,
+ )
+ wav_speaker_end = gr.Number(
+ label="Time audio end",
+ value=0,
+ visible=False,
+ )
+ wav_speaker_dir = gr.Textbox(
+ label="Directory save",
+ value="_XTTS_",
+ visible=False,
+ )
+ wav_speaker_dereverb = gr.Checkbox(
+ True,
+ label=lg_conf["xtts_dereverb_label"],
+ info=lg_conf["xtts_dereverb_info"]
+ )
+ wav_speaker_output = gr.HTML()
+ create_xtts_wav = gr.Button(
+ lg_conf["xtts_button"]
+ )
+ gr.Markdown(lg_conf["xtts_footer"])
+ else:
+ wav_speaker_dereverb = gr.Checkbox(
+ False,
+ label=lg_conf["xtts_dereverb_label"],
+ info=lg_conf["xtts_dereverb_info"],
+ visible=False
+ )
+
+ with gr.Column():
+ with gr.Accordion(
+ lg_conf["extra_setting"], open=False
+ ):
+ audio_accelerate = gr.Slider(
+ label=lg_conf["acc_max_label"],
+ value=1.9,
+ step=0.1,
+ minimum=1.0,
+ maximum=2.5,
+ visible=True,
+ interactive=True,
+ info=lg_conf["acc_max_info"],
+ )
+ acceleration_rate_regulation_gui = gr.Checkbox(
+ False,
+ label=lg_conf["acc_rate_label"],
+ info=lg_conf["acc_rate_info"],
+ )
+ avoid_overlap_gui = gr.Checkbox(
+ False,
+ label=lg_conf["or_label"],
+ info=lg_conf["or_info"],
+ )
+
+                        gr.HTML("<hr></hr>")
+
+ audio_mix_options = [
+ "Mixing audio with sidechain compression",
+ "Adjusting volumes and mixing audio",
+ ]
+ AUDIO_MIX = gr.Dropdown(
+ audio_mix_options,
+ value=audio_mix_options[1],
+ label=lg_conf["aud_mix_label"],
+ info=lg_conf["aud_mix_info"],
+ )
+ volume_original_mix = gr.Slider(
+ label=lg_conf["vol_ori"],
+ info="for Adjusting volumes and mixing audio",
+ value=0.25,
+ step=0.05,
+ minimum=0.0,
+ maximum=2.50,
+ visible=True,
+ interactive=True,
+ )
+ volume_translated_mix = gr.Slider(
+ label=lg_conf["vol_tra"],
+ info="for Adjusting volumes and mixing audio",
+ value=1.80,
+ step=0.05,
+ minimum=0.0,
+ maximum=2.50,
+ visible=True,
+ interactive=True,
+ )
+ main_voiceless_track = gr.Checkbox(
+ label=lg_conf["voiceless_tk_label"],
+ info=lg_conf["voiceless_tk_info"],
+ )
+
+                        gr.HTML("<hr></hr>")
+ sub_type_options = [
+ "disable",
+ "srt",
+ "vtt",
+ "ass",
+ "txt",
+ "tsv",
+ "json",
+ "aud",
+ ]
+
+ sub_type_output = gr.Dropdown(
+ sub_type_options,
+ value=sub_type_options[1],
+ label=lg_conf["sub_type"],
+ )
+ soft_subtitles_to_video_gui = gr.Checkbox(
+ label=lg_conf["soft_subs_label"],
+ info=lg_conf["soft_subs_info"],
+ )
+ burn_subtitles_to_video_gui = gr.Checkbox(
+ label=lg_conf["burn_subs_label"],
+ info=lg_conf["burn_subs_info"],
+ )
+
+                        gr.HTML("<hr></hr>")
+ gr.Markdown(lg_conf["whisper_title"])
+ literalize_numbers_gui = gr.Checkbox(
+ True,
+ label=lg_conf["lnum_label"],
+ info=lg_conf["lnum_info"],
+ )
+ vocal_refinement_gui = gr.Checkbox(
+ False,
+ label=lg_conf["scle_label"],
+ info=lg_conf["scle_info"],
+ )
+ segment_duration_limit_gui = gr.Slider(
+ label=lg_conf["sd_limit_label"],
+ info=lg_conf["sd_limit_info"],
+ value=15,
+ step=1,
+ minimum=1,
+ maximum=30,
+ )
+ whisper_model_default = (
+ "large-v3"
+ if SoniTr.device == "cuda"
+ else "medium"
+ )
+
+ WHISPER_MODEL_SIZE = gr.Dropdown(
+ ASR_MODEL_OPTIONS + find_whisper_models(),
+ value=whisper_model_default,
+ label="Whisper ASR model",
+ info=lg_conf["asr_model_info"],
+ allow_custom_value=True,
+ )
+ com_t_opt, com_t_default = (
+ [COMPUTE_TYPE_GPU, "float16"]
+ if SoniTr.device == "cuda"
+ else [COMPUTE_TYPE_CPU, "float32"]
+ )
+ compute_type = gr.Dropdown(
+ com_t_opt,
+ value=com_t_default,
+ label=lg_conf["ctype_label"],
+ info=lg_conf["ctype_info"],
+ )
+ batch_size = gr.Slider(
+ minimum=1,
+ maximum=32,
+ value=8,
+ label=lg_conf["batchz_label"],
+ info=lg_conf["batchz_info"],
+ step=1,
+ )
+ input_srt = gr.File(
+ label=lg_conf["srt_file_label"],
+ file_types=[".srt", ".ass", ".vtt"],
+ height=130,
+ )
+
+ gr.HTML("
")
+ text_segmentation_options = [
+ "sentence",
+ "word",
+ "character"
+ ]
+ text_segmentation_scale_gui = gr.Dropdown(
+ text_segmentation_options,
+ value=text_segmentation_options[0],
+ label=lg_conf["tsscale_label"],
+ info=lg_conf["tsscale_info"],
+ )
+ divide_text_segments_by_gui = gr.Textbox(
+ label=lg_conf["divide_text_label"],
+ value="",
+ info=lg_conf["divide_text_info"],
+ )
+
+ gr.HTML("
")
+ pyannote_models_list = list(
+ diarization_models.keys()
+ )
+ diarization_process_dropdown = gr.Dropdown(
+ pyannote_models_list,
+ value=pyannote_models_list[1],
+ label=lg_conf["diarization_label"],
+ )
+ translate_process_dropdown = gr.Dropdown(
+ TRANSLATION_PROCESS_OPTIONS,
+ value=TRANSLATION_PROCESS_OPTIONS[0],
+ label=lg_conf["tr_process_label"],
+ )
+
+ gr.HTML("
")
+ main_output_type = gr.Dropdown(
+ OUTPUT_TYPE_OPTIONS,
+ value=OUTPUT_TYPE_OPTIONS[0],
+ label=lg_conf["out_type_label"],
+ )
+ VIDEO_OUTPUT_NAME = gr.Textbox(
+ label=lg_conf["out_name_label"],
+ value="",
+ info=lg_conf["out_name_info"],
+ )
+ play_sound_gui = gr.Checkbox(
+ True,
+ label=lg_conf["task_sound_label"],
+ info=lg_conf["task_sound_info"],
+ )
+ enable_cache_gui = gr.Checkbox(
+ True,
+ label=lg_conf["cache_label"],
+ info=lg_conf["cache_info"],
+ )
+ PREVIEW = gr.Checkbox(
+ label="Preview", info=lg_conf["preview_info"]
+ )
+ is_gui_dummy_check = gr.Checkbox(
+ True, visible=False
+ )
+
+ with gr.Column(variant="compact"):
+ edit_sub_check = gr.Checkbox(
+ label=lg_conf["edit_sub_label"],
+ info=lg_conf["edit_sub_info"],
+ )
+ dummy_false_check = gr.Checkbox(
+ False,
+ visible=False,
+ )
+
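+ # Show or hide the subtitle editor button and textbox as a pair.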
+ def visible_component_subs(input_bool):
+ if input_bool:
+ return gr.update(visible=True), gr.update(
+ visible=True
+ )
+ else:
+ return gr.update(visible=False), gr.update(
+ visible=False
+ )
+
+ subs_button = gr.Button(
+ lg_conf["button_subs"],
+ variant="primary",
+ visible=False,
+ )
+ subs_edit_space = gr.Textbox(
+ visible=False,
+ lines=10,
+ label=lg_conf["editor_sub_label"],
+ info=lg_conf["editor_sub_info"],
+ placeholder=lg_conf["editor_sub_ph"],
+ )
+ edit_sub_check.change(
+ visible_component_subs,
+ [edit_sub_check],
+ [subs_button, subs_edit_space],
+ )
+
+ with gr.Row():
+ video_button = gr.Button(
+ lg_conf["button_translate"],
+ variant="primary",
+ )
+ with gr.Row():
+ video_output = gr.File(
+ label=lg_conf["output_result_label"],
+ file_count="multiple",
+ interactive=False,
+ )  # gr.Video()
+
+ gr.HTML("
")
+
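+ # Ask for the Hugging Face token in the UI only when it is not
+ # already provided via the YOUR_HF_TOKEN environment variable.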
+ if (
+ os.getenv("YOUR_HF_TOKEN") is None
+ or os.getenv("YOUR_HF_TOKEN") == ""
+ ):
+ HFKEY = gr.Textbox(
+ visible=True,
+ label="HF Token",
+ info=lg_conf["ht_token_info"],
+ placeholder=lg_conf["ht_token_ph"],
+ )
+ else:
+ HFKEY = gr.Textbox(
+ visible=False,
+ label="HF Token",
+ info=lg_conf["ht_token_info"],
+ placeholder=lg_conf["ht_token_ph"],
+ )
+
+ gr.Examples(
+ examples=[
+ [
+ ["./assets/Video_main.mp4"],
+ "",
+ "",
+ "",
+ False,
+ whisper_model_default,
+ 4,
+ com_t_default,
+ "Spanish (es)",
+ "English (en)",
+ 1,
+ 2,
+ "en-CA-ClaraNeural-Female",
+ "en-AU-WilliamNeural-Male",
+ ],
+ ], # no update
+ fn=SoniTr.batch_multilingual_media_conversion,
+ inputs=[
+ video_input,
+ blink_input,
+ directory_input,
+ HFKEY,
+ PREVIEW,
+ WHISPER_MODEL_SIZE,
+ batch_size,
+ compute_type,
+ SOURCE_LANGUAGE,
+ TRANSLATE_AUDIO_TO,
+ min_speakers,
+ max_speakers,
+ tts_voice00,
+ tts_voice01,
+ ],
+ outputs=[video_output],
+ cache_examples=False,
+ )
+
+ with gr.Tab(lg_conf["tab_docs"]):
+ with gr.Column():
+ with gr.Accordion("Docs", open=True):
+ with gr.Column(variant="compact"):
+ with gr.Column():
+ input_doc_type = gr.Dropdown(
+ [
+ "WRITE TEXT",
+ "SUBMIT DOCUMENT",
+ "Find Document Path",
+ ],
+ value="SUBMIT DOCUMENT",
+ label=lg_conf["docs_input_label"],
+ info=lg_conf["docs_input_info"],
+ )
+
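+ # Reveal only the input widget that matches the selected source type.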
+ def swap_visibility(data_type):
+ if data_type == "WRITE TEXT":
+ return (
+ gr.update(visible=True, value=""),
+ gr.update(visible=False, value=None),
+ gr.update(visible=False, value=""),
+ )
+ elif data_type == "SUBMIT DOCUMENT":
+ return (
+ gr.update(visible=False, value=""),
+ gr.update(visible=True, value=None),
+ gr.update(visible=False, value=""),
+ )
+ elif data_type == "Find Document Path":
+ return (
+ gr.update(visible=False, value=""),
+ gr.update(visible=False, value=None),
+ gr.update(visible=True, value=""),
+ )
+
+ text_docs = gr.Textbox(
+ label="Text",
+ value="This is an example",
+ info="Write a text",
+ placeholder="...",
+ lines=5,
+ visible=False,
+ )
+ input_docs = gr.File(
+ label="Document", visible=True
+ )
+ directory_input_docs = gr.Textbox(
+ visible=False,
+ label="Document Path",
+ info="Example: /home/my_doc.pdf",
+ placeholder="Path goes here...",
+ )
+ input_doc_type.change(
+ fn=swap_visibility,
+ inputs=input_doc_type,
+ outputs=[
+ text_docs,
+ input_docs,
+ directory_input_docs,
+ ],
+ )
+
+ gr.HTML()
+
+ tts_documents = gr.Dropdown(
+ list(
+ filter(
+ lambda x: x != "_XTTS_/AUTOMATIC.wav",
+ SoniTr.tts_info.tts_list(),
+ )
+ ),
+ value="en-US-EmmaMultilingualNeural-Female",
+ label="TTS",
+ visible=True,
+ interactive=True,
+ )
+
+ gr.HTML()
+
+ docs_SOURCE_LANGUAGE = gr.Dropdown(
+ LANGUAGES_LIST[1:],
+ value="English (en)",
+ label=lg_conf["sl_label"],
+ info=lg_conf["docs_source_info"],
+ )
+ docs_TRANSLATE_TO = gr.Dropdown(
+ LANGUAGES_LIST[1:],
+ value="English (en)",
+ label=lg_conf["tat_label"],
+ info=lg_conf["tat_info"],
+ )
+
+ with gr.Column():
+ with gr.Accordion(
+ lg_conf["extra_setting"], open=False
+ ):
+ docs_translate_process_dropdown = gr.Dropdown(
+ DOCS_TRANSLATION_PROCESS_OPTIONS,
+ value=DOCS_TRANSLATION_PROCESS_OPTIONS[
+ 0
+ ],
+ label="Translation process",
+ )
+
+ gr.HTML("
")
+
+ docs_output_type = gr.Dropdown(
+ DOCS_OUTPUT_TYPE_OPTIONS,
+ value=DOCS_OUTPUT_TYPE_OPTIONS[2],
+ label="Output type",
+ )
+ docs_OUTPUT_NAME = gr.Textbox(
+ label="Final file name",
+ value="",
+ info=lg_conf["out_name_info"],
+ )
+ docs_chunk_size = gr.Number(
+ label=lg_conf["chunk_size_label"],
+ value=0,
+ visible=True,
+ interactive=True,
+ info=lg_conf["chunk_size_info"],
+ )
+ gr.HTML("
")
+ start_page_gui = gr.Number(
+ step=1,
+ value=1,
+ minimum=1,
+ maximum=99999,
+ label="Start page",
+ )
+ end_page_gui = gr.Number(
+ step=1,
+ value=99999,
+ minimum=1,
+ maximum=99999,
+ label="End page",
+ )
+ gr.HTML("
Videobook config")
+ videobook_width_gui = gr.Number(
+ step=1,
+ value=1280,
+ minimum=100,
+ maximum=4096,
+ label="Width",
+ )
+ videobook_height_gui = gr.Number(
+ step=1,
+ value=720,
+ minimum=100,
+ maximum=4096,
+ label="Height",
+ )
+ videobook_bcolor_gui = gr.Dropdown(
+ BORDER_COLORS,
+ value=BORDER_COLORS[0],
+ label="Border color",
+ )
+ docs_dummy_check = gr.Checkbox(
+ True, visible=False
+ )
+
+ with gr.Row():
+ docs_button = gr.Button(
+ lg_conf["docs_button"],
+ variant="primary",
+ )
+ with gr.Row():
+ docs_output = gr.File(
+ label="Result",
+ interactive=False,
+ )
+
+ with gr.Tab("Custom voice R.V.C. (Optional)"):
+
+ with gr.Column():
+ with gr.Accordion("Get the R.V.C. Models", open=True):
+ url_links = gr.Textbox(
+ label="URLs",
+ value="",
+ info=lg_conf["cv_url_info"],
+ placeholder="urls here...",
+ lines=1,
+ )
+ download_finish = gr.HTML()
+ download_button = gr.Button("DOWNLOAD MODELS")
+
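+ # Refresh every R.V.C. model and index dropdown after new files
+ # are downloaded; the extra slot (MAX_TTS+1) covers the dropdowns
+ # in the "Test R.V.C." section.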
+ def update_models():
+ models_path, index_path = upload_model_list()
+
+ dict_models = {
+ f"fmodel{i:02d}": gr.update(
+ choices=models_path
+ )
+ for i in range(MAX_TTS+1)
+ }
+ dict_index = {
+ f"findex{i:02d}": gr.update(
+ choices=index_path, value=None
+ )
+ for i in range(MAX_TTS+1)
+ }
+ dict_changes = {**dict_models, **dict_index}
+ return list(dict_changes.values())
+
+ with gr.Column():
+ with gr.Accordion(lg_conf["replace_title"], open=False):
+ with gr.Column(variant="compact"):
+ with gr.Column():
+ gr.Markdown(lg_conf["sec1_title"])
+ enable_custom_voice = gr.Checkbox(
+ False,
+ label="ENABLE",
+ info=lg_conf["enable_replace"]
+ )
+ workers_custom_voice = gr.Number(
+ step=1,
+ value=1,
+ minimum=1,
+ maximum=50,
+ label="workers",
+ visible=False,
+ )
+
+ gr.Markdown(lg_conf["sec2_title"])
+ gr.Markdown(lg_conf["sec2_subtitle"])
+
+ PITCH_ALGO_OPT = [
+ "pm",
+ "harvest",
+ "crepe",
+ "rmvpe",
+ "rmvpe+",
+ ]
+
+ def model_conf():
+ return gr.Dropdown(
+ models_path,
+ # value="",
+ label="Model",
+ visible=True,
+ interactive=True,
+ )
+
+ def pitch_algo_conf():
+ return gr.Dropdown(
+ PITCH_ALGO_OPT,
+ value=PITCH_ALGO_OPT[3],
+ label="Pitch algorithm",
+ visible=True,
+ interactive=True,
+ )
+
+ def pitch_lvl_conf():
+ return gr.Slider(
+ label="Pitch level",
+ minimum=-24,
+ maximum=24,
+ step=1,
+ value=0,
+ visible=True,
+ interactive=True,
+ )
+
+ def index_conf():
+ return gr.Dropdown(
+ index_path,
+ value=None,
+ label="Index",
+ visible=True,
+ interactive=True,
+ )
+
+ def index_inf_conf():
+ return gr.Slider(
+ minimum=0,
+ maximum=1,
+ label="Index influence",
+ value=0.75,
+ )
+
+ def respiration_filter_conf():
+ return gr.Slider(
+ minimum=0,
+ maximum=7,
+ label="Respiration median filtering",
+ value=3,
+ step=1,
+ interactive=True,
+ )
+
+ def envelope_ratio_conf():
+ return gr.Slider(
+ minimum=0,
+ maximum=1,
+ label="Envelope ratio",
+ value=0.25,
+ interactive=True,
+ )
+
+ def consonant_protec_conf():
+ return gr.Slider(
+ minimum=0,
+ maximum=0.5,
+ label="Consonant breath protection",
+ value=0.5,
+ interactive=True,
+ )
+
+ def button_conf(tts_name):
+ return gr.Button(
+ lg_conf["cv_button_apply"]+" "+tts_name,
+ variant="primary",
+ )
+
+ TTS_TABS = [
+ 'TTS Speaker {:02d}'.format(i) for i in range(1, MAX_TTS+1)
+ ]
+
+ CV_SUBTITLES = [
+ lg_conf["cv_tts1"],
+ lg_conf["cv_tts2"],
+ lg_conf["cv_tts3"],
+ lg_conf["cv_tts4"],
+ lg_conf["cv_tts5"],
+ lg_conf["cv_tts6"],
+ lg_conf["cv_tts7"],
+ lg_conf["cv_tts8"],
+ lg_conf["cv_tts9"],
+ lg_conf["cv_tts10"],
+ lg_conf["cv_tts11"],
+ lg_conf["cv_tts12"],
+ ]
+
+ configs_storage = []
+
+ for i in range(MAX_TTS):  # one R.V.C. config accordion per TTS speaker
+ with gr.Accordion(CV_SUBTITLES[i], open=False):
+ gr.Markdown(TTS_TABS[i])
+ with gr.Column():
+ tag_gui = gr.Textbox(
+ value=TTS_TABS[i], visible=False
+ )
+ model_gui = model_conf()
+ pitch_algo_gui = pitch_algo_conf()
+ pitch_lvl_gui = pitch_lvl_conf()
+ index_gui = index_conf()
+ index_inf_gui = index_inf_conf()
+ rmf_gui = respiration_filter_conf()
+ er_gui = envelope_ratio_conf()
+ cbp_gui = consonant_protec_conf()
+
+ with gr.Row(variant="compact"):
+ button_config = button_conf(
+ TTS_TABS[i]
+ )
+
+ confirm_conf = gr.HTML()
+
+ button_config.click(
+ SoniTr.vci.apply_conf,
+ inputs=[
+ tag_gui,
+ model_gui,
+ pitch_algo_gui,
+ pitch_lvl_gui,
+ index_gui,
+ index_inf_gui,
+ rmf_gui,
+ er_gui,
+ cbp_gui,
+ ],
+ outputs=[confirm_conf],
+ )
+
+ configs_storage.append({
+ "tag": tag_gui,
+ "model": model_gui,
+ "index": index_gui,
+ })
+
+ with gr.Column():
+ with gr.Accordion("Test R.V.C.", open=False):
+ with gr.Row(variant="compact"):
+ text_test = gr.Textbox(
+ label="Text",
+ value="This is an example",
+ info="write a text",
+ placeholder="...",
+ lines=5,
+ )
+ with gr.Column():
+ tts_test = gr.Dropdown(
+ sorted(SoniTr.tts_info.list_edge),
+ value="en-GB-ThomasNeural-Male",
+ label="TTS",
+ visible=True,
+ interactive=True,
+ )
+ model_test = model_conf()
+ index_test = index_conf()
+ pitch_test = pitch_lvl_conf()
+ pitch_alg_test = pitch_algo_conf()
+ with gr.Row(variant="compact"):
+ button_test = gr.Button("Test audio")
+
+ with gr.Column():
+ with gr.Row():
+ original_ttsvoice = gr.Audio()
+ ttsvoice = gr.Audio()
+
+ button_test.click(
+ SoniTr.vci.make_test,
+ inputs=[
+ text_test,
+ tts_test,
+ model_test,
+ index_test,
+ pitch_test,
+ pitch_alg_test,
+ ],
+ outputs=[ttsvoice, original_ttsvoice],
+ )
+
+ download_button.click(
+ download_list,
+ [url_links],
+ [download_finish],
+ queue=False
+ ).then(
+ update_models,
+ [],
+ [
+ elem["model"] for elem in configs_storage
+ ] + [model_test] + [
+ elem["index"] for elem in configs_storage
+ ] + [index_test],
+ )
+
+ with gr.Tab(lg_conf["tab_help"]):
+ gr.Markdown(lg_conf["tutorial"])
+ gr.Markdown(news)
+
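+ # Play a short alert when a task finishes; yielding None first
+ # resets the hidden audio component before the alert autoplays.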
+ def play_sound_alert(play_sound):
+
+ if not play_sound:
+ return None
+
+ # silent_sound = "assets/empty_audio.mp3"
+ sound_alert = "assets/sound_alert.mp3"
+
+ time.sleep(0.25)
+ # yield silent_sound
+ yield None
+
+ time.sleep(0.25)
+ yield sound_alert
+
+ sound_alert_notification = gr.Audio(
+ value=None,
+ type="filepath",
+ format="mp3",
+ autoplay=True,
+ visible=False,
+ )
+
+ if logs_in_gui:
+ logger.info("Logs in gui need public url")
+
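+ # Tee stdout to output.log so the GUI can display it.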
+ class Logger:
+ def __init__(self, filename):
+ self.terminal = sys.stdout
+ self.log = open(filename, "w")
+
+ def write(self, message):
+ self.terminal.write(message)
+ self.log.write(message)
+
+ def flush(self):
+ self.terminal.flush()
+ self.log.flush()
+
+ def isatty(self):
+ return False
+
+ sys.stdout = Logger("output.log")
+
+ def read_logs():
+ sys.stdout.flush()
+ with open("output.log", "r") as f:
+ return f.read()
+
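+ # Poll output.log once per second and mirror it in the Logs box.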
+ with gr.Accordion("Logs", open=False):
+ logs = gr.Textbox(label=">>>")
+ app.load(read_logs, None, logs, every=1)
+
+ if SoniTr.tts_info.xtts_enabled:
+ # Update tts list
+ def update_tts_list():
+ update_dict = {
+ f"tts_voice{i:02d}": gr.update(choices=SoniTr.tts_info.tts_list())
+ for i in range(MAX_TTS)
+ }
+ update_dict["tts_documents"] = gr.update(
+ choices=list(
+ filter(
+ lambda x: x != "_XTTS_/AUTOMATIC.wav",
+ SoniTr.tts_info.tts_list(),
+ )
+ )
+ )
+ return list(update_dict.values())
+
+ create_xtts_wav.click(
+ create_wav_file_vc,
+ inputs=[
+ wav_speaker_name,
+ wav_speaker_file,
+ wav_speaker_start,
+ wav_speaker_end,
+ wav_speaker_dir,
+ wav_speaker_dereverb,
+ ],
+ outputs=[wav_speaker_output],
+ ).then(
+ update_tts_list,
+ None,
+ [
+ tts_voice00,
+ tts_voice01,
+ tts_voice02,
+ tts_voice03,
+ tts_voice04,
+ tts_voice05,
+ tts_voice06,
+ tts_voice07,
+ tts_voice08,
+ tts_voice09,
+ tts_voice10,
+ tts_voice11,
+ tts_documents,
+ ],
+ )
+
+ # Run translate text
+ subs_button.click(
+ SoniTr.batch_multilingual_media_conversion,
+ inputs=[
+ video_input,
+ blink_input,
+ directory_input,
+ HFKEY,
+ PREVIEW,
+ WHISPER_MODEL_SIZE,
+ batch_size,
+ compute_type,
+ SOURCE_LANGUAGE,
+ TRANSLATE_AUDIO_TO,
+ min_speakers,
+ max_speakers,
+ tts_voice00,
+ tts_voice01,
+ tts_voice02,
+ tts_voice03,
+ tts_voice04,
+ tts_voice05,
+ tts_voice06,
+ tts_voice07,
+ tts_voice08,
+ tts_voice09,
+ tts_voice10,
+ tts_voice11,
+ VIDEO_OUTPUT_NAME,
+ AUDIO_MIX,
+ audio_accelerate,
+ acceleration_rate_regulation_gui,
+ volume_original_mix,
+ volume_translated_mix,
+ sub_type_output,
+ edit_sub_check, # True whenever this button is visible
+ dummy_false_check, # always False here
+ subs_edit_space,
+ avoid_overlap_gui,
+ vocal_refinement_gui,
+ literalize_numbers_gui,
+ segment_duration_limit_gui,
+ diarization_process_dropdown,
+ translate_process_dropdown,
+ input_srt,
+ main_output_type,
+ main_voiceless_track,
+ voice_imitation_gui,
+ voice_imitation_max_segments_gui,
+ voice_imitation_vocals_dereverb_gui,
+ voice_imitation_remove_previous_gui,
+ voice_imitation_method_gui,
+ wav_speaker_dereverb,
+ text_segmentation_scale_gui,
+ divide_text_segments_by_gui,
+ soft_subtitles_to_video_gui,
+ burn_subtitles_to_video_gui,
+ enable_cache_gui,
+ enable_custom_voice,
+ workers_custom_voice,
+ is_gui_dummy_check,
+ ],
+ outputs=subs_edit_space,
+ ).then(
+ play_sound_alert, [play_sound_gui], [sound_alert_notification]
+ )
+
+ # Run translate tts and complete
+ video_button.click(
+ SoniTr.batch_multilingual_media_conversion,
+ inputs=[
+ video_input,
+ blink_input,
+ directory_input,
+ HFKEY,
+ PREVIEW,
+ WHISPER_MODEL_SIZE,
+ batch_size,
+ compute_type,
+ SOURCE_LANGUAGE,
+ TRANSLATE_AUDIO_TO,
+ min_speakers,
+ max_speakers,
+ tts_voice00,
+ tts_voice01,
+ tts_voice02,
+ tts_voice03,
+ tts_voice04,
+ tts_voice05,
+ tts_voice06,
+ tts_voice07,
+ tts_voice08,
+ tts_voice09,
+ tts_voice10,
+ tts_voice11,
+ VIDEO_OUTPUT_NAME,
+ AUDIO_MIX,
+ audio_accelerate,
+ acceleration_rate_regulation_gui,
+ volume_original_mix,
+ volume_translated_mix,
+ sub_type_output,
+ dummy_false_check,
+ edit_sub_check,
+ subs_edit_space,
+ avoid_overlap_gui,
+ vocal_refinement_gui,
+ literalize_numbers_gui,
+ segment_duration_limit_gui,
+ diarization_process_dropdown,
+ translate_process_dropdown,
+ input_srt,
+ main_output_type,
+ main_voiceless_track,
+ voice_imitation_gui,
+ voice_imitation_max_segments_gui,
+ voice_imitation_vocals_dereverb_gui,
+ voice_imitation_remove_previous_gui,
+ voice_imitation_method_gui,
+ wav_speaker_dereverb,
+ text_segmentation_scale_gui,
+ divide_text_segments_by_gui,
+ soft_subtitles_to_video_gui,
+ burn_subtitles_to_video_gui,
+ enable_cache_gui,
+ enable_custom_voice,
+ workers_custom_voice,
+ is_gui_dummy_check,
+ ],
+ outputs=video_output,
+ trigger_mode="multiple",
+ ).then(
+ play_sound_alert, [play_sound_gui], [sound_alert_notification]
+ )
+
+ # Run docs process
+ docs_button.click(
+ SoniTr.multilingual_docs_conversion,
+ inputs=[
+ text_docs,
+ input_docs,
+ directory_input_docs,
+ docs_SOURCE_LANGUAGE,
+ docs_TRANSLATE_TO,
+ tts_documents,
+ docs_OUTPUT_NAME,
+ docs_translate_process_dropdown,
+ docs_output_type,
+ docs_chunk_size,
+ enable_custom_voice,
+ workers_custom_voice,
+ start_page_gui,
+ end_page_gui,
+ videobook_width_gui,
+ videobook_height_gui,
+ videobook_bcolor_gui,
+ docs_dummy_check,
+ ],
+ outputs=docs_output,
+ trigger_mode="multiple",
+ ).then(
+ play_sound_alert, [play_sound_gui], [sound_alert_notification]
+ )
+
+ return app
+
+
+def get_language_config(language_data, language=None, base_key="english"):
+ base_lang = language_data.get(base_key)
+
+ if language not in language_data:
+ logger.error(
+ f"Language {language} not found, defaulting to {base_key}"
+ )
+ return base_lang
+
+ lg_conf = language_data.get(language, {})
+ lg_conf.update((k, v) for k, v in base_lang.items() if k not in lg_conf)
+
+ return lg_conf
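+
+ # Example: keys missing from a language entry fall back to the
+ # English base config:
+ #   lg_conf = get_language_config(language_data, language="spanish")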
+
+
+def create_parser():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
+ parser.add_argument(
+ "--theme",
+ type=str,
+ default="Taithrah/Minimal",
+ help=(
+ "Specify the theme; find themes in "
+ "https://huggingface.co/spaces/gradio/theme-gallery;"
+ " Example: --theme aliabid94/new-theme"
+ ),
+ )
+ parser.add_argument(
+ "--public_url",
+ action="store_true",
+ default=False,
+ help="Enable public link",
+ )
+ parser.add_argument(
+ "--logs_in_gui",
+ action="store_true",
+ default=False,
+ help="Displays the operations performed in Logs",
+ )
+ parser.add_argument(
+ "--verbosity_level",
+ type=str,
+ default="info",
+ help=(
+ "Set logger verbosity level: "
+ "debug, info, warning, error, or critical"
+ ),
+ )
+ parser.add_argument(
+ "--language",
+ type=str,
+ default="english",
+ help=" Select the language of the interface: english, spanish",
+ )
+ parser.add_argument(
+ "--cpu_mode",
+ action="store_true",
+ default=False,
+ help="Enable CPU mode to run the program without utilizing GPU acceleration.",
+ )
+ return parser
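+
+ # Example CLI usage (all flags optional):
+ #   python app_rvc.py --theme Taithrah/Minimal --language english --public_url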
+
+
+if __name__ == "__main__":
+
+ parser = create_parser()
+
+ args = parser.parse_args()
+ # Simulating command-line arguments
+ # args_list = "--theme aliabid94/new-theme --public_url".split()
+ # args = parser.parse_args(args_list)
+
+ set_logging_level(args.verbosity_level)
+
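+ # Download the MDX-Net (UVR) audio separation models before startup.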
+ for id_model in UVR_MODELS:
+ download_manager(
+ os.path.join(MDX_DOWNLOAD_LINK, id_model), mdxnet_models_dir
+ )
+
+ models_path, index_path = upload_model_list()
+
+ SoniTr = SoniTranslate(cpu_mode=args.cpu_mode)
+
+ lg_conf = get_language_config(language_data, language=args.language)
+
+ app = create_gui(args.theme, logs_in_gui=args.logs_in_gui)
+
+ app.queue()
+
+ app.launch(
+ max_threads=1,
+ share=args.public_url,
+ show_error=True,
+ quiet=False,
+ debug=logger.isEnabledFor(logging.DEBUG),
+ )