import gradio as gr
import os
os.system("pip install -q piper-tts==1.2.0")
os.system("pip install -q -r requirements_xtts.txt")
os.system("pip install -q TTS==0.21.1  --no-deps")
import spaces
import torch
if os.environ.get("ZERO_GPU") != "TRUE" and torch.cuda.is_available():
    # onnxruntime GPU
    os.system("pip install ort-nightly-gpu --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ort-cuda-12-nightly/pypi/simple/")
import librosa
from soni_translate.logging_setup import (
    logger,
    set_logging_level,
    configure_logging_libs,
); configure_logging_libs() # noqa
import whisperx
import os
from soni_translate.audio_segments import create_translated_audio
from soni_translate.text_to_speech import (
    audio_segmentation_to_voice,
    edge_tts_voices_list,
    coqui_xtts_voices_list,
    piper_tts_voices_list,
    create_wav_file_vc,
    accelerate_segments,
)
from soni_translate.translate_segments import (
    translate_text,
    TRANSLATION_PROCESS_OPTIONS,
    DOCS_TRANSLATION_PROCESS_OPTIONS
)
from soni_translate.preprocessor import (
    audio_video_preprocessor,
    audio_preprocessor,
)
from soni_translate.postprocessor import (
    OUTPUT_TYPE_OPTIONS,
    DOCS_OUTPUT_TYPE_OPTIONS,
    sound_separate,
    get_no_ext_filename,
    media_out,
    get_subtitle_speaker,
)
from soni_translate.language_configuration import (
    LANGUAGES,
    UNIDIRECTIONAL_L_LIST,
    LANGUAGES_LIST,
    BARK_VOICES_LIST,
    VITS_VOICES_LIST,
    OPENAI_TTS_MODELS,
)
from soni_translate.utils import (
    remove_files,
    download_list,
    upload_model_list,
    download_manager,
    run_command,
    is_audio_file,
    is_subtitle_file,
    copy_files,
    get_valid_files,
    get_link_list,
    remove_directory_contents,
)
from soni_translate.mdx_net import (
    UVR_MODELS,
    MDX_DOWNLOAD_LINK,
    mdxnet_models_dir,
)
from soni_translate.speech_segmentation import (
    ASR_MODEL_OPTIONS,
    COMPUTE_TYPE_GPU,
    COMPUTE_TYPE_CPU,
    find_whisper_models,
    transcribe_speech,
    align_speech,
    diarize_speech,
    diarization_models,
)
from soni_translate.text_multiformat_processor import (
    BORDER_COLORS,
    srt_file_to_segments,
    document_preprocessor,
    determine_chunk_size,
    plain_text_to_segments,
    segments_to_plain_text,
    process_subtitles,
    linguistic_level_segments,
    break_aling_segments,
    doc_to_txtximg_pages,
    page_data_to_segments,
    update_page_data,
    fix_timestamps_docs,
    create_video_from_images,
    merge_video_and_audio,
)
from soni_translate.languages_gui import language_data, news
import copy
import logging
import json
from pydub import AudioSegment
from voice_main import ClassVoices
import argparse
import time
import hashlib
import sys

directories = [
    "downloads",
    "logs",
    "weights",
    "clean_song_output",
    "_XTTS_",
    f"audio2{os.sep}audio",
    "audio",
    "outputs",
]
[
    os.makedirs(directory)
    for directory in directories
    if not os.path.exists(directory)
]


class TTS_Info:
    def __init__(self, piper_enabled, xtts_enabled):
        self.list_edge = edge_tts_voices_list()
        self.list_bark = list(BARK_VOICES_LIST.keys())
        self.list_vits = list(VITS_VOICES_LIST.keys())
        self.list_openai_tts = OPENAI_TTS_MODELS
        self.piper_enabled = piper_enabled
        self.list_vits_onnx = (
            piper_tts_voices_list() if self.piper_enabled else []
        )
        self.xtts_enabled = xtts_enabled

    def tts_list(self):
        self.list_coqui_xtts = (
            coqui_xtts_voices_list() if self.xtts_enabled else []
        )
        list_tts = self.list_coqui_xtts + sorted(
            self.list_edge
            + (self.list_bark if os.environ.get("ZERO_GPU") != "TRUE" else [])
            + self.list_vits
            + self.list_openai_tts
            + self.list_vits_onnx
        )
        return list_tts


def prog_disp(msg, percent, is_gui, progress=None):
    logger.info(msg)
    if is_gui:
        progress(percent, desc=msg)


def warn_disp(wrn_lang, is_gui):
    logger.warning(wrn_lang)
    if is_gui:
        gr.Warning(wrn_lang)


class SoniTrCache:
    def __init__(self):
        self.cache = {
            'media': [[]],
            'refine_vocals': [],
            'transcript_align': [],
            'break_align': [],
            'diarize': [],
            'translate': [],
            'subs_and_edit': [],
            'tts': [],
            'acc_and_vc': [],
            'mix_aud': [],
            'output': []
        }

        self.cache_data = {
            'media': [],
            'refine_vocals': [],
            'transcript_align': [],
            'break_align': [],
            'diarize': [],
            'translate': [],
            'subs_and_edit': [],
            'tts': [],
            'acc_and_vc': [],
            'mix_aud': [],
            'output': []
        }

        self.cache_keys = list(self.cache.keys())
        self.first_task = self.cache_keys[0]
        self.last_task = self.cache_keys[-1]

        self.pre_step = None
        self.pre_params = []

    def set_variable(self, variable_name, value):
        setattr(self, variable_name, value)

    def task_in_cache(self, step: str, params: list, previous_step_data: dict):

        self.pre_step_cache = None

        if step == self.first_task:
            self.pre_step = None

        if self.pre_step:
            self.cache[self.pre_step] = self.pre_params

            # Fill data in cache
            self.cache_data[self.pre_step] = copy.deepcopy(previous_step_data)

        self.pre_params = params
        # logger.debug(f"Step: {str(step)}, Cache params: {str(self.cache)}")
        if params == self.cache[step]:
            logger.debug(f"In cache: {str(step)}")

            # Set the var needed for next step
            # Recovery from cache_data the current step
            for key, value in self.cache_data[step].items():
                self.set_variable(key, copy.deepcopy(value))
                logger.debug(
                    f"Chache load: {str(key)}"
                )

            self.pre_step = step
            return True

        else:
            logger.debug(f"Flush next and caching {str(step)}")
            selected_index = self.cache_keys.index(step)

            for idx, key in enumerate(self.cache.keys()):
                if idx >= selected_index:
                    self.cache[key] = []
                    self.cache_data[key] = {}

            # The last is now previous
            self.pre_step = step
            return False

    def clear_cache(self, media, force=False):

        self.cache["media"] = (
            self.cache["media"] if len(self.cache["media"]) else [[]]
        )

        if media != self.cache["media"][0] or force:

            # Clear cache
            self.cache = {key: [] for key in self.cache}
            self.cache["media"] = [[]]

            logger.info("Cache flushed")


def get_hash(filepath):
    with open(filepath, 'rb') as f:
        file_hash = hashlib.blake2b()
        while chunk := f.read(8192):
            file_hash.update(chunk)

    return file_hash.hexdigest()[:18]


def check_openai_api_key():
    if not os.environ.get("OPENAI_API_KEY"):
        raise ValueError(
            "To use GPT for translation, please set up your OpenAI API key "
            "as an environment variable in Linux as follows: "
            "export OPENAI_API_KEY='your-api-key-here'. Or change the "
            "translation process in Advanced settings."
        )


class SoniTranslate(SoniTrCache):
    def __init__(self, cpu_mode=False):
        super().__init__()
        if cpu_mode:
            os.environ["SONITR_DEVICE"] = "cpu"
        else:
            os.environ["SONITR_DEVICE"] = (
                "cuda" if torch.cuda.is_available() else "cpu"
            )

        self.device = os.environ.get("SONITR_DEVICE")
        self.device = self.device if os.environ.get("ZERO_GPU") != "TRUE" else "cuda"
        self.result_diarize = None
        self.align_language = None
        self.result_source_lang = None
        self.edit_subs_complete = False
        self.voiceless_id = None
        self.burn_subs_id = None

        self.vci = ClassVoices(only_cpu=cpu_mode)

        self.tts_voices = self.get_tts_voice_list()

        logger.info(f"Working in: {self.device}")

    def get_tts_voice_list(self):
        try:
            from piper import PiperVoice  # noqa

            piper_enabled = True
            logger.info("PIPER TTS enabled")
        except Exception as error:
            logger.debug(str(error))
            piper_enabled = False
            logger.info("PIPER TTS disabled")
        try:
            from TTS.api import TTS  # noqa

            xtts_enabled = True
            logger.info("Coqui XTTS enabled")
            logger.info(
                "In this app, by using Coqui TTS (text-to-speech), you "
                "acknowledge and agree to the license.\n"
                "You confirm that you have read, understood, and agreed "
                "to the Terms and Conditions specified at the following "
                "link:\nhttps://coqui.ai/cpml.txt."
            )
            os.environ["COQUI_TOS_AGREED"] = "1"
        except Exception as error:
            logger.debug(str(error))
            xtts_enabled = False
            logger.info("Coqui XTTS disabled")

        self.tts_info = TTS_Info(piper_enabled, xtts_enabled)

        return self.tts_info.tts_list()

    def batch_multilingual_media_conversion(self, *kwargs):
        # logger.debug(str(kwargs))

        media_file_arg = kwargs[0] if kwargs[0] is not None else []

        link_media_arg = kwargs[1]
        link_media_arg = [x.strip() for x in link_media_arg.split(',')]
        link_media_arg = get_link_list(link_media_arg)

        path_arg = kwargs[2]
        path_arg = [x.strip() for x in path_arg.split(',')]
        path_arg = get_valid_files(path_arg)

        edit_text_arg = kwargs[31]
        get_text_arg = kwargs[32]

        is_gui_arg = kwargs[-1]

        kwargs = kwargs[3:]

        media_batch = media_file_arg + link_media_arg + path_arg
        media_batch = list(filter(lambda x: x != "", media_batch))
        media_batch = media_batch if media_batch else [None]
        logger.debug(str(media_batch))

        remove_directory_contents("outputs")

        if edit_text_arg or get_text_arg:
            return self.multilingual_media_conversion(
                media_batch[0], "", "", *kwargs
            )

        if "SET_LIMIT" == os.getenv("DEMO") or "TRUE" == os.getenv("ZERO_GPU"):
            media_batch = [media_batch[0]]

        result = []
        for media in media_batch:
            # Call the nested function with the parameters
            output_file = self.multilingual_media_conversion(
                media, "", "", *kwargs
            )

            if isinstance(output_file, str):
                output_file = [output_file]
            result.extend(output_file)

            if is_gui_arg and len(media_batch) > 1:
                gr.Info(f"Done: {os.path.basename(output_file[0])}")

        return result

    def multilingual_media_conversion(
        self,
        media_file=None,
        link_media="",
        directory_input="",
        YOUR_HF_TOKEN="",
        preview=False,
        transcriber_model="large-v3",
        batch_size=4,
        compute_type="auto",
        origin_language="Automatic detection",
        target_language="English (en)",
        min_speakers=1,
        max_speakers=1,
        tts_voice00="en-US-EmmaMultilingualNeural-Female",
        tts_voice01="en-US-AndrewMultilingualNeural-Male",
        tts_voice02="en-US-AvaMultilingualNeural-Female",
        tts_voice03="en-US-BrianMultilingualNeural-Male",
        tts_voice04="de-DE-SeraphinaMultilingualNeural-Female",
        tts_voice05="de-DE-FlorianMultilingualNeural-Male",
        tts_voice06="fr-FR-VivienneMultilingualNeural-Female",
        tts_voice07="fr-FR-RemyMultilingualNeural-Male",
        tts_voice08="en-US-EmmaMultilingualNeural-Female",
        tts_voice09="en-US-AndrewMultilingualNeural-Male",
        tts_voice10="en-US-EmmaMultilingualNeural-Female",
        tts_voice11="en-US-AndrewMultilingualNeural-Male",
        video_output_name="",
        mix_method_audio="Adjusting volumes and mixing audio",
        max_accelerate_audio=2.1,
        acceleration_rate_regulation=False,
        volume_original_audio=0.25,
        volume_translated_audio=1.80,
        output_format_subtitle="srt",
        get_translated_text=False,
        get_video_from_text_json=False,
        text_json="{}",
        avoid_overlap=False,
        vocal_refinement=False,
        literalize_numbers=True,
        segment_duration_limit=15,
        diarization_model="pyannote_2.1",
        translate_process="google_translator_batch",
        subtitle_file=None,
        output_type="video (mp4)",
        voiceless_track=False,
        voice_imitation=False,
        voice_imitation_max_segments=3,
        voice_imitation_vocals_dereverb=False,
        voice_imitation_remove_previous=True,
        voice_imitation_method="freevc",
        dereverb_automatic_xtts=True,
        text_segmentation_scale="sentence",
        divide_text_segments_by="",
        soft_subtitles_to_video=True,
        burn_subtitles_to_video=False,
        enable_cache=True,
        custom_voices=False,
        custom_voices_workers=1,
        is_gui=False,
        progress=gr.Progress(),
    ):
        if not YOUR_HF_TOKEN:
            YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")
            if diarization_model == "disable" or max_speakers == 1:
                if YOUR_HF_TOKEN is None:
                    YOUR_HF_TOKEN = ""
            elif not YOUR_HF_TOKEN:
                raise ValueError("No valid Hugging Face token")
            else:
                os.environ["YOUR_HF_TOKEN"] = YOUR_HF_TOKEN

        if (
            "gpt" in translate_process
            or transcriber_model == "OpenAI_API_Whisper"
            or "OpenAI-TTS" in tts_voice00
        ):
            check_openai_api_key()

        if media_file is None:
            media_file = (
                directory_input
                if os.path.exists(directory_input)
                else link_media
            )
        media_file = (
            media_file if isinstance(media_file, str) else media_file.name
        )

        if is_subtitle_file(media_file):
            subtitle_file = media_file
            media_file = ""

        if media_file is None:
            media_file = ""

        if not origin_language:
            origin_language = "Automatic detection"

        if origin_language in UNIDIRECTIONAL_L_LIST and not subtitle_file:
            raise ValueError(
                f"The language '{origin_language}' "
                "is not supported for transcription (ASR)."
            )

        if get_translated_text:
            self.edit_subs_complete = False
        if get_video_from_text_json:
            if not self.edit_subs_complete:
                raise ValueError("Generate the transcription first.")

        if (
            ("sound" in output_type or output_type == "raw media")
            and (get_translated_text or get_video_from_text_json)
        ):
            raise ValueError(
                "Please disable 'edit generate subtitles' "
                f"first to acquire the {output_type}."
            )

        TRANSLATE_AUDIO_TO = LANGUAGES[target_language]
        SOURCE_LANGUAGE = LANGUAGES[origin_language]

        if (
            transcriber_model == "OpenAI_API_Whisper"
            and SOURCE_LANGUAGE == "zh-TW"
        ):
            logger.warning(
                "OpenAI API Whisper only supports Chinese (Simplified)."
            )
            SOURCE_LANGUAGE = "zh"

        if (
            text_segmentation_scale in ["word", "character"]
            and "subtitle" not in output_type
        ):
            wrn_lang = (
                "Text segmentation by words or characters is typically"
                " used for generating subtitles. If subtitles are not the"
                " intended output, consider selecting 'sentence' "
                "segmentation method to ensure optimal results."

            )
            warn_disp(wrn_lang, is_gui)

        if tts_voice00[:2].lower() != TRANSLATE_AUDIO_TO[:2].lower():
            wrn_lang = (
                "Make sure to select a 'TTS Speaker' suitable for"
                " the translation language to avoid errors with the TTS."
            )
            warn_disp(wrn_lang, is_gui)

        if "_XTTS_" in tts_voice00 and voice_imitation:
            wrn_lang = (
                "When you select XTTS, it is advisable "
                "to disable Voice Imitation."
            )
            warn_disp(wrn_lang, is_gui)

        if custom_voices and voice_imitation:
            wrn_lang = (
                "When you use R.V.C. models, it is advisable"
                " to disable Voice Imitation."
            )
            warn_disp(wrn_lang, is_gui)

        if not media_file and not subtitle_file:
            raise ValueError(
                "Specifify a media or SRT file in advanced settings"
            )

        if subtitle_file:
            subtitle_file = (
                subtitle_file
                if isinstance(subtitle_file, str)
                else subtitle_file.name
            )

        if subtitle_file and SOURCE_LANGUAGE == "Automatic detection":
            raise Exception(
                "To use an SRT file, you need to specify its "
                "original language (Source language)"
            )

        if not media_file and subtitle_file:
            diarization_model = "disable"
            media_file = "audio_support.wav"
            if not get_video_from_text_json:
                remove_files(media_file)
                srt_data = srt_file_to_segments(subtitle_file)
                total_duration = srt_data["segments"][-1]["end"] + 30.
                support_audio = AudioSegment.silent(
                    duration=int(total_duration * 1000)
                )
                support_audio.export(
                    media_file, format="wav"
                )
                logger.info("Supporting audio for the SRT file, created.")

        if "SET_LIMIT" == os.getenv("DEMO"):
            preview = True
            mix_method_audio = "Adjusting volumes and mixing audio"
            transcriber_model = "medium"
            logger.info(
                "DEMO; set preview=True; Generation is limited to "
                "10 seconds to prevent CPU errors. No limitations with GPU.\n"
                "DEMO; set Adjusting volumes and mixing audio\n"
                "DEMO; set whisper model to medium"
            )

        # Check GPU
        if self.device == "cpu" and compute_type not in COMPUTE_TYPE_CPU:
            logger.info("Compute type changed to float32")
            compute_type = "float32"

        base_video_file = "Video.mp4"
        base_audio_wav = "audio.wav"
        dub_audio_file = "audio_dub_solo.ogg"
        vocals_audio_file = "audio_Vocals_DeReverb.wav"
        voiceless_audio_file = "audio_Voiceless.wav"
        mix_audio_file = "audio_mix.mp3"
        vid_subs = "video_subs_file.mp4"
        video_output_file = "video_dub.mp4"

        if os.path.exists(media_file):
            media_base_hash = get_hash(media_file)
        else:
            media_base_hash = media_file
        self.clear_cache(media_base_hash, force=(not enable_cache))

        if not get_video_from_text_json:
            self.result_diarize = (
                self.align_language
            ) = self.result_source_lang = None
            if not self.task_in_cache("media", [media_base_hash, preview], {}):
                if is_audio_file(media_file):
                    prog_disp(
                        "Processing audio...", 0.15, is_gui, progress=progress
                    )
                    audio_preprocessor(preview, media_file, base_audio_wav)
                else:
                    prog_disp(
                        "Processing video...", 0.15, is_gui, progress=progress
                    )
                    audio_video_preprocessor(
                        preview, media_file, base_video_file, base_audio_wav
                    )
                logger.debug("Set file complete.")

            if "sound" in output_type:
                prog_disp(
                    "Separating sounds in the file...",
                    0.50,
                    is_gui,
                    progress=progress
                )
                separate_out = sound_separate(base_audio_wav, output_type)
                final_outputs = []
                for out in separate_out:
                    final_name = media_out(
                        media_file,
                        f"{get_no_ext_filename(out)}",
                        video_output_name,
                        "wav",
                        file_obj=out,
                    )
                    final_outputs.append(final_name)
                logger.info(f"Done: {str(final_outputs)}")
                return final_outputs

            if output_type == "raw media":
                output = media_out(
                    media_file,
                    "raw_media",
                    video_output_name,
                    "wav" if is_audio_file(media_file) else "mp4",
                    file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file,
                )
                logger.info(f"Done: {output}")
                return output

            if os.environ.get("IS_DEMO") == "TRUE":
                duration_verify = librosa.get_duration(filename=base_audio_wav)
                logger.info(f"Duration: {duration_verify} seconds")
                if duration_verify > 1500:
                    raise RuntimeError(
                        "The audio is too long to process in this demo. Alternatively, you"
                        " can install the app locally or use the Colab notebook available "
                        "in the SoniTranslate repository."
                    )
                elif duration_verify > 300:
                    tts_voices_list = [
                        tts_voice00, tts_voice01, tts_voice02, tts_voice03, tts_voice04,
                        tts_voice05, tts_voice06, tts_voice07, tts_voice08, tts_voice09,
                        tts_voice10, tts_voice11
                    ]
                    
                    for tts_voice_ in tts_voices_list:
                        if "_XTTS_" in tts_voice_:
                            raise RuntimeError(
                                "XTTS is too slow to be used for audio longer than 5 "
                                "minutes in this demo. Alternatively, you can install "
                                "the app locally or use the Colab notebook available in"
                                " the SoniTranslate repository."
                            )
            
            if not self.task_in_cache("refine_vocals", [vocal_refinement], {}):
                self.vocals = None
                if vocal_refinement:
                    try:
                        from soni_translate.mdx_net import process_uvr_task
                        _, _, _, _, file_vocals = process_uvr_task(
                            orig_song_path=base_audio_wav,
                            main_vocals=False,
                            dereverb=True,
                            remove_files_output_dir=True,
                        )
                        remove_files(vocals_audio_file)
                        copy_files(file_vocals, ".")
                        self.vocals = vocals_audio_file
                    except Exception as error:
                        logger.error(str(error))

            if not self.task_in_cache("transcript_align", [
                subtitle_file,
                SOURCE_LANGUAGE,
                transcriber_model,
                compute_type,
                batch_size,
                literalize_numbers,
                segment_duration_limit,
                (
                    "l_unit"
                    if text_segmentation_scale in ["word", "character"]
                    and subtitle_file
                    else "sentence"
                )
            ], {"vocals": self.vocals}):
                if subtitle_file:
                    prog_disp(
                        "From SRT file...", 0.30, is_gui, progress=progress
                    )
                    audio = whisperx.load_audio(
                        base_audio_wav if not self.vocals else self.vocals
                    )
                    self.result = srt_file_to_segments(subtitle_file)
                    self.result["language"] = SOURCE_LANGUAGE
                else:
                    prog_disp(
                        "Transcribing...", 0.30, is_gui, progress=progress
                    )
                    SOURCE_LANGUAGE = (
                        None
                        if SOURCE_LANGUAGE == "Automatic detection"
                        else SOURCE_LANGUAGE
                    )
                    audio, self.result = transcribe_speech(
                        base_audio_wav if not self.vocals else self.vocals,
                        transcriber_model,
                        compute_type,
                        batch_size,
                        SOURCE_LANGUAGE,
                        literalize_numbers,
                        segment_duration_limit,
                    )
                logger.debug(
                    "Transcript complete, "
                    f"segments count {len(self.result['segments'])}"
                )

                self.align_language = self.result["language"]
                if (
                    not subtitle_file
                    or text_segmentation_scale in ["word", "character"]
                ):
                    prog_disp("Aligning...", 0.45, is_gui, progress=progress)
                    try:
                        if self.align_language in ["vi"]:
                            logger.info(
                                "Deficient alignment for the "
                                f"{self.align_language} language, skipping the"
                                " process. It is suggested to reduce the "
                                "duration of the segments as an alternative."
                            )
                        else:
                            self.result = align_speech(audio, self.result)
                            logger.debug(
                                "Align complete, "
                                f"segments count {len(self.result['segments'])}"
                            )
                    except Exception as error:
                        logger.error(str(error))

            if self.result["segments"] == []:
                raise ValueError("No active speech found in audio")

            if not self.task_in_cache("break_align", [
                divide_text_segments_by,
                text_segmentation_scale,
                self.align_language
            ], {
                "result": self.result,
                "align_language": self.align_language
            }):
                if self.align_language in ["ja", "zh", "zh-TW"]:
                    divide_text_segments_by += "|!|?|...|。"
                if text_segmentation_scale in ["word", "character"]:
                    self.result = linguistic_level_segments(
                        self.result,
                        text_segmentation_scale,
                    )
                elif divide_text_segments_by:
                    try:
                        self.result = break_aling_segments(
                            self.result,
                            break_characters=divide_text_segments_by,
                        )
                    except Exception as error:
                        logger.error(str(error))

            if not self.task_in_cache("diarize", [
                min_speakers,
                max_speakers,
                YOUR_HF_TOKEN[:len(YOUR_HF_TOKEN)//2],
                diarization_model
            ], {
                "result": self.result
            }):
                prog_disp("Diarizing...", 0.60, is_gui, progress=progress)
                diarize_model_select = diarization_models[diarization_model]
                self.result_diarize = diarize_speech(
                    base_audio_wav if not self.vocals else self.vocals,
                    self.result,
                    min_speakers,
                    max_speakers,
                    YOUR_HF_TOKEN,
                    diarize_model_select,
                )
                logger.debug("Diarize complete")
            self.result_source_lang = copy.deepcopy(self.result_diarize)

            if not self.task_in_cache("translate", [
                TRANSLATE_AUDIO_TO,
                translate_process
            ], {
                "result_diarize": self.result_diarize
            }):
                prog_disp("Translating...", 0.70, is_gui, progress=progress)
                lang_source = (
                    self.align_language
                    if self.align_language
                    else SOURCE_LANGUAGE
                )
                self.result_diarize["segments"] = translate_text(
                    self.result_diarize["segments"],
                    TRANSLATE_AUDIO_TO,
                    translate_process,
                    chunk_size=1800,
                    source=lang_source,
                )
                logger.debug("Translation complete")
                logger.debug(self.result_diarize)

        if get_translated_text:

            json_data = []
            for segment in self.result_diarize["segments"]:
                start = segment["start"]
                text = segment["text"]
                speaker = int(segment.get("speaker", "SPEAKER_00")[-2:]) + 1
                json_data.append(
                    {"start": start, "text": text, "speaker": speaker}
                )

            # Convert list of dictionaries to a JSON string with indentation
            json_string = json.dumps(json_data, indent=2)
            logger.info("Done")
            self.edit_subs_complete = True
            return json_string.encode().decode("unicode_escape")

        if get_video_from_text_json:

            if self.result_diarize is None:
                raise ValueError("Generate the transcription first.")
            # with open('text_json.json', 'r') as file:
            text_json_loaded = json.loads(text_json)
            for i, segment in enumerate(self.result_diarize["segments"]):
                segment["text"] = text_json_loaded[i]["text"]
                segment["speaker"] = "SPEAKER_{:02d}".format(
                    int(text_json_loaded[i]["speaker"]) - 1
                )

        # Write subtitle
        if not self.task_in_cache("subs_and_edit", [
            copy.deepcopy(self.result_diarize),
            output_format_subtitle,
            TRANSLATE_AUDIO_TO
        ], {
            "result_diarize": self.result_diarize
        }):
            if output_format_subtitle == "disable":
                self.sub_file = "sub_tra.srt"
            elif output_format_subtitle != "ass":
                self.sub_file = process_subtitles(
                    self.result_source_lang,
                    self.align_language,
                    self.result_diarize,
                    output_format_subtitle,
                    TRANSLATE_AUDIO_TO,
                )

            # Need task
            if output_format_subtitle != "srt":
                _ = process_subtitles(
                    self.result_source_lang,
                    self.align_language,
                    self.result_diarize,
                    "srt",
                    TRANSLATE_AUDIO_TO,
                )

            if output_format_subtitle == "ass":
                convert_ori = "ffmpeg -i sub_ori.srt sub_ori.ass -y"
                convert_tra = "ffmpeg -i sub_tra.srt sub_tra.ass -y"
                self.sub_file = "sub_tra.ass"
                run_command(convert_ori)
                run_command(convert_tra)

        format_sub = (
            output_format_subtitle
            if output_format_subtitle != "disable"
            else "srt"
        )

        if output_type == "subtitle":

            out_subs = []
            tra_subs = media_out(
                media_file,
                TRANSLATE_AUDIO_TO,
                video_output_name,
                format_sub,
                file_obj=self.sub_file,
            )
            out_subs.append(tra_subs)

            ori_subs = media_out(
                media_file,
                self.align_language,
                video_output_name,
                format_sub,
                file_obj=f"sub_ori.{format_sub}",
            )
            out_subs.append(ori_subs)
            logger.info(f"Done: {out_subs}")
            return out_subs

        if output_type == "subtitle [by speaker]":
            output = get_subtitle_speaker(
                media_file,
                result=self.result_diarize,
                language=TRANSLATE_AUDIO_TO,
                extension=format_sub,
                base_name=video_output_name,
            )
            logger.info(f"Done: {str(output)}")
            return output

        if "video [subtitled]" in output_type:
            output = media_out(
                media_file,
                TRANSLATE_AUDIO_TO + "_subtitled",
                video_output_name,
                "wav" if is_audio_file(media_file) else (
                    "mkv" if "mkv" in output_type else "mp4"
                ),
                file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file,
                soft_subtitles=False if is_audio_file(media_file) else True,
                subtitle_files=output_format_subtitle,
            )
            msg_out = output[0] if isinstance(output, list) else output
            logger.info(f"Done: {msg_out}")
            return output

        if not self.task_in_cache("tts", [
            TRANSLATE_AUDIO_TO,
            tts_voice00,
            tts_voice01,
            tts_voice02,
            tts_voice03,
            tts_voice04,
            tts_voice05,
            tts_voice06,
            tts_voice07,
            tts_voice08,
            tts_voice09,
            tts_voice10,
            tts_voice11,
            dereverb_automatic_xtts
        ], {
            "sub_file": self.sub_file
        }):
            prog_disp("Text to speech...", 0.80, is_gui, progress=progress)
            self.valid_speakers = audio_segmentation_to_voice(
                self.result_diarize,
                TRANSLATE_AUDIO_TO,
                is_gui,
                tts_voice00,
                tts_voice01,
                tts_voice02,
                tts_voice03,
                tts_voice04,
                tts_voice05,
                tts_voice06,
                tts_voice07,
                tts_voice08,
                tts_voice09,
                tts_voice10,
                tts_voice11,
                dereverb_automatic_xtts,
            )

        if not self.task_in_cache("acc_and_vc", [
            max_accelerate_audio,
            acceleration_rate_regulation,
            voice_imitation,
            voice_imitation_max_segments,
            voice_imitation_remove_previous,
            voice_imitation_vocals_dereverb,
            voice_imitation_method,
            custom_voices,
            custom_voices_workers,
            copy.deepcopy(self.vci.model_config),
            avoid_overlap
        ], {
            "valid_speakers": self.valid_speakers
        }):
            audio_files, speakers_list = accelerate_segments(
                    self.result_diarize,
                    max_accelerate_audio,
                    self.valid_speakers,
                    acceleration_rate_regulation,
                )

            # Voice Imitation (Tone color converter)
            if voice_imitation:
                prog_disp(
                    "Voice Imitation...", 0.85, is_gui, progress=progress
                )
                from soni_translate.text_to_speech import toneconverter

                try:
                    toneconverter(
                        copy.deepcopy(self.result_diarize),
                        voice_imitation_max_segments,
                        voice_imitation_remove_previous,
                        voice_imitation_vocals_dereverb,
                        voice_imitation_method,
                    )
                except Exception as error:
                    logger.error(str(error))

            # custom voice
            if custom_voices:
                prog_disp(
                    "Applying customized voices...",
                    0.90,
                    is_gui,
                    progress=progress,
                )

                try:
                    self.vci(
                        audio_files,
                        speakers_list,
                        overwrite=True,
                        parallel_workers=custom_voices_workers,
                    )
                    self.vci.unload_models()
                except Exception as error:
                    logger.error(str(error))

            prog_disp(
                "Creating final translated video...",
                0.95,
                is_gui,
                progress=progress,
            )
            remove_files(dub_audio_file)
            create_translated_audio(
                self.result_diarize,
                audio_files,
                dub_audio_file,
                False,
                avoid_overlap,
            )

        # Voiceless track, change with file
        hash_base_audio_wav = get_hash(base_audio_wav)
        if voiceless_track:
            if self.voiceless_id != hash_base_audio_wav:
                from soni_translate.mdx_net import process_uvr_task

                try:
                    # voiceless_audio_file_dir = "clean_song_output/voiceless"
                    remove_files(voiceless_audio_file)
                    uvr_voiceless_audio_wav, _ = process_uvr_task(
                        orig_song_path=base_audio_wav,
                        song_id="voiceless",
                        only_voiceless=True,
                        remove_files_output_dir=False,
                    )
                    copy_files(uvr_voiceless_audio_wav, ".")
                    base_audio_wav = voiceless_audio_file
                    self.voiceless_id = hash_base_audio_wav

                except Exception as error:
                    logger.error(str(error))
            else:
                base_audio_wav = voiceless_audio_file

        if not self.task_in_cache("mix_aud", [
            mix_method_audio,
            volume_original_audio,
            volume_translated_audio,
            voiceless_track
        ], {}):
            # TYPE MIX AUDIO
            remove_files(mix_audio_file)
            command_volume_mix = f'ffmpeg -y -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[0:0]volume={volume_original_audio}[a];[1:0]volume={volume_translated_audio}[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio_file}'
            command_background_mix = f'ffmpeg -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio_file}'
            if mix_method_audio == "Adjusting volumes and mixing audio":
                # volume mix
                run_command(command_volume_mix)
            else:
                try:
                    # background mix
                    run_command(command_background_mix)
                except Exception as error_mix:
                    # volume mix except
                    logger.error(str(error_mix))
                    run_command(command_volume_mix)

        if "audio" in output_type or is_audio_file(media_file):
            output = media_out(
                media_file,
                TRANSLATE_AUDIO_TO,
                video_output_name,
                "wav" if "wav" in output_type else (
                    "ogg" if "ogg" in output_type else "mp3"
                ),
                file_obj=mix_audio_file,
                subtitle_files=output_format_subtitle,
            )
            msg_out = output[0] if isinstance(output, list) else output
            logger.info(f"Done: {msg_out}")
            return output

        hash_base_video_file = get_hash(base_video_file)

        if burn_subtitles_to_video:
            hashvideo_text = [
                hash_base_video_file,
                [seg["text"] for seg in self.result_diarize["segments"]]
            ]
            if self.burn_subs_id != hashvideo_text:
                try:
                    logger.info("Burn subtitles")
                    remove_files(vid_subs)
                    command = f"ffmpeg -i {base_video_file} -y -vf subtitles=sub_tra.srt -max_muxing_queue_size 9999 {vid_subs}"
                    run_command(command)
                    base_video_file = vid_subs
                    self.burn_subs_id = hashvideo_text
                except Exception as error:
                    logger.error(str(error))
            else:
                base_video_file = vid_subs

        if not self.task_in_cache("output", [
            hash_base_video_file,
            hash_base_audio_wav,
            burn_subtitles_to_video
        ], {}):
            # Merge new audio + video
            remove_files(video_output_file)
            run_command(
                f"ffmpeg -i {base_video_file} -i {mix_audio_file} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output_file}"
            )

        output = media_out(
            media_file,
            TRANSLATE_AUDIO_TO,
            video_output_name,
            "mkv" if "mkv" in output_type else "mp4",
            file_obj=video_output_file,
            soft_subtitles=soft_subtitles_to_video,
            subtitle_files=output_format_subtitle,
        )
        msg_out = output[0] if isinstance(output, list) else output
        logger.info(f"Done: {msg_out}")

        return output

    def hook_beta_processor(
        self,
        document,
        tgt_lang,
        translate_process,
        ori_lang,
        tts,
        name_final_file,
        custom_voices,
        custom_voices_workers,
        output_type,
        chunk_size,
        width,
        height,
        start_page,
        end_page,
        bcolor,
        is_gui,
        progress
    ):
        prog_disp("Processing pages...", 0.10, is_gui, progress=progress)
        doc_data = doc_to_txtximg_pages(document,  width, height, start_page, end_page, bcolor)
        result_diarize = page_data_to_segments(doc_data, 1700)

        prog_disp("Translating...", 0.20, is_gui, progress=progress)
        result_diarize["segments"] = translate_text(
            result_diarize["segments"],
            tgt_lang,
            translate_process,
            chunk_size=0,
            source=ori_lang,
        )
        chunk_size = (
            chunk_size if chunk_size else determine_chunk_size(tts)
        )
        doc_data = update_page_data(result_diarize, doc_data)

        prog_disp("Text to speech...", 0.30, is_gui, progress=progress)
        result_diarize = page_data_to_segments(doc_data, chunk_size)
        valid_speakers = audio_segmentation_to_voice(
            result_diarize,
            tgt_lang,
            is_gui,
            tts,
        )

        # fix format and set folder output
        audio_files, speakers_list = accelerate_segments(
                result_diarize,
                1.0,
                valid_speakers,
            )

        # custom voice
        if custom_voices:
            prog_disp(
                "Applying customized voices...",
                0.60,
                is_gui,
                progress=progress,
            )
            self.vci(
                audio_files,
                speakers_list,
                overwrite=True,
                parallel_workers=custom_voices_workers,
            )
            self.vci.unload_models()

        # Update time segments and not concat
        result_diarize = fix_timestamps_docs(result_diarize, audio_files)
        final_wav_file = "audio_book.wav"
        remove_files(final_wav_file)

        prog_disp("Creating audio file...", 0.70, is_gui, progress=progress)
        create_translated_audio(
            result_diarize, audio_files, final_wav_file, False
        )

        prog_disp("Creating video file...", 0.80, is_gui, progress=progress)
        video_doc = create_video_from_images(
                doc_data,
                result_diarize
        )

        # Merge video and audio
        prog_disp("Merging...", 0.90, is_gui, progress=progress)
        vid_out = merge_video_and_audio(video_doc, final_wav_file)

        # End
        output = media_out(
            document,
            tgt_lang,
            name_final_file,
            "mkv" if "mkv" in output_type else "mp4",
            file_obj=vid_out,
        )
        logger.info(f"Done: {output}")
        return output

    def multilingual_docs_conversion(
        self,
        string_text="",  # string
        document=None,  # doc path gui
        directory_input="",  # doc path
        origin_language="English (en)",
        target_language="English (en)",
        tts_voice00="en-US-EmmaMultilingualNeural-Female",
        name_final_file="",
        translate_process="google_translator",
        output_type="audio",
        chunk_size=None,
        custom_voices=False,
        custom_voices_workers=1,
        start_page=1,
        end_page=99999,
        width=1280,
        height=720,
        bcolor="dynamic",
        is_gui=False,
        progress=gr.Progress(),
    ):
        if "gpt" in translate_process:
            check_openai_api_key()

        SOURCE_LANGUAGE = LANGUAGES[origin_language]
        if translate_process != "disable_translation":
            TRANSLATE_AUDIO_TO = LANGUAGES[target_language]
        else:
            TRANSLATE_AUDIO_TO = SOURCE_LANGUAGE
            logger.info("No translation")
        if tts_voice00[:2].lower() != TRANSLATE_AUDIO_TO[:2].lower():
            logger.debug(
                "Make sure to select a 'TTS Speaker' suitable for the "
                "translation language to avoid errors with the TTS."
            )

        self.clear_cache(string_text, force=True)

        is_string = False
        if document is None:
            if os.path.exists(directory_input):
                document = directory_input
            else:
                document = string_text
                is_string = True
        document = document if isinstance(document, str) else document.name
        if not document:
            raise Exception("No data found")

        if os.environ.get("IS_DEMO") == "TRUE" and not is_string:
            raise RuntimeError(
                "This option is disabled in this demo. "
                "Alternatively, you can install "
                "the app locally or use the Colab notebook available in"
                " the SoniTranslate repository."
            )

        if "videobook" in output_type:
            if not document.lower().endswith(".pdf"):
                raise ValueError(
                    "Videobooks are only compatible with PDF files."
                )

            return self.hook_beta_processor(
                document,
                TRANSLATE_AUDIO_TO,
                translate_process,
                SOURCE_LANGUAGE,
                tts_voice00,
                name_final_file,
                custom_voices,
                custom_voices_workers,
                output_type,
                chunk_size,
                width,
                height,
                start_page,
                end_page,
                bcolor,
                is_gui,
                progress
            )

        # audio_wav = "audio.wav"
        final_wav_file = "audio_book.wav"

        prog_disp("Processing text...", 0.15, is_gui, progress=progress)
        result_file_path, result_text = document_preprocessor(
            document, is_string, start_page, end_page
        )

        if (
            output_type == "book (txt)"
            and translate_process == "disable_translation"
        ):
            return result_file_path

        if "SET_LIMIT" == os.getenv("DEMO"):
            result_text = result_text[:50]
            logger.info(
                "DEMO; Generation is limited to 50 characters to prevent "
                "CPU errors. No limitations with GPU.\n"
            )

        if translate_process != "disable_translation":
            # chunks text for translation
            result_diarize = plain_text_to_segments(result_text, 1700)
            prog_disp("Translating...", 0.30, is_gui, progress=progress)
            # not or iterative with 1700 chars
            result_diarize["segments"] = translate_text(
                result_diarize["segments"],
                TRANSLATE_AUDIO_TO,
                translate_process,
                chunk_size=0,
                source=SOURCE_LANGUAGE,
            )

            txt_file_path, result_text = segments_to_plain_text(result_diarize)

            if output_type == "book (txt)":
                return media_out(
                    result_file_path if is_string else document,
                    TRANSLATE_AUDIO_TO,
                    name_final_file,
                    "txt",
                    file_obj=txt_file_path,
                )

        # (TTS limits) plain text to result_diarize
        chunk_size = (
            chunk_size if chunk_size else determine_chunk_size(tts_voice00)
        )
        result_diarize = plain_text_to_segments(result_text, chunk_size)
        logger.debug(result_diarize)

        prog_disp("Text to speech...", 0.45, is_gui, progress=progress)
        valid_speakers = audio_segmentation_to_voice(
            result_diarize,
            TRANSLATE_AUDIO_TO,
            is_gui,
            tts_voice00,
        )

        # fix format and set folder output
        audio_files, speakers_list = accelerate_segments(
                result_diarize,
                1.0,
                valid_speakers,
            )

        # custom voice
        if custom_voices:
            prog_disp(
                "Applying customized voices...",
                0.80,
                is_gui,
                progress=progress,
            )
            self.vci(
                audio_files,
                speakers_list,
                overwrite=True,
                parallel_workers=custom_voices_workers,
            )
            self.vci.unload_models()

        prog_disp(
            "Creating final audio file...", 0.90, is_gui, progress=progress
        )
        remove_files(final_wav_file)
        create_translated_audio(
            result_diarize, audio_files, final_wav_file, True
        )

        output = media_out(
            result_file_path if is_string else document,
            TRANSLATE_AUDIO_TO,
            name_final_file,
            "mp3" if "mp3" in output_type else (
                "ogg" if "ogg" in output_type else "wav"
            ),
            file_obj=final_wav_file,
        )

        logger.info(f"Done: {output}")

        return output


title = "<center><strong><font size='7'>📽️ SoniTranslate 🈷️</font></strong></center>"


def create_gui(theme, logs_in_gui=False):
    with gr.Blocks(theme=theme) as app:
        gr.Markdown(title)
        gr.Markdown(lg_conf["description"])

        if os.environ.get("ZERO_GPU") == "TRUE":
            gr.Markdown(
                """
            
                <details>
                    <summary style="font-size: 1.5em;">⚠️ Important ⚠️</summary>
                    <ul>
                        <li>🚀 This demo uses a zero GPU setup only for the transcription and diarization process. Everything else runs on the CPU. It is recommended to use videos no longer than 15 minutes. ⏳</li>
                        <li>❗ If you see `queue` when using this, it means another user is currently using it, and you need to wait until they are finished.</li>
                        <li>🔒 Some functions are disabled, but if you duplicate this with a GPU and set the value in secrets "ZERO_GPU" to FALSE, you can use the app with full GPU acceleration. ⚡</li>
                    </ul>
                </details>
                """
            )

        with gr.Tab(lg_conf["tab_translate"]):
            with gr.Row():
                with gr.Column():
                    input_data_type = gr.Dropdown(
                        ["SUBMIT VIDEO", "URL", "Find Video Path"],
                        value="SUBMIT VIDEO",
                        label=lg_conf["video_source"],
                    )

                    def swap_visibility(data_type):
                        if data_type == "URL":
                            return (
                                gr.update(visible=False, value=None),
                                gr.update(visible=True, value=""),
                                gr.update(visible=False, value=""),
                            )
                        elif data_type == "SUBMIT VIDEO":
                            return (
                                gr.update(visible=True, value=None),
                                gr.update(visible=False, value=""),
                                gr.update(visible=False, value=""),
                            )
                        elif data_type == "Find Video Path":
                            return (
                                gr.update(visible=False, value=None),
                                gr.update(visible=False, value=""),
                                gr.update(visible=True, value=""),
                            )

                    video_input = gr.File(
                        label="VIDEO",
                        file_count="multiple",
                        type="filepath",
                    )
                    blink_input = gr.Textbox(
                        visible=False,
                        label=lg_conf["link_label"],
                        info=lg_conf["link_info"],
                        placeholder=lg_conf["link_ph"],
                    )
                    directory_input = gr.Textbox(
                        visible=False,
                        label=lg_conf["dir_label"],
                        info=lg_conf["dir_info"],
                        placeholder=lg_conf["dir_ph"],
                    )
                    input_data_type.change(
                        fn=swap_visibility,
                        inputs=input_data_type,
                        outputs=[video_input, blink_input, directory_input],
                    )

                    gr.HTML()

                    SOURCE_LANGUAGE = gr.Dropdown(
                        LANGUAGES_LIST,
                        value=LANGUAGES_LIST[0],
                        label=lg_conf["sl_label"],
                        info=lg_conf["sl_info"],
                    )
                    TRANSLATE_AUDIO_TO = gr.Dropdown(
                        LANGUAGES_LIST[1:],
                        value="English (en)",
                        label=lg_conf["tat_label"],
                        info=lg_conf["tat_info"],
                    )

                    gr.HTML("<hr></h2>")

                    gr.Markdown(lg_conf["num_speakers"])
                    MAX_TTS = 12
                    min_speakers = gr.Slider(
                        1,
                        MAX_TTS,
                        value=1,
                        label=lg_conf["min_sk"],
                        step=1,
                        visible=False,
                    )
                    max_speakers = gr.Slider(
                        1,
                        MAX_TTS,
                        value=1,
                        step=1,
                        label=lg_conf["max_sk"],
                    )
                    gr.Markdown(lg_conf["tts_select"])

                    def submit(value):
                        visibility_dict = {
                            f"tts_voice{i:02d}": gr.update(visible=i < value)
                            for i in range(MAX_TTS)
                        }
                        return [value for value in visibility_dict.values()]

                    tts_voice00 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="en-US-EmmaMultilingualNeural-Female",
                        label=lg_conf["sk1"],
                        visible=True,
                        interactive=True,
                    )
                    tts_voice01 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="en-US-AndrewMultilingualNeural-Male",
                        label=lg_conf["sk2"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice02 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="en-US-AvaMultilingualNeural-Female",
                        label=lg_conf["sk3"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice03 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="en-US-BrianMultilingualNeural-Male",
                        label=lg_conf["sk4"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice04 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="de-DE-SeraphinaMultilingualNeural-Female",
                        label=lg_conf["sk4"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice05 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="de-DE-FlorianMultilingualNeural-Male",
                        label=lg_conf["sk6"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice06 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="fr-FR-VivienneMultilingualNeural-Female",
                        label=lg_conf["sk7"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice07 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="fr-FR-RemyMultilingualNeural-Male",
                        label=lg_conf["sk8"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice08 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="en-US-EmmaMultilingualNeural-Female",
                        label=lg_conf["sk9"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice09 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="en-US-AndrewMultilingualNeural-Male",
                        label=lg_conf["sk10"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice10 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="en-US-EmmaMultilingualNeural-Female",
                        label=lg_conf["sk11"],
                        visible=False,
                        interactive=True,
                    )
                    tts_voice11 = gr.Dropdown(
                        SoniTr.tts_info.tts_list(),
                        value="en-US-AndrewMultilingualNeural-Male",
                        label=lg_conf["sk12"],
                        visible=False,
                        interactive=True,
                    )
                    max_speakers.change(
                        submit,
                        max_speakers,
                        [
                            tts_voice00,
                            tts_voice01,
                            tts_voice02,
                            tts_voice03,
                            tts_voice04,
                            tts_voice05,
                            tts_voice06,
                            tts_voice07,
                            tts_voice08,
                            tts_voice09,
                            tts_voice10,
                            tts_voice11,
                        ],
                    )

                    with gr.Column():
                        with gr.Accordion(
                            lg_conf["vc_title"],
                            open=False,
                        ):
                            gr.Markdown(lg_conf["vc_subtitle"])
                            voice_imitation_gui = gr.Checkbox(
                                False,
                                label=lg_conf["vc_active_label"],
                                info=lg_conf["vc_active_info"],
                            )
                            openvoice_models = ["openvoice", "openvoice_v2"]
                            voice_imitation_method_options = (
                                ["freevc"] + openvoice_models
                                if SoniTr.tts_info.xtts_enabled
                                else openvoice_models
                            )
                            voice_imitation_method_gui = gr.Dropdown(
                                voice_imitation_method_options,
                                value=voice_imitation_method_options[-1],
                                label=lg_conf["vc_method_label"],
                                info=lg_conf["vc_method_info"],
                            )
                            voice_imitation_max_segments_gui = gr.Slider(
                                label=lg_conf["vc_segments_label"],
                                info=lg_conf["vc_segments_info"],
                                value=3,
                                step=1,
                                minimum=1,
                                maximum=10,
                                visible=True,
                                interactive=True,
                            )
                            voice_imitation_vocals_dereverb_gui = gr.Checkbox(
                                False,
                                label=lg_conf["vc_dereverb_label"],
                                info=lg_conf["vc_dereverb_info"],
                            )
                            voice_imitation_remove_previous_gui = gr.Checkbox(
                                True,
                                label=lg_conf["vc_remove_label"],
                                info=lg_conf["vc_remove_info"],
                            )

                    if SoniTr.tts_info.xtts_enabled:
                        with gr.Column():
                            with gr.Accordion(
                                lg_conf["xtts_title"],
                                open=False,
                            ):
                                gr.Markdown(lg_conf["xtts_subtitle"])
                                wav_speaker_file = gr.File(
                                    label=lg_conf["xtts_file_label"]
                                )
                                wav_speaker_name = gr.Textbox(
                                    label=lg_conf["xtts_name_label"],
                                    value="",
                                    info=lg_conf["xtts_name_info"],
                                    placeholder="default_name",
                                    lines=1,
                                )
                                wav_speaker_start = gr.Number(
                                    label="Time audio start",
                                    value=0,
                                    visible=False,
                                )
                                wav_speaker_end = gr.Number(
                                    label="Time audio end",
                                    value=0,
                                    visible=False,
                                )
                                wav_speaker_dir = gr.Textbox(
                                    label="Directory save",
                                    value="_XTTS_",
                                    visible=False,
                                )
                                wav_speaker_dereverb = gr.Checkbox(
                                    True,
                                    label=lg_conf["xtts_dereverb_label"],
                                    info=lg_conf["xtts_dereverb_info"]
                                )
                                wav_speaker_output = gr.HTML()
                                create_xtts_wav = gr.Button(
                                    lg_conf["xtts_button"]
                                )
                                gr.Markdown(lg_conf["xtts_footer"])
                    else:
                        wav_speaker_dereverb = gr.Checkbox(
                            False,
                            label=lg_conf["xtts_dereverb_label"],
                            info=lg_conf["xtts_dereverb_info"],
                            visible=False
                        )

                    with gr.Column():
                        with gr.Accordion(
                            lg_conf["extra_setting"], open=False
                        ):
                            audio_accelerate = gr.Slider(
                                label=lg_conf["acc_max_label"],
                                value=1.9,
                                step=0.1,
                                minimum=1.0,
                                maximum=2.5,
                                visible=True,
                                interactive=True,
                                info=lg_conf["acc_max_info"],
                            )
                            acceleration_rate_regulation_gui = gr.Checkbox(
                                False,
                                label=lg_conf["acc_rate_label"],
                                info=lg_conf["acc_rate_info"],
                            )
                            avoid_overlap_gui = gr.Checkbox(
                                False,
                                label=lg_conf["or_label"],
                                info=lg_conf["or_info"],
                            )

                            gr.HTML("<hr></h2>")

                            audio_mix_options = [
                                "Mixing audio with sidechain compression",
                                "Adjusting volumes and mixing audio",
                            ]
                            AUDIO_MIX = gr.Dropdown(
                                audio_mix_options,
                                value=audio_mix_options[1],
                                label=lg_conf["aud_mix_label"],
                                info=lg_conf["aud_mix_info"],
                            )
                            volume_original_mix = gr.Slider(
                                label=lg_conf["vol_ori"],
                                info="for Adjusting volumes and mixing audio",
                                value=0.25,
                                step=0.05,
                                minimum=0.0,
                                maximum=2.50,
                                visible=True,
                                interactive=True,
                            )
                            volume_translated_mix = gr.Slider(
                                label=lg_conf["vol_tra"],
                                info="for Adjusting volumes and mixing audio",
                                value=1.80,
                                step=0.05,
                                minimum=0.0,
                                maximum=2.50,
                                visible=True,
                                interactive=True,
                            )
                            main_voiceless_track = gr.Checkbox(
                                label=lg_conf["voiceless_tk_label"],
                                info=lg_conf["voiceless_tk_info"],
                            )

                            gr.HTML("<hr></h2>")
                            sub_type_options = [
                                "disable",
                                "srt",
                                "vtt",
                                "ass",
                                "txt",
                                "tsv",
                                "json",
                                "aud",
                            ]

                            sub_type_output = gr.Dropdown(
                                sub_type_options,
                                value=sub_type_options[1],
                                label=lg_conf["sub_type"],
                            )
                            soft_subtitles_to_video_gui = gr.Checkbox(
                                label=lg_conf["soft_subs_label"],
                                info=lg_conf["soft_subs_info"],
                            )
                            burn_subtitles_to_video_gui = gr.Checkbox(
                                label=lg_conf["burn_subs_label"],
                                info=lg_conf["burn_subs_info"],
                            )

                            gr.HTML("<hr></h2>")
                            gr.Markdown(lg_conf["whisper_title"])
                            literalize_numbers_gui = gr.Checkbox(
                                True,
                                label=lg_conf["lnum_label"],
                                info=lg_conf["lnum_info"],
                            )
                            vocal_refinement_gui = gr.Checkbox(
                                False,
                                label=lg_conf["scle_label"],
                                info=lg_conf["scle_info"],
                            )
                            segment_duration_limit_gui = gr.Slider(
                                label=lg_conf["sd_limit_label"],
                                info=lg_conf["sd_limit_info"],
                                value=15,
                                step=1,
                                minimum=1,
                                maximum=30,
                            )
                            whisper_model_default = (
                                "large-v3"
                                if SoniTr.device == "cuda"
                                else "medium"
                            )

                            WHISPER_MODEL_SIZE = gr.Dropdown(
                                ASR_MODEL_OPTIONS + find_whisper_models(),
                                value=whisper_model_default,
                                label="Whisper ASR model",
                                info=lg_conf["asr_model_info"],
                                allow_custom_value=True,
                            )
                            com_t_opt, com_t_default = (
                                [COMPUTE_TYPE_GPU, "float16"]
                                if SoniTr.device == "cuda"
                                else [COMPUTE_TYPE_CPU, "float32"]
                            )
                            compute_type = gr.Dropdown(
                                com_t_opt,
                                value=com_t_default,
                                label=lg_conf["ctype_label"],
                                info=lg_conf["ctype_info"],
                            )
                            batch_size_value = 8 if os.environ.get("ZERO_GPU") != "TRUE" else 32
                            batch_size = gr.Slider(
                                minimum=1,
                                maximum=32,
                                value=batch_size_value,
                                label=lg_conf["batchz_label"],
                                info=lg_conf["batchz_info"],
                                step=1,
                            )
                            input_srt = gr.File(
                                label=lg_conf["srt_file_label"],
                                file_types=[".srt", ".ass", ".vtt"],
                                height=130,
                            )

                            gr.HTML("<hr></h2>")
                            text_segmentation_options = [
                                "sentence",
                                "word",
                                "character"
                            ]
                            text_segmentation_scale_gui = gr.Dropdown(
                                text_segmentation_options,
                                value=text_segmentation_options[0],
                                label=lg_conf["tsscale_label"],
                                info=lg_conf["tsscale_info"],
                            )
                            divide_text_segments_by_gui = gr.Textbox(
                                label=lg_conf["divide_text_label"],
                                value="",
                                info=lg_conf["divide_text_info"],
                            )

                            gr.HTML("<hr></h2>")
                            pyannote_models_list = list(
                                diarization_models.keys()
                            )
                            diarization_process_dropdown = gr.Dropdown(
                                pyannote_models_list,
                                value=pyannote_models_list[1],
                                label=lg_conf["diarization_label"],
                            )
                            translate_process_dropdown = gr.Dropdown(
                                TRANSLATION_PROCESS_OPTIONS,
                                value=TRANSLATION_PROCESS_OPTIONS[0],
                                label=lg_conf["tr_process_label"],
                            )

                            gr.HTML("<hr></h2>")
                            main_output_type = gr.Dropdown(
                                OUTPUT_TYPE_OPTIONS,
                                value=OUTPUT_TYPE_OPTIONS[0],
                                label=lg_conf["out_type_label"],
                            )
                            VIDEO_OUTPUT_NAME = gr.Textbox(
                                label=lg_conf["out_name_label"],
                                value="",
                                info=lg_conf["out_name_info"],
                            )
                            play_sound_gui = gr.Checkbox(
                                True,
                                label=lg_conf["task_sound_label"],
                                info=lg_conf["task_sound_info"],
                            )
                            enable_cache_gui = gr.Checkbox(
                                True,
                                label=lg_conf["cache_label"],
                                info=lg_conf["cache_info"],
                            )
                            PREVIEW = gr.Checkbox(
                                label="Preview", info=lg_conf["preview_info"]
                            )
                            is_gui_dummy_check = gr.Checkbox(
                                True, visible=False
                            )

                with gr.Column(variant="compact"):
                    edit_sub_check = gr.Checkbox(
                        label=lg_conf["edit_sub_label"],
                        info=lg_conf["edit_sub_info"],
                        interactive=(False if os.environ.get("IS_DEMO") == "TRUE" else True),
                    )
                    dummy_false_check = gr.Checkbox(
                        False,
                        visible=False,
                    )

                    def visible_component_subs(input_bool):
                        if input_bool:
                            return gr.update(visible=True), gr.update(
                                visible=True
                            )
                        else:
                            return gr.update(visible=False), gr.update(
                                visible=False
                            )

                    subs_button = gr.Button(
                        lg_conf["button_subs"],
                        variant="primary",
                        visible=False,
                    )
                    subs_edit_space = gr.Textbox(
                        visible=False,
                        lines=10,
                        label=lg_conf["editor_sub_label"],
                        info=lg_conf["editor_sub_info"],
                        placeholder=lg_conf["editor_sub_ph"],
                    )
                    edit_sub_check.change(
                        visible_component_subs,
                        [edit_sub_check],
                        [subs_button, subs_edit_space],
                    )

                    with gr.Row():
                        video_button = gr.Button(
                            lg_conf["button_translate"],
                            variant="primary",
                        )
                    with gr.Row():
                        video_output = gr.File(
                            label=lg_conf["output_result_label"],
                            file_count="multiple",
                            interactive=False,

                        )  # gr.Video()

                    gr.HTML("<hr></h2>")

                    if (
                        os.getenv("YOUR_HF_TOKEN") is None
                        or os.getenv("YOUR_HF_TOKEN") == ""
                    ):
                        HFKEY = gr.Textbox(
                            visible=True,
                            label="HF Token",
                            info=lg_conf["ht_token_info"],
                            placeholder=lg_conf["ht_token_ph"],
                        )
                    else:
                        HFKEY = gr.Textbox(
                            visible=False,
                            label="HF Token",
                            info=lg_conf["ht_token_info"],
                            placeholder=lg_conf["ht_token_ph"],
                        )

                    gr.Examples(
                        examples=[
                            [
                                ["./assets/Video_main.mp4"],
                                "",
                                "",
                                "",
                                False,
                                whisper_model_default,
                                batch_size_value,
                                com_t_default,
                                "Spanish (es)",
                                "English (en)",
                                1,
                                2,
                                "en-US-EmmaMultilingualNeural-Female",
                                "en-US-AndrewMultilingualNeural-Male",
                            ],
                        ],  # no update
                        fn=SoniTr.batch_multilingual_media_conversion,
                        inputs=[
                            video_input,
                            blink_input,
                            directory_input,
                            HFKEY,
                            PREVIEW,
                            WHISPER_MODEL_SIZE,
                            batch_size,
                            compute_type,
                            SOURCE_LANGUAGE,
                            TRANSLATE_AUDIO_TO,
                            min_speakers,
                            max_speakers,
                            tts_voice00,
                            tts_voice01,
                        ],
                        outputs=[video_output],
                        cache_examples=False,
                    )

        with gr.Tab(lg_conf["tab_docs"]):
            with gr.Column():
                with gr.Accordion("Docs", open=True):
                    with gr.Column(variant="compact"):
                        with gr.Column():
                            input_doc_type = gr.Dropdown(
                                [
                                    "WRITE TEXT",
                                    "SUBMIT DOCUMENT",
                                    "Find Document Path",
                                ],
                                value="SUBMIT DOCUMENT",
                                label=lg_conf["docs_input_label"],
                                info=lg_conf["docs_input_info"],
                            )

                            def swap_visibility(data_type):
                                if data_type == "WRITE TEXT":
                                    return (
                                        gr.update(visible=True, value=""),
                                        gr.update(visible=False, value=None),
                                        gr.update(visible=False, value=""),
                                    )
                                elif data_type == "SUBMIT DOCUMENT":
                                    return (
                                        gr.update(visible=False, value=""),
                                        gr.update(visible=True, value=None),
                                        gr.update(visible=False, value=""),
                                    )
                                elif data_type == "Find Document Path":
                                    return (
                                        gr.update(visible=False, value=""),
                                        gr.update(visible=False, value=None),
                                        gr.update(visible=True, value=""),
                                    )

                            text_docs = gr.Textbox(
                                label="Text",
                                value="This is an example",
                                info="Write a text",
                                placeholder="...",
                                lines=5,
                                visible=False,
                            )
                            input_docs = gr.File(
                                label="Document", visible=True
                            )
                            directory_input_docs = gr.Textbox(
                                visible=False,
                                label="Document Path",
                                info="Example: /home/my_doc.pdf",
                                placeholder="Path goes here...",
                            )
                            input_doc_type.change(
                                fn=swap_visibility,
                                inputs=input_doc_type,
                                outputs=[
                                    text_docs,
                                    input_docs,
                                    directory_input_docs,
                                ],
                            )

                            gr.HTML()

                            tts_documents = gr.Dropdown(
                                list(
                                    filter(
                                        lambda x: x != "_XTTS_/AUTOMATIC.wav",
                                        SoniTr.tts_info.tts_list(),
                                    )
                                ),
                                value="en-US-EmmaMultilingualNeural-Female",
                                label="TTS",
                                visible=True,
                                interactive=True,
                            )

                            gr.HTML()

                            docs_SOURCE_LANGUAGE = gr.Dropdown(
                                LANGUAGES_LIST[1:],
                                value="English (en)",
                                label=lg_conf["sl_label"],
                                info=lg_conf["docs_source_info"],
                            )
                            docs_TRANSLATE_TO = gr.Dropdown(
                                LANGUAGES_LIST[1:],
                                value="English (en)",
                                label=lg_conf["tat_label"],
                                info=lg_conf["tat_info"],
                            )

                            with gr.Column():
                                with gr.Accordion(
                                    lg_conf["extra_setting"], open=False
                                ):
                                    docs_translate_process_dropdown = gr.Dropdown(
                                        DOCS_TRANSLATION_PROCESS_OPTIONS,
                                        value=DOCS_TRANSLATION_PROCESS_OPTIONS[
                                            0
                                        ],
                                        label="Translation process",
                                    )

                                    gr.HTML("<hr></h2>")

                                    docs_output_type = gr.Dropdown(
                                        DOCS_OUTPUT_TYPE_OPTIONS,
                                        value=DOCS_OUTPUT_TYPE_OPTIONS[2],
                                        label="Output type",
                                    )
                                    docs_OUTPUT_NAME = gr.Textbox(
                                        label="Final file name",
                                        value="",
                                        info=lg_conf["out_name_info"],
                                    )
                                    docs_chunk_size = gr.Number(
                                        label=lg_conf["chunk_size_label"],
                                        value=0,
                                        visible=True,
                                        interactive=True,
                                        info=lg_conf["chunk_size_info"],
                                    )
                                    gr.HTML("<hr></h2>")
                                    start_page_gui = gr.Number(
                                        step=1,
                                        value=1,
                                        minimum=1,
                                        maximum=99999,
                                        label="Start page",
                                    )
                                    end_page_gui = gr.Number(
                                        step=1,
                                        value=99999,
                                        minimum=1,
                                        maximum=99999,
                                        label="End page",
                                    )
                                    gr.HTML("<hr>Videobook config</h2>")
                                    videobook_width_gui = gr.Number(
                                        step=1,
                                        value=1280,
                                        minimum=100,
                                        maximum=4096,
                                        label="Width",
                                    )
                                    videobook_height_gui = gr.Number(
                                        step=1,
                                        value=720,
                                        minimum=100,
                                        maximum=4096,
                                        label="Height",
                                    )
                                    videobook_bcolor_gui = gr.Dropdown(
                                        BORDER_COLORS,
                                        value=BORDER_COLORS[0],
                                        label="Border color",
                                    )
                                    docs_dummy_check = gr.Checkbox(
                                        True, visible=False
                                    )

                            with gr.Row():
                                docs_button = gr.Button(
                                    lg_conf["docs_button"],
                                    variant="primary",
                                )
                            with gr.Row():
                                docs_output = gr.File(
                                    label="Result",
                                    interactive=False,
                                )

        with gr.Tab("Custom voice R.V.C. (Optional)"):

            with gr.Column():
                with gr.Accordion("Get the R.V.C. Models", open=True):
                    url_links = gr.Textbox(
                        label="URLs",
                        value="",
                        info=lg_conf["cv_url_info"],
                        placeholder="urls here...",
                        lines=1,
                    )
                    download_finish = gr.HTML()
                    download_button = gr.Button("DOWNLOAD MODELS")

                    def update_models():
                        models_path, index_path = upload_model_list()

                        dict_models = {
                            f"fmodel{i:02d}": gr.update(
                                choices=models_path
                            )
                            for i in range(MAX_TTS+1)
                        }
                        dict_index = {
                            f"findex{i:02d}": gr.update(
                                choices=index_path, value=None
                            )
                            for i in range(MAX_TTS+1)
                        }
                        dict_changes = {**dict_models, **dict_index}
                        return [value for value in dict_changes.values()]

            with gr.Column():
                with gr.Accordion(lg_conf["replace_title"], open=False):
                    with gr.Column(variant="compact"):
                        with gr.Column():
                            gr.Markdown(lg_conf["sec1_title"])
                            enable_custom_voice = gr.Checkbox(
                                False,
                                label="ENABLE",
                                info=lg_conf["enable_replace"]
                            )
                            workers_custom_voice = gr.Number(
                                step=1,
                                value=1,
                                minimum=1,
                                maximum=50,
                                label="workers",
                                visible=False,
                            )

                            gr.Markdown(lg_conf["sec2_title"])
                            gr.Markdown(lg_conf["sec2_subtitle"])

                            PITCH_ALGO_OPT = [
                                "pm",
                                "harvest",
                                "crepe",
                                "rmvpe",
                                "rmvpe+",
                            ]

                            def model_conf():
                                return gr.Dropdown(
                                    models_path,
                                    # value="",
                                    label="Model",
                                    visible=True,
                                    interactive=True,
                                )

                            def pitch_algo_conf():
                                return gr.Dropdown(
                                    PITCH_ALGO_OPT,
                                    value=PITCH_ALGO_OPT[3],
                                    label="Pitch algorithm",
                                    visible=True,
                                    interactive=True,
                                )

                            def pitch_lvl_conf():
                                return gr.Slider(
                                    label="Pitch level",
                                    minimum=-24,
                                    maximum=24,
                                    step=1,
                                    value=0,
                                    visible=True,
                                    interactive=True,
                                )

                            def index_conf():
                                return gr.Dropdown(
                                    index_path,
                                    value=None,
                                    label="Index",
                                    visible=True,
                                    interactive=True,
                                )

                            def index_inf_conf():
                                return gr.Slider(
                                    minimum=0,
                                    maximum=1,
                                    label="Index influence",
                                    value=0.75,
                                )

                            def respiration_filter_conf():
                                return gr.Slider(
                                    minimum=0,
                                    maximum=7,
                                    label="Respiration median filtering",
                                    value=3,
                                    step=1,
                                    interactive=True,
                                )

                            def envelope_ratio_conf():
                                return gr.Slider(
                                    minimum=0,
                                    maximum=1,
                                    label="Envelope ratio",
                                    value=0.25,
                                    interactive=True,
                                )

                            def consonant_protec_conf():
                                return gr.Slider(
                                    minimum=0,
                                    maximum=0.5,
                                    label="Consonant breath protection",
                                    value=0.5,
                                    interactive=True,
                                )

                            def button_conf(tts_name):
                                return gr.Button(
                                    lg_conf["cv_button_apply"]+" "+tts_name,
                                    variant="primary",
                                )

                            TTS_TABS = [
                                'TTS Speaker {:02d}'.format(i) for i in range(1, MAX_TTS+1)
                            ]

                            CV_SUBTITLES = [
                                lg_conf["cv_tts1"],
                                lg_conf["cv_tts2"],
                                lg_conf["cv_tts3"],
                                lg_conf["cv_tts4"],
                                lg_conf["cv_tts5"],
                                lg_conf["cv_tts6"],
                                lg_conf["cv_tts7"],
                                lg_conf["cv_tts8"],
                                lg_conf["cv_tts9"],
                                lg_conf["cv_tts10"],
                                lg_conf["cv_tts11"],
                                lg_conf["cv_tts12"],
                            ]

                            configs_storage = []

                            for i in range(MAX_TTS):  # Loop from 00 to 11
                                with gr.Accordion(CV_SUBTITLES[i], open=False):
                                    gr.Markdown(TTS_TABS[i])
                                    with gr.Column():
                                        tag_gui = gr.Textbox(
                                            value=TTS_TABS[i], visible=False
                                        )
                                        model_gui = model_conf()
                                        pitch_algo_gui = pitch_algo_conf()
                                        pitch_lvl_gui = pitch_lvl_conf()
                                        index_gui = index_conf()
                                        index_inf_gui = index_inf_conf()
                                        rmf_gui = respiration_filter_conf()
                                        er_gui = envelope_ratio_conf()
                                        cbp_gui = consonant_protec_conf()

                                        with gr.Row(variant="compact"):
                                            button_config = button_conf(
                                                TTS_TABS[i]
                                            )

                                            confirm_conf = gr.HTML()

                                        button_config.click(
                                            SoniTr.vci.apply_conf,
                                            inputs=[
                                                tag_gui,
                                                model_gui,
                                                pitch_algo_gui,
                                                pitch_lvl_gui,
                                                index_gui,
                                                index_inf_gui,
                                                rmf_gui,
                                                er_gui,
                                                cbp_gui,
                                            ],
                                            outputs=[confirm_conf],
                                        )

                                        configs_storage.append({
                                            "tag": tag_gui,
                                            "model": model_gui,
                                            "index": index_gui,
                                        })

                with gr.Column():
                    with gr.Accordion("Test R.V.C.", open=False):
                        with gr.Row(variant="compact"):
                            text_test = gr.Textbox(
                                label="Text",
                                value="This is an example",
                                info="write a text",
                                placeholder="...",
                                lines=5,
                            )
                            with gr.Column():
                                tts_test = gr.Dropdown(
                                    sorted(SoniTr.tts_info.list_edge),
                                    value="en-GB-ThomasNeural-Male",
                                    label="TTS",
                                    visible=True,
                                    interactive=True,
                                )
                                model_test = model_conf()
                                index_test = index_conf()
                                pitch_test = pitch_lvl_conf()
                                pitch_alg_test = pitch_algo_conf()
                        with gr.Row(variant="compact"):
                            button_test = gr.Button("Test audio")

                        with gr.Column():
                            with gr.Row():
                                original_ttsvoice = gr.Audio()
                                ttsvoice = gr.Audio()

                            button_test.click(
                                SoniTr.vci.make_test,
                                inputs=[
                                    text_test,
                                    tts_test,
                                    model_test,
                                    index_test,
                                    pitch_test,
                                    pitch_alg_test,
                                ],
                                outputs=[ttsvoice, original_ttsvoice],
                            )

                    download_button.click(
                        download_list,
                        [url_links],
                        [download_finish],
                        queue=False
                    ).then(
                        update_models,
                        [],
                        [
                            elem["model"] for elem in configs_storage
                        ] + [model_test] + [
                            elem["index"] for elem in configs_storage
                        ] + [index_test],
                    )

        with gr.Tab(lg_conf["tab_help"]):
            gr.Markdown(lg_conf["tutorial"])
            gr.Markdown(news)

            def play_sound_alert(play_sound):

                if not play_sound:
                    return None

                # silent_sound = "assets/empty_audio.mp3"
                sound_alert = "assets/sound_alert.mp3"

                time.sleep(0.25)
                # yield silent_sound
                yield None

                time.sleep(0.25)
                yield sound_alert

            sound_alert_notification = gr.Audio(
                value=None,
                type="filepath",
                format="mp3",
                autoplay=True,
                visible=False,
            )

        if logs_in_gui:
            logger.info("Logs in gui need public url")

            class Logger:
                def __init__(self, filename):
                    self.terminal = sys.stdout
                    self.log = open(filename, "w")

                def write(self, message):
                    self.terminal.write(message)
                    self.log.write(message)

                def flush(self):
                    self.terminal.flush()
                    self.log.flush()

                def isatty(self):
                    return False

            sys.stdout = Logger("output.log")

            def read_logs():
                sys.stdout.flush()
                with open("output.log", "r") as f:
                    return f.read()

            with gr.Accordion("Logs", open=False):
                logs = gr.Textbox(label=">>>")
                app.load(read_logs, None, logs, every=1)

        if SoniTr.tts_info.xtts_enabled:
            # Update tts list
            def update_tts_list():
                update_dict = {
                    f"tts_voice{i:02d}": gr.update(choices=SoniTr.tts_info.tts_list())
                    for i in range(MAX_TTS)
                }
                update_dict["tts_documents"] = gr.update(
                    choices=list(
                        filter(
                            lambda x: x != "_XTTS_/AUTOMATIC.wav",
                            SoniTr.tts_info.tts_list(),
                        )
                    )
                )
                return [value for value in update_dict.values()]

            create_xtts_wav.click(
                create_wav_file_vc,
                inputs=[
                    wav_speaker_name,
                    wav_speaker_file,
                    wav_speaker_start,
                    wav_speaker_end,
                    wav_speaker_dir,
                    wav_speaker_dereverb,
                ],
                outputs=[wav_speaker_output],
            ).then(
                update_tts_list,
                None,
                [
                    tts_voice00,
                    tts_voice01,
                    tts_voice02,
                    tts_voice03,
                    tts_voice04,
                    tts_voice05,
                    tts_voice06,
                    tts_voice07,
                    tts_voice08,
                    tts_voice09,
                    tts_voice10,
                    tts_voice11,
                    tts_documents,
                ],
            )

        # Run translate text
        subs_button.click(
            SoniTr.batch_multilingual_media_conversion,
            inputs=[
                video_input,
                blink_input,
                directory_input,
                HFKEY,
                PREVIEW,
                WHISPER_MODEL_SIZE,
                batch_size,
                compute_type,
                SOURCE_LANGUAGE,
                TRANSLATE_AUDIO_TO,
                min_speakers,
                max_speakers,
                tts_voice00,
                tts_voice01,
                tts_voice02,
                tts_voice03,
                tts_voice04,
                tts_voice05,
                tts_voice06,
                tts_voice07,
                tts_voice08,
                tts_voice09,
                tts_voice10,
                tts_voice11,
                VIDEO_OUTPUT_NAME,
                AUDIO_MIX,
                audio_accelerate,
                acceleration_rate_regulation_gui,
                volume_original_mix,
                volume_translated_mix,
                sub_type_output,
                edit_sub_check,  # TRUE BY DEFAULT
                dummy_false_check,  # dummy false
                subs_edit_space,
                avoid_overlap_gui,
                vocal_refinement_gui,
                literalize_numbers_gui,
                segment_duration_limit_gui,
                diarization_process_dropdown,
                translate_process_dropdown,
                input_srt,
                main_output_type,
                main_voiceless_track,
                voice_imitation_gui,
                voice_imitation_max_segments_gui,
                voice_imitation_vocals_dereverb_gui,
                voice_imitation_remove_previous_gui,
                voice_imitation_method_gui,
                wav_speaker_dereverb,
                text_segmentation_scale_gui,
                divide_text_segments_by_gui,
                soft_subtitles_to_video_gui,
                burn_subtitles_to_video_gui,
                enable_cache_gui,
                enable_custom_voice,
                workers_custom_voice,
                is_gui_dummy_check,
            ],
            outputs=subs_edit_space,
        ).then(
            play_sound_alert, [play_sound_gui], [sound_alert_notification]
        )

        # Run translate tts and complete
        video_button.click(
            SoniTr.batch_multilingual_media_conversion,
            inputs=[
                video_input,
                blink_input,
                directory_input,
                HFKEY,
                PREVIEW,
                WHISPER_MODEL_SIZE,
                batch_size,
                compute_type,
                SOURCE_LANGUAGE,
                TRANSLATE_AUDIO_TO,
                min_speakers,
                max_speakers,
                tts_voice00,
                tts_voice01,
                tts_voice02,
                tts_voice03,
                tts_voice04,
                tts_voice05,
                tts_voice06,
                tts_voice07,
                tts_voice08,
                tts_voice09,
                tts_voice10,
                tts_voice11,
                VIDEO_OUTPUT_NAME,
                AUDIO_MIX,
                audio_accelerate,
                acceleration_rate_regulation_gui,
                volume_original_mix,
                volume_translated_mix,
                sub_type_output,
                dummy_false_check,
                edit_sub_check,
                subs_edit_space,
                avoid_overlap_gui,
                vocal_refinement_gui,
                literalize_numbers_gui,
                segment_duration_limit_gui,
                diarization_process_dropdown,
                translate_process_dropdown,
                input_srt,
                main_output_type,
                main_voiceless_track,
                voice_imitation_gui,
                voice_imitation_max_segments_gui,
                voice_imitation_vocals_dereverb_gui,
                voice_imitation_remove_previous_gui,
                voice_imitation_method_gui,
                wav_speaker_dereverb,
                text_segmentation_scale_gui,
                divide_text_segments_by_gui,
                soft_subtitles_to_video_gui,
                burn_subtitles_to_video_gui,
                enable_cache_gui,
                enable_custom_voice,
                workers_custom_voice,
                is_gui_dummy_check,
            ],
            outputs=video_output,
            trigger_mode="multiple",
        ).then(
            play_sound_alert, [play_sound_gui], [sound_alert_notification]
        )

        # Run docs process
        docs_button.click(
            SoniTr.multilingual_docs_conversion,
            inputs=[
                text_docs,
                input_docs,
                directory_input_docs,
                docs_SOURCE_LANGUAGE,
                docs_TRANSLATE_TO,
                tts_documents,
                docs_OUTPUT_NAME,
                docs_translate_process_dropdown,
                docs_output_type,
                docs_chunk_size,
                enable_custom_voice,
                workers_custom_voice,
                start_page_gui,
                end_page_gui,
                videobook_width_gui,
                videobook_height_gui,
                videobook_bcolor_gui,
                docs_dummy_check,
            ],
            outputs=docs_output,
            trigger_mode="multiple",
        ).then(
            play_sound_alert, [play_sound_gui], [sound_alert_notification]
        )

    return app


def get_language_config(language_data, language=None, base_key="english"):
    base_lang = language_data.get(base_key)

    if language not in language_data:
        logger.error(
            f"Language {language} not found, defaulting to {base_key}"
        )
        return base_lang

    lg_conf = language_data.get(language, {})
    lg_conf.update((k, v) for k, v in base_lang.items() if k not in lg_conf)

    return lg_conf


def create_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--theme",
        type=str,
        default="Taithrah/Minimal",
        help=(
            "Specify the theme; find themes in "
            "https://huggingface.co/spaces/gradio/theme-gallery;"
            " Example: --theme aliabid94/new-theme"
        ),
    )
    parser.add_argument(
        "--public_url",
        action="store_true",
        default=False,
        help="Enable public link",
    )
    parser.add_argument(
        "--logs_in_gui",
        action="store_true",
        default=False,
        help="Displays the operations performed in Logs",
    )
    parser.add_argument(
        "--verbosity_level",
        type=str,
        default="info",
        help=(
            "Set logger verbosity level: "
            "debug, info, warning, error, or critical"
        ),
    )
    parser.add_argument(
        "--language",
        type=str,
        default="english",
        help=" Select the language of the interface: english, spanish",
    )
    parser.add_argument(
        "--cpu_mode",
        action="store_true",
        default=False,
        help="Enable CPU mode to run the program without utilizing GPU acceleration.",
    )
    return parser


if __name__ == "__main__":

    parser = create_parser()

    args = parser.parse_args()
    # Simulating command-line arguments
    # args_list = "--theme aliabid94/new-theme --public_url".split()
    # args = parser.parse_args(args_list)

    set_logging_level(args.verbosity_level)

    for id_model in UVR_MODELS:
        download_manager(
            os.path.join(MDX_DOWNLOAD_LINK, id_model), mdxnet_models_dir
        )

    models_path, index_path = upload_model_list()

    SoniTr = SoniTranslate(cpu_mode=args.cpu_mode if os.environ.get("ZERO_GPU") != "TRUE" else "cpu")

    lg_conf = get_language_config(language_data, language=args.language)

    app = create_gui(args.theme, logs_in_gui=args.logs_in_gui)

    app.queue()

    app.launch(
        max_threads=1,
        share=args.public_url,
        show_error=True,
        quiet=False,
        debug=(True if logger.isEnabledFor(logging.DEBUG) else False),
    )