diff --git "a/app_rvc.py" "b/app_rvc.py"
--- "a/app_rvc.py"
+++ "b/app_rvc.py"
@@ -1,2924 +1,2864 @@
-import gradio as gr
-import os
-os.system("pip install -q piper-tts==1.2.0")
-os.system("pip install -q -r requirements_xtts.txt")
-os.system("pip install -q TTS==0.21.1 --no-deps")
-import spaces
-import torch
-if os.environ.get("ZERO_GPU") != "TRUE" and torch.cuda.is_available():
- # onnxruntime GPU
- os.system("pip install ort-nightly-gpu --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ort-cuda-12-nightly/pypi/simple/")
-import librosa
-from soni_translate.logging_setup import (
- logger,
- set_logging_level,
- configure_logging_libs,
-); configure_logging_libs() # noqa
-import whisperx
-from soni_translate.audio_segments import create_translated_audio
-from soni_translate.text_to_speech import (
- audio_segmentation_to_voice,
- edge_tts_voices_list,
- coqui_xtts_voices_list,
- piper_tts_voices_list,
- create_wav_file_vc,
- accelerate_segments,
-)
-from soni_translate.translate_segments import (
- translate_text,
- TRANSLATION_PROCESS_OPTIONS,
- DOCS_TRANSLATION_PROCESS_OPTIONS
-)
-from soni_translate.preprocessor import (
- audio_video_preprocessor,
- audio_preprocessor,
-)
-from soni_translate.postprocessor import (
- OUTPUT_TYPE_OPTIONS,
- DOCS_OUTPUT_TYPE_OPTIONS,
- sound_separate,
- get_no_ext_filename,
- media_out,
- get_subtitle_speaker,
-)
-from soni_translate.language_configuration import (
- LANGUAGES,
- UNIDIRECTIONAL_L_LIST,
- LANGUAGES_LIST,
- BARK_VOICES_LIST,
- VITS_VOICES_LIST,
- OPENAI_TTS_MODELS,
-)
-from soni_translate.utils import (
- remove_files,
- download_list,
- upload_model_list,
- download_manager,
- run_command,
- is_audio_file,
- is_subtitle_file,
- copy_files,
- get_valid_files,
- get_link_list,
- remove_directory_contents,
-)
-from soni_translate.mdx_net import (
- UVR_MODELS,
- MDX_DOWNLOAD_LINK,
- mdxnet_models_dir,
-)
-from soni_translate.speech_segmentation import (
- ASR_MODEL_OPTIONS,
- COMPUTE_TYPE_GPU,
- COMPUTE_TYPE_CPU,
- find_whisper_models,
- transcribe_speech,
- align_speech,
- diarize_speech,
- diarization_models,
-)
-from soni_translate.text_multiformat_processor import (
- BORDER_COLORS,
- srt_file_to_segments,
- document_preprocessor,
- determine_chunk_size,
- plain_text_to_segments,
- segments_to_plain_text,
- process_subtitles,
- linguistic_level_segments,
- break_aling_segments,
- doc_to_txtximg_pages,
- page_data_to_segments,
- update_page_data,
- fix_timestamps_docs,
- create_video_from_images,
- merge_video_and_audio,
-)
-from soni_translate.languages_gui import language_data, news
-import copy
-import logging
-import json
-from pydub import AudioSegment
-from voice_main import ClassVoices
-import argparse
-import time
-import hashlib
-import sys
-
-directories = [
- "downloads",
- "logs",
- "weights",
- "clean_song_output",
- "_XTTS_",
- f"audio2{os.sep}audio",
- "audio",
- "outputs",
-]
-for directory in directories:
-    os.makedirs(directory, exist_ok=True)
-
-
-class TTS_Info:
- def __init__(self, piper_enabled, xtts_enabled):
- self.list_edge = edge_tts_voices_list()
- self.list_bark = list(BARK_VOICES_LIST.keys())
- self.list_vits = list(VITS_VOICES_LIST.keys())
- self.list_openai_tts = OPENAI_TTS_MODELS
- self.piper_enabled = piper_enabled
- self.list_vits_onnx = (
- piper_tts_voices_list() if self.piper_enabled else []
- )
- self.xtts_enabled = xtts_enabled
-
- def tts_list(self):
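-        # XTTS voices are listed first; all other engines are sorted together.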
- self.list_coqui_xtts = (
- coqui_xtts_voices_list() if self.xtts_enabled else []
- )
- list_tts = self.list_coqui_xtts + sorted(
- self.list_edge
- + (self.list_bark if os.environ.get("ZERO_GPU") != "TRUE" else [])
- + self.list_vits
- + self.list_openai_tts
- + self.list_vits_onnx
- )
- return list_tts
-
-
-def prog_disp(msg, percent, is_gui, progress=None):
- logger.info(msg)
- if is_gui:
- progress(percent, desc=msg)
-
-
-def warn_disp(wrn_lang, is_gui):
- logger.warning(wrn_lang)
- if is_gui:
- gr.Warning(wrn_lang)
-
-
-class SoniTrCache:
- def __init__(self):
- self.cache = {
- 'media': [[]],
- 'refine_vocals': [],
- 'transcript_align': [],
- 'break_align': [],
- 'diarize': [],
- 'translate': [],
- 'subs_and_edit': [],
- 'tts': [],
- 'acc_and_vc': [],
- 'mix_aud': [],
- 'output': []
- }
-
- self.cache_data = {
- 'media': [],
- 'refine_vocals': [],
- 'transcript_align': [],
- 'break_align': [],
- 'diarize': [],
- 'translate': [],
- 'subs_and_edit': [],
- 'tts': [],
- 'acc_and_vc': [],
- 'mix_aud': [],
- 'output': []
- }
-
- self.cache_keys = list(self.cache.keys())
- self.first_task = self.cache_keys[0]
- self.last_task = self.cache_keys[-1]
-
- self.pre_step = None
- self.pre_params = []
-
- def set_variable(self, variable_name, value):
- setattr(self, variable_name, value)
-
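-    # A step "hits" the cache when its parameter list matches the previous
-    # run; on a miss, the step and every later step are invalidated.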
- def task_in_cache(self, step: str, params: list, previous_step_data: dict):
-
- self.pre_step_cache = None
-
- if step == self.first_task:
- self.pre_step = None
-
- if self.pre_step:
- self.cache[self.pre_step] = self.pre_params
-
- # Fill data in cache
- self.cache_data[self.pre_step] = copy.deepcopy(previous_step_data)
-
- self.pre_params = params
- # logger.debug(f"Step: {str(step)}, Cache params: {str(self.cache)}")
- if params == self.cache[step]:
- logger.debug(f"In cache: {str(step)}")
-
- # Set the var needed for next step
- # Recovery from cache_data the current step
- for key, value in self.cache_data[step].items():
- self.set_variable(key, copy.deepcopy(value))
- logger.debug(
-                    f"Cache load: {str(key)}"
- )
-
- self.pre_step = step
- return True
-
- else:
- logger.debug(f"Flush next and caching {str(step)}")
- selected_index = self.cache_keys.index(step)
-
- for idx, key in enumerate(self.cache.keys()):
- if idx >= selected_index:
- self.cache[key] = []
- self.cache_data[key] = {}
-
- # The last is now previous
- self.pre_step = step
- return False
-
- def clear_cache(self, media, force=False):
-
- self.cache["media"] = (
- self.cache["media"] if len(self.cache["media"]) else [[]]
- )
-
- if media != self.cache["media"][0] or force:
-
- # Clear cache
- self.cache = {key: [] for key in self.cache}
- self.cache["media"] = [[]]
-
- logger.info("Cache flushed")
-
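-# Usage sketch (illustrative only; mirrors the real call sites below):
-#
-#   cache = SoniTrCache()
-#   if not cache.task_in_cache("media", [media_hash, preview], {}):
-#       ...  # parameters changed: recompute this step and the later ones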
-
-def get_hash(filepath):
- with open(filepath, 'rb') as f:
- file_hash = hashlib.blake2b()
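-        # Read in 8 KiB chunks so large media files are hashed without
-        # loading them fully into memory.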
- while chunk := f.read(8192):
- file_hash.update(chunk)
-
- return file_hash.hexdigest()[:18]
-
-
-def check_openai_api_key():
- if not os.environ.get("OPENAI_API_KEY"):
- raise ValueError(
- "To use GPT for translation, please set up your OpenAI API key "
- "as an environment variable in Linux as follows: "
- "export OPENAI_API_KEY='your-api-key-here'. Or change the "
- "translation process in Advanced settings."
- )
-
-
-class SoniTranslate(SoniTrCache):
- def __init__(self, cpu_mode=False):
- super().__init__()
- if cpu_mode:
- os.environ["SONITR_DEVICE"] = "cpu"
- else:
- os.environ["SONITR_DEVICE"] = (
- "cuda" if torch.cuda.is_available() else "cpu"
- )
-
- self.device = os.environ.get("SONITR_DEVICE")
- self.device = self.device if os.environ.get("ZERO_GPU") != "TRUE" else "cuda"
- self.result_diarize = None
- self.align_language = None
- self.result_source_lang = None
- self.edit_subs_complete = False
- self.voiceless_id = None
- self.burn_subs_id = None
-
- self.vci = ClassVoices(only_cpu=cpu_mode)
-
- self.tts_voices = self.get_tts_voice_list()
-
- logger.info(f"Working in: {self.device}")
-
- def get_tts_voice_list(self):
- try:
- from piper import PiperVoice # noqa
-
- piper_enabled = True
- logger.info("PIPER TTS enabled")
- except Exception as error:
- logger.debug(str(error))
- piper_enabled = False
- logger.info("PIPER TTS disabled")
- try:
- from TTS.api import TTS # noqa
-
- xtts_enabled = True
- logger.info("Coqui XTTS enabled")
- logger.info(
- "In this app, by using Coqui TTS (text-to-speech), you "
- "acknowledge and agree to the license.\n"
- "You confirm that you have read, understood, and agreed "
- "to the Terms and Conditions specified at the following "
- "link:\nhttps://coqui.ai/cpml.txt."
- )
- os.environ["COQUI_TOS_AGREED"] = "1"
- except Exception as error:
- logger.debug(str(error))
- xtts_enabled = False
- logger.info("Coqui XTTS disabled")
-
- self.tts_info = TTS_Info(piper_enabled, xtts_enabled)
-
- return self.tts_info.tts_list()
-
-    def batch_multilingual_media_conversion(self, *args):
-        # logger.debug(str(args))
-
-        media_file_arg = args[0] if args[0] is not None else []
-
-        link_media_arg = args[1]
-        link_media_arg = [x.strip() for x in link_media_arg.split(',')]
-        link_media_arg = get_link_list(link_media_arg)
-
-        path_arg = args[2]
-        path_arg = [x.strip() for x in path_arg.split(',')]
-        path_arg = get_valid_files(path_arg)
-
-        edit_text_arg = args[31]
-        get_text_arg = args[32]
-
-        is_gui_arg = args[-1]
-
-        args = args[3:]
-
-        media_batch = media_file_arg + link_media_arg + path_arg
-        media_batch = list(filter(lambda x: x != "", media_batch))
-        media_batch = media_batch if media_batch else [None]
-        logger.debug(str(media_batch))
-
-        remove_directory_contents("outputs")
-
-        if edit_text_arg or get_text_arg:
-            return self.multilingual_media_conversion(
-                media_batch[0], "", "", *args
-            )
-
-        if "SET_LIMIT" == os.getenv("DEMO") or "TRUE" == os.getenv("ZERO_GPU"):
-            media_batch = [media_batch[0]]
-
-        result = []
-        for media in media_batch:
-            # Run the full conversion pipeline for this media item
-            output_file = self.multilingual_media_conversion(
-                media, "", "", *args
-            )
-
-            if isinstance(output_file, str):
-                output_file = [output_file]
-            result.extend(output_file)
-
-            if is_gui_arg and len(media_batch) > 1:
-                gr.Info(f"Done: {os.path.basename(output_file[0])}")
-
-        return result
-
- def multilingual_media_conversion(
- self,
- media_file=None,
- link_media="",
- directory_input="",
- YOUR_HF_TOKEN="",
- preview=False,
- transcriber_model="large-v3",
- batch_size=4,
- compute_type="auto",
- origin_language="Automatic detection",
- target_language="English (en)",
- min_speakers=1,
- max_speakers=1,
- tts_voice00="en-US-EmmaMultilingualNeural-Female",
- tts_voice01="en-US-AndrewMultilingualNeural-Male",
- tts_voice02="en-US-AvaMultilingualNeural-Female",
- tts_voice03="en-US-BrianMultilingualNeural-Male",
- tts_voice04="de-DE-SeraphinaMultilingualNeural-Female",
- tts_voice05="de-DE-FlorianMultilingualNeural-Male",
- tts_voice06="fr-FR-VivienneMultilingualNeural-Female",
- tts_voice07="fr-FR-RemyMultilingualNeural-Male",
- tts_voice08="en-US-EmmaMultilingualNeural-Female",
- tts_voice09="en-US-AndrewMultilingualNeural-Male",
- tts_voice10="en-US-EmmaMultilingualNeural-Female",
- tts_voice11="en-US-AndrewMultilingualNeural-Male",
- video_output_name="",
- mix_method_audio="Adjusting volumes and mixing audio",
- max_accelerate_audio=2.1,
- acceleration_rate_regulation=False,
- volume_original_audio=0.25,
- volume_translated_audio=1.80,
- output_format_subtitle="srt",
- get_translated_text=False,
- get_video_from_text_json=False,
- text_json="{}",
- avoid_overlap=False,
- vocal_refinement=False,
- literalize_numbers=True,
- segment_duration_limit=15,
- diarization_model="pyannote_2.1",
- translate_process="google_translator_batch",
- subtitle_file=None,
- output_type="video (mp4)",
- voiceless_track=False,
- voice_imitation=False,
- voice_imitation_max_segments=3,
- voice_imitation_vocals_dereverb=False,
- voice_imitation_remove_previous=True,
- voice_imitation_method="freevc",
- dereverb_automatic_xtts=True,
- text_segmentation_scale="sentence",
- divide_text_segments_by="",
- soft_subtitles_to_video=True,
- burn_subtitles_to_video=False,
- enable_cache=True,
- custom_voices=False,
- custom_voices_workers=1,
- is_gui=False,
- progress=gr.Progress(),
- ):
- if not YOUR_HF_TOKEN:
- YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")
- if diarization_model == "disable" or max_speakers == 1:
- if YOUR_HF_TOKEN is None:
- YOUR_HF_TOKEN = ""
- elif not YOUR_HF_TOKEN:
- raise ValueError("No valid Hugging Face token")
- else:
- os.environ["YOUR_HF_TOKEN"] = YOUR_HF_TOKEN
-
- if (
- "gpt" in translate_process
- or transcriber_model == "OpenAI_API_Whisper"
- or "OpenAI-TTS" in tts_voice00
- ):
- check_openai_api_key()
-
- if media_file is None:
- media_file = (
- directory_input
- if os.path.exists(directory_input)
- else link_media
- )
- media_file = (
- media_file if isinstance(media_file, str) else media_file.name
- )
-
- if is_subtitle_file(media_file):
- subtitle_file = media_file
- media_file = ""
-
- if media_file is None:
- media_file = ""
-
- if not origin_language:
- origin_language = "Automatic detection"
-
- if origin_language in UNIDIRECTIONAL_L_LIST and not subtitle_file:
- raise ValueError(
- f"The language '{origin_language}' "
- "is not supported for transcription (ASR)."
- )
-
- if get_translated_text:
- self.edit_subs_complete = False
-        if get_video_from_text_json and not self.edit_subs_complete:
-            raise ValueError("Generate the transcription first.")
-
- if (
- ("sound" in output_type or output_type == "raw media")
- and (get_translated_text or get_video_from_text_json)
- ):
- raise ValueError(
- "Please disable 'edit generate subtitles' "
- f"first to acquire the {output_type}."
- )
-
- TRANSLATE_AUDIO_TO = LANGUAGES[target_language]
- SOURCE_LANGUAGE = LANGUAGES[origin_language]
-
- if (
- transcriber_model == "OpenAI_API_Whisper"
- and SOURCE_LANGUAGE == "zh-TW"
- ):
- logger.warning(
- "OpenAI API Whisper only supports Chinese (Simplified)."
- )
- SOURCE_LANGUAGE = "zh"
-
- if (
- text_segmentation_scale in ["word", "character"]
- and "subtitle" not in output_type
- ):
- wrn_lang = (
- "Text segmentation by words or characters is typically"
- " used for generating subtitles. If subtitles are not the"
- " intended output, consider selecting 'sentence' "
- "segmentation method to ensure optimal results."
-
- )
- warn_disp(wrn_lang, is_gui)
-
- if tts_voice00[:2].lower() != TRANSLATE_AUDIO_TO[:2].lower():
- wrn_lang = (
- "Make sure to select a 'TTS Speaker' suitable for"
- " the translation language to avoid errors with the TTS."
- )
- warn_disp(wrn_lang, is_gui)
-
- if "_XTTS_" in tts_voice00 and voice_imitation:
- wrn_lang = (
- "When you select XTTS, it is advisable "
- "to disable Voice Imitation."
- )
- warn_disp(wrn_lang, is_gui)
-
- if custom_voices and voice_imitation:
- wrn_lang = (
- "When you use R.V.C. models, it is advisable"
- " to disable Voice Imitation."
- )
- warn_disp(wrn_lang, is_gui)
-
- if not media_file and not subtitle_file:
- raise ValueError(
-                "Specify a media or SRT file in advanced settings"
- )
-
- if subtitle_file:
- subtitle_file = (
- subtitle_file
- if isinstance(subtitle_file, str)
- else subtitle_file.name
- )
-
- if subtitle_file and SOURCE_LANGUAGE == "Automatic detection":
- raise Exception(
- "To use an SRT file, you need to specify its "
- "original language (Source language)"
- )
-
- if not media_file and subtitle_file:
- diarization_model = "disable"
- media_file = "audio_support.wav"
- if not get_video_from_text_json:
- remove_files(media_file)
- srt_data = srt_file_to_segments(subtitle_file)
- total_duration = srt_data["segments"][-1]["end"] + 30.
- support_audio = AudioSegment.silent(
- duration=int(total_duration * 1000)
- )
- support_audio.export(
- media_file, format="wav"
- )
-                logger.info("Created supporting audio for the SRT file.")
-
- if "SET_LIMIT" == os.getenv("DEMO"):
- preview = True
- mix_method_audio = "Adjusting volumes and mixing audio"
- transcriber_model = "medium"
- logger.info(
- "DEMO; set preview=True; Generation is limited to "
- "10 seconds to prevent CPU errors. No limitations with GPU.\n"
- "DEMO; set Adjusting volumes and mixing audio\n"
- "DEMO; set whisper model to medium"
- )
-
- # Check GPU
- if self.device == "cpu" and compute_type not in COMPUTE_TYPE_CPU:
- logger.info("Compute type changed to float32")
- compute_type = "float32"
-
- base_video_file = "Video.mp4"
- base_audio_wav = "audio.wav"
- dub_audio_file = "audio_dub_solo.ogg"
- vocals_audio_file = "audio_Vocals_DeReverb.wav"
- voiceless_audio_file = "audio_Voiceless.wav"
- mix_audio_file = "audio_mix.mp3"
- vid_subs = "video_subs_file.mp4"
- video_output_file = "video_dub.mp4"
-
- if os.path.exists(media_file):
- media_base_hash = get_hash(media_file)
- else:
- media_base_hash = media_file
- self.clear_cache(media_base_hash, force=(not enable_cache))
-
- if not get_video_from_text_json:
- self.result_diarize = (
- self.align_language
- ) = self.result_source_lang = None
- if not self.task_in_cache("media", [media_base_hash, preview], {}):
- if is_audio_file(media_file):
- prog_disp(
- "Processing audio...", 0.15, is_gui, progress=progress
- )
- audio_preprocessor(preview, media_file, base_audio_wav)
- else:
- prog_disp(
- "Processing video...", 0.15, is_gui, progress=progress
- )
- audio_video_preprocessor(
- preview, media_file, base_video_file, base_audio_wav
- )
- logger.debug("Set file complete.")
-
- if "sound" in output_type:
- prog_disp(
- "Separating sounds in the file...",
- 0.50,
- is_gui,
- progress=progress
- )
- separate_out = sound_separate(base_audio_wav, output_type)
- final_outputs = []
- for out in separate_out:
- final_name = media_out(
- media_file,
- f"{get_no_ext_filename(out)}",
- video_output_name,
- "wav",
- file_obj=out,
- )
- final_outputs.append(final_name)
- logger.info(f"Done: {str(final_outputs)}")
- return final_outputs
-
- if output_type == "raw media":
- output = media_out(
- media_file,
- "raw_media",
- video_output_name,
- "wav" if is_audio_file(media_file) else "mp4",
- file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file,
- )
- logger.info(f"Done: {output}")
- return output
-
- if os.environ.get("IS_DEMO") == "TRUE":
- duration_verify = librosa.get_duration(filename=base_audio_wav)
- logger.info(f"Duration: {duration_verify} seconds")
- if duration_verify > 1500:
- raise RuntimeError(
- "The audio is too long to process in this demo. Alternatively, you"
- " can install the app locally or use the Colab notebook available "
- "in the SoniTranslate repository."
- )
- elif duration_verify > 300:
- tts_voices_list = [
- tts_voice00, tts_voice01, tts_voice02, tts_voice03, tts_voice04,
- tts_voice05, tts_voice06, tts_voice07, tts_voice08, tts_voice09,
- tts_voice10, tts_voice11
- ]
-
- for tts_voice_ in tts_voices_list:
- if "_XTTS_" in tts_voice_:
- raise RuntimeError(
- "XTTS is too slow to be used for audio longer than 5 "
- "minutes in this demo. Alternatively, you can install "
- "the app locally or use the Colab notebook available in"
- " the SoniTranslate repository."
- )
-
- if not self.task_in_cache("refine_vocals", [vocal_refinement], {}):
- self.vocals = None
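-            # Optional UVR pass: isolate and dereverb the vocals so that
-            # transcription and diarization run on cleaner speech.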
- if vocal_refinement:
- try:
- from soni_translate.mdx_net import process_uvr_task
- _, _, _, _, file_vocals = process_uvr_task(
- orig_song_path=base_audio_wav,
- main_vocals=False,
- dereverb=True,
- remove_files_output_dir=True,
- )
- remove_files(vocals_audio_file)
- copy_files(file_vocals, ".")
- self.vocals = vocals_audio_file
- except Exception as error:
- logger.error(str(error))
-
- if not self.task_in_cache("transcript_align", [
- subtitle_file,
- SOURCE_LANGUAGE,
- transcriber_model,
- compute_type,
- batch_size,
- literalize_numbers,
- segment_duration_limit,
- (
- "l_unit"
- if text_segmentation_scale in ["word", "character"]
- and subtitle_file
- else "sentence"
- )
- ], {"vocals": self.vocals}):
- if subtitle_file:
- prog_disp(
- "From SRT file...", 0.30, is_gui, progress=progress
- )
- audio = whisperx.load_audio(
- base_audio_wav if not self.vocals else self.vocals
- )
- self.result = srt_file_to_segments(subtitle_file)
- self.result["language"] = SOURCE_LANGUAGE
- else:
- prog_disp(
- "Transcribing...", 0.30, is_gui, progress=progress
- )
- SOURCE_LANGUAGE = (
- None
- if SOURCE_LANGUAGE == "Automatic detection"
- else SOURCE_LANGUAGE
- )
- audio, self.result = transcribe_speech(
- base_audio_wav if not self.vocals else self.vocals,
- transcriber_model,
- compute_type,
- batch_size,
- SOURCE_LANGUAGE,
- literalize_numbers,
- segment_duration_limit,
- )
- logger.debug(
- "Transcript complete, "
- f"segments count {len(self.result['segments'])}"
- )
-
- self.align_language = self.result["language"]
- if (
- not subtitle_file
- or text_segmentation_scale in ["word", "character"]
- ):
- prog_disp("Aligning...", 0.45, is_gui, progress=progress)
- try:
- if self.align_language in ["vi"]:
- logger.info(
- "Deficient alignment for the "
- f"{self.align_language} language, skipping the"
- " process. It is suggested to reduce the "
- "duration of the segments as an alternative."
- )
- else:
- self.result = align_speech(audio, self.result)
- logger.debug(
- "Align complete, "
- f"segments count {len(self.result['segments'])}"
- )
- except Exception as error:
- logger.error(str(error))
-
-        if not self.result["segments"]:
- raise ValueError("No active speech found in audio")
-
- if not self.task_in_cache("break_align", [
- divide_text_segments_by,
- text_segmentation_scale,
- self.align_language
- ], {
- "result": self.result,
- "align_language": self.align_language
- }):
- if self.align_language in ["ja", "zh", "zh-TW"]:
- divide_text_segments_by += "|!|?|...|。"
- if text_segmentation_scale in ["word", "character"]:
- self.result = linguistic_level_segments(
- self.result,
- text_segmentation_scale,
- )
- elif divide_text_segments_by:
- try:
- self.result = break_aling_segments(
- self.result,
- break_characters=divide_text_segments_by,
- )
- except Exception as error:
- logger.error(str(error))
-
- if not self.task_in_cache("diarize", [
- min_speakers,
- max_speakers,
- YOUR_HF_TOKEN[:len(YOUR_HF_TOKEN)//2],
- diarization_model
- ], {
- "result": self.result
- }):
- prog_disp("Diarizing...", 0.60, is_gui, progress=progress)
- diarize_model_select = diarization_models[diarization_model]
- self.result_diarize = diarize_speech(
- base_audio_wav if not self.vocals else self.vocals,
- self.result,
- min_speakers,
- max_speakers,
- YOUR_HF_TOKEN,
- diarize_model_select,
- )
- logger.debug("Diarize complete")
- self.result_source_lang = copy.deepcopy(self.result_diarize)
-
- if not self.task_in_cache("translate", [
- TRANSLATE_AUDIO_TO,
- translate_process
- ], {
- "result_diarize": self.result_diarize
- }):
- prog_disp("Translating...", 0.70, is_gui, progress=progress)
- lang_source = (
- self.align_language
- if self.align_language
- else SOURCE_LANGUAGE
- )
- self.result_diarize["segments"] = translate_text(
- self.result_diarize["segments"],
- TRANSLATE_AUDIO_TO,
- translate_process,
- chunk_size=1800,
- source=lang_source,
- )
- logger.debug("Translation complete")
- logger.debug(self.result_diarize)
-
- if get_translated_text:
-
- json_data = []
- for segment in self.result_diarize["segments"]:
- start = segment["start"]
- text = segment["text"]
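-                # "SPEAKER_XX" -> 1-based speaker number for the editor JSON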
- speaker = int(segment.get("speaker", "SPEAKER_00")[-2:]) + 1
- json_data.append(
- {"start": start, "text": text, "speaker": speaker}
- )
-
- # Convert list of dictionaries to a JSON string with indentation
- json_string = json.dumps(json_data, indent=2)
- logger.info("Done")
- self.edit_subs_complete = True
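-            # Decode \uXXXX escapes so non-Latin characters display readably
-            # in the subtitle editor textbox.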
- return json_string.encode().decode("unicode_escape")
-
- if get_video_from_text_json:
-
- if self.result_diarize is None:
- raise ValueError("Generate the transcription first.")
- # with open('text_json.json', 'r') as file:
- text_json_loaded = json.loads(text_json)
- for i, segment in enumerate(self.result_diarize["segments"]):
- segment["text"] = text_json_loaded[i]["text"]
- segment["speaker"] = "SPEAKER_{:02d}".format(
- int(text_json_loaded[i]["speaker"]) - 1
- )
-
- # Write subtitle
- if not self.task_in_cache("subs_and_edit", [
- copy.deepcopy(self.result_diarize),
- output_format_subtitle,
- TRANSLATE_AUDIO_TO
- ], {
- "result_diarize": self.result_diarize
- }):
- if output_format_subtitle == "disable":
- self.sub_file = "sub_tra.srt"
- elif output_format_subtitle != "ass":
- self.sub_file = process_subtitles(
- self.result_source_lang,
- self.align_language,
- self.result_diarize,
- output_format_subtitle,
- TRANSLATE_AUDIO_TO,
- )
-
-            # A plain SRT is always needed by later tasks (the "ass"
-            # conversion and burned-in subtitles), so generate it as well
- if output_format_subtitle != "srt":
- _ = process_subtitles(
- self.result_source_lang,
- self.align_language,
- self.result_diarize,
- "srt",
- TRANSLATE_AUDIO_TO,
- )
-
- if output_format_subtitle == "ass":
- convert_ori = "ffmpeg -i sub_ori.srt sub_ori.ass -y"
- convert_tra = "ffmpeg -i sub_tra.srt sub_tra.ass -y"
- self.sub_file = "sub_tra.ass"
- run_command(convert_ori)
- run_command(convert_tra)
-
- format_sub = (
- output_format_subtitle
- if output_format_subtitle != "disable"
- else "srt"
- )
-
- if output_type == "subtitle":
-
- out_subs = []
- tra_subs = media_out(
- media_file,
- TRANSLATE_AUDIO_TO,
- video_output_name,
- format_sub,
- file_obj=self.sub_file,
- )
- out_subs.append(tra_subs)
-
- ori_subs = media_out(
- media_file,
- self.align_language,
- video_output_name,
- format_sub,
- file_obj=f"sub_ori.{format_sub}",
- )
- out_subs.append(ori_subs)
- logger.info(f"Done: {out_subs}")
- return out_subs
-
- if output_type == "subtitle [by speaker]":
- output = get_subtitle_speaker(
- media_file,
- result=self.result_diarize,
- language=TRANSLATE_AUDIO_TO,
- extension=format_sub,
- base_name=video_output_name,
- )
- logger.info(f"Done: {str(output)}")
- return output
-
- if "video [subtitled]" in output_type:
- output = media_out(
- media_file,
- TRANSLATE_AUDIO_TO + "_subtitled",
- video_output_name,
- "wav" if is_audio_file(media_file) else (
- "mkv" if "mkv" in output_type else "mp4"
- ),
- file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file,
- soft_subtitles=False if is_audio_file(media_file) else True,
- subtitle_files=output_format_subtitle,
- )
- msg_out = output[0] if isinstance(output, list) else output
- logger.info(f"Done: {msg_out}")
- return output
-
- if not self.task_in_cache("tts", [
- TRANSLATE_AUDIO_TO,
- tts_voice00,
- tts_voice01,
- tts_voice02,
- tts_voice03,
- tts_voice04,
- tts_voice05,
- tts_voice06,
- tts_voice07,
- tts_voice08,
- tts_voice09,
- tts_voice10,
- tts_voice11,
- dereverb_automatic_xtts
- ], {
- "sub_file": self.sub_file
- }):
- prog_disp("Text to speech...", 0.80, is_gui, progress=progress)
- self.valid_speakers = audio_segmentation_to_voice(
- self.result_diarize,
- TRANSLATE_AUDIO_TO,
- is_gui,
- tts_voice00,
- tts_voice01,
- tts_voice02,
- tts_voice03,
- tts_voice04,
- tts_voice05,
- tts_voice06,
- tts_voice07,
- tts_voice08,
- tts_voice09,
- tts_voice10,
- tts_voice11,
- dereverb_automatic_xtts,
- )
-
- if not self.task_in_cache("acc_and_vc", [
- max_accelerate_audio,
- acceleration_rate_regulation,
- voice_imitation,
- voice_imitation_max_segments,
- voice_imitation_remove_previous,
- voice_imitation_vocals_dereverb,
- voice_imitation_method,
- custom_voices,
- custom_voices_workers,
- copy.deepcopy(self.vci.model_config),
- avoid_overlap
- ], {
- "valid_speakers": self.valid_speakers
- }):
- audio_files, speakers_list = accelerate_segments(
- self.result_diarize,
- max_accelerate_audio,
- self.valid_speakers,
- acceleration_rate_regulation,
- )
-
- # Voice Imitation (Tone color converter)
- if voice_imitation:
- prog_disp(
- "Voice Imitation...", 0.85, is_gui, progress=progress
- )
- from soni_translate.text_to_speech import toneconverter
-
- try:
- toneconverter(
- copy.deepcopy(self.result_diarize),
- voice_imitation_max_segments,
- voice_imitation_remove_previous,
- voice_imitation_vocals_dereverb,
- voice_imitation_method,
- )
- except Exception as error:
- logger.error(str(error))
-
- # custom voice
- if custom_voices:
- prog_disp(
- "Applying customized voices...",
- 0.90,
- is_gui,
- progress=progress,
- )
-
- try:
- self.vci(
- audio_files,
- speakers_list,
- overwrite=True,
- parallel_workers=custom_voices_workers,
- )
- self.vci.unload_models()
- except Exception as error:
- logger.error(str(error))
-
- prog_disp(
- "Creating final translated video...",
- 0.95,
- is_gui,
- progress=progress,
- )
- remove_files(dub_audio_file)
- create_translated_audio(
- self.result_diarize,
- audio_files,
- dub_audio_file,
- False,
- avoid_overlap,
- )
-
-        # Voiceless track: replace the original audio with its instrumental
- hash_base_audio_wav = get_hash(base_audio_wav)
- if voiceless_track:
- if self.voiceless_id != hash_base_audio_wav:
- from soni_translate.mdx_net import process_uvr_task
-
- try:
- # voiceless_audio_file_dir = "clean_song_output/voiceless"
- remove_files(voiceless_audio_file)
- uvr_voiceless_audio_wav, _ = process_uvr_task(
- orig_song_path=base_audio_wav,
- song_id="voiceless",
- only_voiceless=True,
- remove_files_output_dir=False,
- )
- copy_files(uvr_voiceless_audio_wav, ".")
- base_audio_wav = voiceless_audio_file
- self.voiceless_id = hash_base_audio_wav
-
- except Exception as error:
- logger.error(str(error))
- else:
- base_audio_wav = voiceless_audio_file
-
- if not self.task_in_cache("mix_aud", [
- mix_method_audio,
- volume_original_audio,
- volume_translated_audio,
- voiceless_track
- ], {}):
- # TYPE MIX AUDIO
- remove_files(mix_audio_file)
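-            # Volume mix: scale both tracks and amix them into one stream.
-            # Background mix: duck the original audio under the dub using
-            # sidechain compression keyed on the translated track.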
- command_volume_mix = f'ffmpeg -y -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[0:0]volume={volume_original_audio}[a];[1:0]volume={volume_translated_audio}[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio_file}'
- command_background_mix = f'ffmpeg -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio_file}'
- if mix_method_audio == "Adjusting volumes and mixing audio":
- # volume mix
- run_command(command_volume_mix)
- else:
- try:
- # background mix
- run_command(command_background_mix)
- except Exception as error_mix:
- # volume mix except
- logger.error(str(error_mix))
- run_command(command_volume_mix)
-
- if "audio" in output_type or is_audio_file(media_file):
- output = media_out(
- media_file,
- TRANSLATE_AUDIO_TO,
- video_output_name,
- "wav" if "wav" in output_type else (
- "ogg" if "ogg" in output_type else "mp3"
- ),
- file_obj=mix_audio_file,
- subtitle_files=output_format_subtitle,
- )
- msg_out = output[0] if isinstance(output, list) else output
- logger.info(f"Done: {msg_out}")
- return output
-
- hash_base_video_file = get_hash(base_video_file)
-
- if burn_subtitles_to_video:
- hashvideo_text = [
- hash_base_video_file,
- [seg["text"] for seg in self.result_diarize["segments"]]
- ]
- if self.burn_subs_id != hashvideo_text:
- try:
- logger.info("Burn subtitles")
- remove_files(vid_subs)
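-                    # Hard-burn sub_tra.srt into the video stream (re-encodes)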
- command = f"ffmpeg -i {base_video_file} -y -vf subtitles=sub_tra.srt -max_muxing_queue_size 9999 {vid_subs}"
- run_command(command)
- base_video_file = vid_subs
- self.burn_subs_id = hashvideo_text
- except Exception as error:
- logger.error(str(error))
- else:
- base_video_file = vid_subs
-
- if not self.task_in_cache("output", [
- hash_base_video_file,
- hash_base_audio_wav,
- burn_subtitles_to_video
- ], {}):
- # Merge new audio + video
- remove_files(video_output_file)
- run_command(
- f"ffmpeg -i {base_video_file} -i {mix_audio_file} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output_file}"
- )
-
- output = media_out(
- media_file,
- TRANSLATE_AUDIO_TO,
- video_output_name,
- "mkv" if "mkv" in output_type else "mp4",
- file_obj=video_output_file,
- soft_subtitles=soft_subtitles_to_video,
- subtitle_files=output_format_subtitle,
- )
- msg_out = output[0] if isinstance(output, list) else output
- logger.info(f"Done: {msg_out}")
-
- return output
-
- def hook_beta_processor(
- self,
- document,
- tgt_lang,
- translate_process,
- ori_lang,
- tts,
- name_final_file,
- custom_voices,
- custom_voices_workers,
- output_type,
- chunk_size,
- width,
- height,
- start_page,
- end_page,
- bcolor,
- is_gui,
- progress
- ):
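-        # Beta flow: render document pages to image + text, translate the
-        # text, narrate it with TTS, and merge pages and audio into a video.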
- prog_disp("Processing pages...", 0.10, is_gui, progress=progress)
- doc_data = doc_to_txtximg_pages(document, width, height, start_page, end_page, bcolor)
- result_diarize = page_data_to_segments(doc_data, 1700)
-
- prog_disp("Translating...", 0.20, is_gui, progress=progress)
- result_diarize["segments"] = translate_text(
- result_diarize["segments"],
- tgt_lang,
- translate_process,
- chunk_size=0,
- source=ori_lang,
- )
- chunk_size = (
- chunk_size if chunk_size else determine_chunk_size(tts)
- )
- doc_data = update_page_data(result_diarize, doc_data)
-
- prog_disp("Text to speech...", 0.30, is_gui, progress=progress)
- result_diarize = page_data_to_segments(doc_data, chunk_size)
- valid_speakers = audio_segmentation_to_voice(
- result_diarize,
- tgt_lang,
- is_gui,
- tts,
- )
-
- # fix format and set folder output
- audio_files, speakers_list = accelerate_segments(
- result_diarize,
- 1.0,
- valid_speakers,
- )
-
- # custom voice
- if custom_voices:
- prog_disp(
- "Applying customized voices...",
- 0.60,
- is_gui,
- progress=progress,
- )
- self.vci(
- audio_files,
- speakers_list,
- overwrite=True,
- parallel_workers=custom_voices_workers,
- )
- self.vci.unload_models()
-
-        # Update segment timestamps; keep audio files separate (no concat)
- result_diarize = fix_timestamps_docs(result_diarize, audio_files)
- final_wav_file = "audio_book.wav"
- remove_files(final_wav_file)
-
- prog_disp("Creating audio file...", 0.70, is_gui, progress=progress)
- create_translated_audio(
- result_diarize, audio_files, final_wav_file, False
- )
-
- prog_disp("Creating video file...", 0.80, is_gui, progress=progress)
- video_doc = create_video_from_images(
- doc_data,
- result_diarize
- )
-
- # Merge video and audio
- prog_disp("Merging...", 0.90, is_gui, progress=progress)
- vid_out = merge_video_and_audio(video_doc, final_wav_file)
-
- # End
- output = media_out(
- document,
- tgt_lang,
- name_final_file,
- "mkv" if "mkv" in output_type else "mp4",
- file_obj=vid_out,
- )
- logger.info(f"Done: {output}")
- return output
-
- def multilingual_docs_conversion(
- self,
- string_text="", # string
- document=None, # doc path gui
- directory_input="", # doc path
- origin_language="English (en)",
- target_language="English (en)",
- tts_voice00="en-US-EmmaMultilingualNeural-Female",
- name_final_file="",
- translate_process="google_translator",
- output_type="audio",
- chunk_size=None,
- custom_voices=False,
- custom_voices_workers=1,
- start_page=1,
- end_page=99999,
- width=1280,
- height=720,
- bcolor="dynamic",
- is_gui=False,
- progress=gr.Progress(),
- ):
- if "gpt" in translate_process:
- check_openai_api_key()
-
- SOURCE_LANGUAGE = LANGUAGES[origin_language]
- if translate_process != "disable_translation":
- TRANSLATE_AUDIO_TO = LANGUAGES[target_language]
- else:
- TRANSLATE_AUDIO_TO = SOURCE_LANGUAGE
- logger.info("No translation")
- if tts_voice00[:2].lower() != TRANSLATE_AUDIO_TO[:2].lower():
- logger.debug(
- "Make sure to select a 'TTS Speaker' suitable for the "
- "translation language to avoid errors with the TTS."
- )
-
- self.clear_cache(string_text, force=True)
-
- is_string = False
- if document is None:
- if os.path.exists(directory_input):
- document = directory_input
- else:
- document = string_text
- is_string = True
- document = document if isinstance(document, str) else document.name
- if not document:
- raise Exception("No data found")
-
- if os.environ.get("IS_DEMO") == "TRUE" and not is_string:
- raise RuntimeError(
- "This option is disabled in this demo. "
- "Alternatively, you can install "
- "the app locally or use the Colab notebook available in"
- " the SoniTranslate repository."
- )
-
- if "videobook" in output_type:
- if not document.lower().endswith(".pdf"):
- raise ValueError(
- "Videobooks are only compatible with PDF files."
- )
-
- return self.hook_beta_processor(
- document,
- TRANSLATE_AUDIO_TO,
- translate_process,
- SOURCE_LANGUAGE,
- tts_voice00,
- name_final_file,
- custom_voices,
- custom_voices_workers,
- output_type,
- chunk_size,
- width,
- height,
- start_page,
- end_page,
- bcolor,
- is_gui,
- progress
- )
-
- # audio_wav = "audio.wav"
- final_wav_file = "audio_book.wav"
-
- prog_disp("Processing text...", 0.15, is_gui, progress=progress)
- result_file_path, result_text = document_preprocessor(
- document, is_string, start_page, end_page
- )
-
- if (
- output_type == "book (txt)"
- and translate_process == "disable_translation"
- ):
- return result_file_path
-
- if "SET_LIMIT" == os.getenv("DEMO"):
- result_text = result_text[:50]
- logger.info(
- "DEMO; Generation is limited to 50 characters to prevent "
- "CPU errors. No limitations with GPU.\n"
- )
-
- if translate_process != "disable_translation":
- # chunks text for translation
- result_diarize = plain_text_to_segments(result_text, 1700)
- prog_disp("Translating...", 0.30, is_gui, progress=progress)
-            # segments already capped at 1700 chars; translate without
-            # further chunking (chunk_size=0)
- result_diarize["segments"] = translate_text(
- result_diarize["segments"],
- TRANSLATE_AUDIO_TO,
- translate_process,
- chunk_size=0,
- source=SOURCE_LANGUAGE,
- )
-
- txt_file_path, result_text = segments_to_plain_text(result_diarize)
-
- if output_type == "book (txt)":
- return media_out(
- result_file_path if is_string else document,
- TRANSLATE_AUDIO_TO,
- name_final_file,
- "txt",
- file_obj=txt_file_path,
- )
-
- # (TTS limits) plain text to result_diarize
- chunk_size = (
- chunk_size if chunk_size else determine_chunk_size(tts_voice00)
- )
- result_diarize = plain_text_to_segments(result_text, chunk_size)
- logger.debug(result_diarize)
-
- prog_disp("Text to speech...", 0.45, is_gui, progress=progress)
- valid_speakers = audio_segmentation_to_voice(
- result_diarize,
- TRANSLATE_AUDIO_TO,
- is_gui,
- tts_voice00,
- )
-
- # fix format and set folder output
- audio_files, speakers_list = accelerate_segments(
- result_diarize,
- 1.0,
- valid_speakers,
- )
-
- # custom voice
- if custom_voices:
- prog_disp(
- "Applying customized voices...",
- 0.80,
- is_gui,
- progress=progress,
- )
- self.vci(
- audio_files,
- speakers_list,
- overwrite=True,
- parallel_workers=custom_voices_workers,
- )
- self.vci.unload_models()
-
- prog_disp(
- "Creating final audio file...", 0.90, is_gui, progress=progress
- )
- remove_files(final_wav_file)
- create_translated_audio(
- result_diarize, audio_files, final_wav_file, True
- )
-
- output = media_out(
- result_file_path if is_string else document,
- TRANSLATE_AUDIO_TO,
- name_final_file,
- "mp3" if "mp3" in output_type else (
- "ogg" if "ogg" in output_type else "wav"
- ),
- file_obj=final_wav_file,
- )
-
- logger.info(f"Done: {output}")
-
- return output
-
-
-title = "📽️ SoniTranslate 🈷️"
-
-
-def create_gui(theme, logs_in_gui=False):
- with gr.Blocks(theme=theme) as app:
- gr.Markdown(title)
- gr.Markdown(lg_conf["description"])
-
-        if os.environ.get("ZERO_GPU") == "TRUE":
-            gr.Markdown(
-                """
-⚠️ **Important** ⚠️
-
-- 🚀 This demo uses ZeroGPU only for the transcription and diarization steps; everything else runs on the CPU. Videos no longer than 15 minutes are recommended. ⏳
-- ❗ If you see `queue` while using this, another user is currently running a task, and you need to wait until they are finished.
-- 🔒 Some functions are disabled, but if you duplicate this Space with a GPU and set the secret "ZERO_GPU" to FALSE, you can use the app with full GPU acceleration. ⚡
-                """
-            )
-
- with gr.Tab(lg_conf["tab_translate"]):
- with gr.Row():
- with gr.Column():
- input_data_type = gr.Dropdown(
- ["SUBMIT VIDEO", "URL", "Find Video Path"],
- value="SUBMIT VIDEO",
- label=lg_conf["video_source"],
- )
-
- def swap_visibility(data_type):
- if data_type == "URL":
- return (
- gr.update(visible=False, value=None),
- gr.update(visible=True, value=""),
- gr.update(visible=False, value=""),
- )
- elif data_type == "SUBMIT VIDEO":
- return (
- gr.update(visible=True, value=None),
- gr.update(visible=False, value=""),
- gr.update(visible=False, value=""),
- )
- elif data_type == "Find Video Path":
- return (
- gr.update(visible=False, value=None),
- gr.update(visible=False, value=""),
- gr.update(visible=True, value=""),
- )
-
- video_input = gr.File(
- label="VIDEO",
- file_count="multiple",
- type="filepath",
- )
- blink_input = gr.Textbox(
- visible=False,
- label=lg_conf["link_label"],
- info=lg_conf["link_info"],
- placeholder=lg_conf["link_ph"],
- )
- directory_input = gr.Textbox(
- visible=False,
- label=lg_conf["dir_label"],
- info=lg_conf["dir_info"],
- placeholder=lg_conf["dir_ph"],
- )
- input_data_type.change(
- fn=swap_visibility,
- inputs=input_data_type,
- outputs=[video_input, blink_input, directory_input],
- )
-
- gr.HTML()
-
- SOURCE_LANGUAGE = gr.Dropdown(
- LANGUAGES_LIST,
- value=LANGUAGES_LIST[0],
- label=lg_conf["sl_label"],
- info=lg_conf["sl_info"],
- )
- TRANSLATE_AUDIO_TO = gr.Dropdown(
- LANGUAGES_LIST[1:],
- value="English (en)",
- label=lg_conf["tat_label"],
- info=lg_conf["tat_info"],
- )
-
-                    gr.HTML("<hr>")
-
- gr.Markdown(lg_conf["num_speakers"])
- MAX_TTS = 12
- min_speakers = gr.Slider(
- 1,
- MAX_TTS,
- value=1,
- label=lg_conf["min_sk"],
- step=1,
- visible=False,
- )
- max_speakers = gr.Slider(
- 1,
- MAX_TTS,
- value=1,
- step=1,
- label=lg_conf["max_sk"],
- )
- gr.Markdown(lg_conf["tts_select"])
-
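-                    # Show only the first `value` TTS speaker dropdowns.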
- def submit(value):
- visibility_dict = {
- f"tts_voice{i:02d}": gr.update(visible=i < value)
- for i in range(MAX_TTS)
- }
-                        return list(visibility_dict.values())
-
- tts_voice00 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="en-US-EmmaMultilingualNeural-Female",
- label=lg_conf["sk1"],
- visible=True,
- interactive=True,
- )
- tts_voice01 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="en-US-AndrewMultilingualNeural-Male",
- label=lg_conf["sk2"],
- visible=False,
- interactive=True,
- )
- tts_voice02 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="en-US-AvaMultilingualNeural-Female",
- label=lg_conf["sk3"],
- visible=False,
- interactive=True,
- )
- tts_voice03 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="en-US-BrianMultilingualNeural-Male",
- label=lg_conf["sk4"],
- visible=False,
- interactive=True,
- )
- tts_voice04 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="de-DE-SeraphinaMultilingualNeural-Female",
-                        label=lg_conf["sk5"],
- visible=False,
- interactive=True,
- )
- tts_voice05 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="de-DE-FlorianMultilingualNeural-Male",
- label=lg_conf["sk6"],
- visible=False,
- interactive=True,
- )
- tts_voice06 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="fr-FR-VivienneMultilingualNeural-Female",
- label=lg_conf["sk7"],
- visible=False,
- interactive=True,
- )
- tts_voice07 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="fr-FR-RemyMultilingualNeural-Male",
- label=lg_conf["sk8"],
- visible=False,
- interactive=True,
- )
- tts_voice08 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="en-US-EmmaMultilingualNeural-Female",
- label=lg_conf["sk9"],
- visible=False,
- interactive=True,
- )
- tts_voice09 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="en-US-AndrewMultilingualNeural-Male",
- label=lg_conf["sk10"],
- visible=False,
- interactive=True,
- )
- tts_voice10 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="en-US-EmmaMultilingualNeural-Female",
- label=lg_conf["sk11"],
- visible=False,
- interactive=True,
- )
- tts_voice11 = gr.Dropdown(
- SoniTr.tts_info.tts_list(),
- value="en-US-AndrewMultilingualNeural-Male",
- label=lg_conf["sk12"],
- visible=False,
- interactive=True,
- )
- max_speakers.change(
- submit,
- max_speakers,
- [
- tts_voice00,
- tts_voice01,
- tts_voice02,
- tts_voice03,
- tts_voice04,
- tts_voice05,
- tts_voice06,
- tts_voice07,
- tts_voice08,
- tts_voice09,
- tts_voice10,
- tts_voice11,
- ],
- )
-
- with gr.Column():
- with gr.Accordion(
- lg_conf["vc_title"],
- open=False,
- ):
- gr.Markdown(lg_conf["vc_subtitle"])
- voice_imitation_gui = gr.Checkbox(
- False,
- label=lg_conf["vc_active_label"],
- info=lg_conf["vc_active_info"],
- )
- openvoice_models = ["openvoice", "openvoice_v2"]
- voice_imitation_method_options = (
- ["freevc"] + openvoice_models
- if SoniTr.tts_info.xtts_enabled
- else openvoice_models
- )
- voice_imitation_method_gui = gr.Dropdown(
- voice_imitation_method_options,
- value=voice_imitation_method_options[-1],
- label=lg_conf["vc_method_label"],
- info=lg_conf["vc_method_info"],
- )
- voice_imitation_max_segments_gui = gr.Slider(
- label=lg_conf["vc_segments_label"],
- info=lg_conf["vc_segments_info"],
- value=3,
- step=1,
- minimum=1,
- maximum=10,
- visible=True,
- interactive=True,
- )
- voice_imitation_vocals_dereverb_gui = gr.Checkbox(
- False,
- label=lg_conf["vc_dereverb_label"],
- info=lg_conf["vc_dereverb_info"],
- )
- voice_imitation_remove_previous_gui = gr.Checkbox(
- True,
- label=lg_conf["vc_remove_label"],
- info=lg_conf["vc_remove_info"],
- )
-
- if SoniTr.tts_info.xtts_enabled:
- with gr.Column():
- with gr.Accordion(
- lg_conf["xtts_title"],
- open=False,
- ):
- gr.Markdown(lg_conf["xtts_subtitle"])
- wav_speaker_file = gr.File(
- label=lg_conf["xtts_file_label"]
- )
- wav_speaker_name = gr.Textbox(
- label=lg_conf["xtts_name_label"],
- value="",
- info=lg_conf["xtts_name_info"],
- placeholder="default_name",
- lines=1,
- )
- wav_speaker_start = gr.Number(
-                            label="Audio start time",
- value=0,
- visible=False,
- )
- wav_speaker_end = gr.Number(
-                            label="Audio end time",
- value=0,
- visible=False,
- )
- wav_speaker_dir = gr.Textbox(
-                            label="Save directory",
- value="_XTTS_",
- visible=False,
- )
- wav_speaker_dereverb = gr.Checkbox(
- True,
- label=lg_conf["xtts_dereverb_label"],
- info=lg_conf["xtts_dereverb_info"]
- )
- wav_speaker_output = gr.HTML()
- create_xtts_wav = gr.Button(
- lg_conf["xtts_button"]
- )
- gr.Markdown(lg_conf["xtts_footer"])
- else:
- wav_speaker_dereverb = gr.Checkbox(
- False,
- label=lg_conf["xtts_dereverb_label"],
- info=lg_conf["xtts_dereverb_info"],
- visible=False
- )
-
- with gr.Column():
- with gr.Accordion(
- lg_conf["extra_setting"], open=False
- ):
- audio_accelerate = gr.Slider(
- label=lg_conf["acc_max_label"],
- value=1.9,
- step=0.1,
- minimum=1.0,
- maximum=2.5,
- visible=True,
- interactive=True,
- info=lg_conf["acc_max_info"],
- )
- acceleration_rate_regulation_gui = gr.Checkbox(
- False,
- label=lg_conf["acc_rate_label"],
- info=lg_conf["acc_rate_info"],
- )
- avoid_overlap_gui = gr.Checkbox(
- False,
- label=lg_conf["or_label"],
- info=lg_conf["or_info"],
- )
-
-                        gr.HTML("<hr>")
-
- audio_mix_options = [
- "Mixing audio with sidechain compression",
- "Adjusting volumes and mixing audio",
- ]
- AUDIO_MIX = gr.Dropdown(
- audio_mix_options,
- value=audio_mix_options[1],
- label=lg_conf["aud_mix_label"],
- info=lg_conf["aud_mix_info"],
- )
- volume_original_mix = gr.Slider(
- label=lg_conf["vol_ori"],
- info="for Adjusting volumes and mixing audio",
- value=0.25,
- step=0.05,
- minimum=0.0,
- maximum=2.50,
- visible=True,
- interactive=True,
- )
- volume_translated_mix = gr.Slider(
- label=lg_conf["vol_tra"],
- info="for Adjusting volumes and mixing audio",
- value=1.80,
- step=0.05,
- minimum=0.0,
- maximum=2.50,
- visible=True,
- interactive=True,
- )
- main_voiceless_track = gr.Checkbox(
- label=lg_conf["voiceless_tk_label"],
- info=lg_conf["voiceless_tk_info"],
- )
-
-                        gr.HTML("<hr>")
- sub_type_options = [
- "disable",
- "srt",
- "vtt",
- "ass",
- "txt",
- "tsv",
- "json",
- "aud",
- ]
-
- sub_type_output = gr.Dropdown(
- sub_type_options,
- value=sub_type_options[1],
- label=lg_conf["sub_type"],
- )
- soft_subtitles_to_video_gui = gr.Checkbox(
- label=lg_conf["soft_subs_label"],
- info=lg_conf["soft_subs_info"],
- )
- burn_subtitles_to_video_gui = gr.Checkbox(
- label=lg_conf["burn_subs_label"],
- info=lg_conf["burn_subs_info"],
- )
-
-                        gr.HTML("<hr>")
- gr.Markdown(lg_conf["whisper_title"])
- literalize_numbers_gui = gr.Checkbox(
- True,
- label=lg_conf["lnum_label"],
- info=lg_conf["lnum_info"],
- )
- vocal_refinement_gui = gr.Checkbox(
- False,
- label=lg_conf["scle_label"],
- info=lg_conf["scle_info"],
- )
- segment_duration_limit_gui = gr.Slider(
- label=lg_conf["sd_limit_label"],
- info=lg_conf["sd_limit_info"],
- value=15,
- step=1,
- minimum=1,
- maximum=30,
- )
- whisper_model_default = (
- "large-v3"
- if SoniTr.device == "cuda"
- else "medium"
- )
-
- WHISPER_MODEL_SIZE = gr.Dropdown(
- ASR_MODEL_OPTIONS + find_whisper_models(),
- value=whisper_model_default,
- label="Whisper ASR model",
- info=lg_conf["asr_model_info"],
- allow_custom_value=True,
- )
- com_t_opt, com_t_default = (
- [COMPUTE_TYPE_GPU, "float16"]
- if SoniTr.device == "cuda"
- else [COMPUTE_TYPE_CPU, "float32"]
- )
- compute_type = gr.Dropdown(
- com_t_opt,
- value=com_t_default,
- label=lg_conf["ctype_label"],
- info=lg_conf["ctype_info"],
- )
- batch_size_value = 8 if os.environ.get("ZERO_GPU") != "TRUE" else 32
- batch_size = gr.Slider(
- minimum=1,
- maximum=32,
- value=batch_size_value,
- label=lg_conf["batchz_label"],
- info=lg_conf["batchz_info"],
- step=1,
- )
- input_srt = gr.File(
- label=lg_conf["srt_file_label"],
- file_types=[".srt", ".ass", ".vtt"],
- height=130,
- )
-
-                        gr.HTML("<hr>")
- text_segmentation_options = [
- "sentence",
- "word",
- "character"
- ]
- text_segmentation_scale_gui = gr.Dropdown(
- text_segmentation_options,
- value=text_segmentation_options[0],
- label=lg_conf["tsscale_label"],
- info=lg_conf["tsscale_info"],
- )
- divide_text_segments_by_gui = gr.Textbox(
- label=lg_conf["divide_text_label"],
- value="",
- info=lg_conf["divide_text_info"],
- )
-
-                        gr.HTML("<hr>")
- pyannote_models_list = list(
- diarization_models.keys()
- )
- diarization_process_dropdown = gr.Dropdown(
- pyannote_models_list,
- value=pyannote_models_list[1],
- label=lg_conf["diarization_label"],
- )
- translate_process_dropdown = gr.Dropdown(
- TRANSLATION_PROCESS_OPTIONS,
- value=TRANSLATION_PROCESS_OPTIONS[0],
- label=lg_conf["tr_process_label"],
- )
-
-                        gr.HTML("<hr>")
- main_output_type = gr.Dropdown(
- OUTPUT_TYPE_OPTIONS,
- value=OUTPUT_TYPE_OPTIONS[0],
- label=lg_conf["out_type_label"],
- )
- VIDEO_OUTPUT_NAME = gr.Textbox(
- label=lg_conf["out_name_label"],
- value="",
- info=lg_conf["out_name_info"],
- )
- play_sound_gui = gr.Checkbox(
- True,
- label=lg_conf["task_sound_label"],
- info=lg_conf["task_sound_info"],
- )
- enable_cache_gui = gr.Checkbox(
- True,
- label=lg_conf["cache_label"],
- info=lg_conf["cache_info"],
- )
- PREVIEW = gr.Checkbox(
- label="Preview", info=lg_conf["preview_info"]
- )
- is_gui_dummy_check = gr.Checkbox(
- True, visible=False
- )
-
- with gr.Column(variant="compact"):
- edit_sub_check = gr.Checkbox(
- label=lg_conf["edit_sub_label"],
- info=lg_conf["edit_sub_info"],
-                    interactive=(os.environ.get("IS_DEMO") != "TRUE"),
- )
- dummy_false_check = gr.Checkbox(
- False,
- visible=False,
- )
-
-                def visible_component_subs(input_bool):
-                    return (
-                        gr.update(visible=input_bool),
-                        gr.update(visible=input_bool),
-                    )
-
- subs_button = gr.Button(
- lg_conf["button_subs"],
- variant="primary",
- visible=False,
- )
- subs_edit_space = gr.Textbox(
- visible=False,
- lines=10,
- label=lg_conf["editor_sub_label"],
- info=lg_conf["editor_sub_info"],
- placeholder=lg_conf["editor_sub_ph"],
- )
- edit_sub_check.change(
- visible_component_subs,
- [edit_sub_check],
- [subs_button, subs_edit_space],
- )
-
- with gr.Row():
- video_button = gr.Button(
- lg_conf["button_translate"],
- variant="primary",
- )
- with gr.Row():
-                video_output = gr.File(
-                    label=lg_conf["output_result_label"],
-                    file_count="multiple",
-                    interactive=False,
-                )  # gr.Video()
-
-            gr.HTML("<hr>")
-
-            HFKEY = gr.Textbox(
-                visible=not os.getenv("YOUR_HF_TOKEN"),
-                label="HF Token",
-                info=lg_conf["ht_token_info"],
-                placeholder=lg_conf["ht_token_ph"],
-            )
-
- gr.Examples(
- examples=[
- [
- ["./assets/Video_main.mp4"],
- "",
- "",
- "",
- False,
- whisper_model_default,
- batch_size_value,
- com_t_default,
- "Spanish (es)",
- "English (en)",
- 1,
- 2,
- "en-US-EmmaMultilingualNeural-Female",
- "en-US-AndrewMultilingualNeural-Male",
- ],
- ], # no update
- fn=SoniTr.batch_multilingual_media_conversion,
- inputs=[
- video_input,
- blink_input,
- directory_input,
- HFKEY,
- PREVIEW,
- WHISPER_MODEL_SIZE,
- batch_size,
- compute_type,
- SOURCE_LANGUAGE,
- TRANSLATE_AUDIO_TO,
- min_speakers,
- max_speakers,
- tts_voice00,
- tts_voice01,
- ],
- outputs=[video_output],
- cache_examples=False,
- )
-
- with gr.Tab(lg_conf["tab_docs"]):
- with gr.Column():
- with gr.Accordion("Docs", open=True):
- with gr.Column(variant="compact"):
- with gr.Column():
- input_doc_type = gr.Dropdown(
- [
- "WRITE TEXT",
- "SUBMIT DOCUMENT",
- "Find Document Path",
- ],
- value="SUBMIT DOCUMENT",
- label=lg_conf["docs_input_label"],
- info=lg_conf["docs_input_info"],
- )
-
- def swap_visibility(data_type):
- if data_type == "WRITE TEXT":
- return (
- gr.update(visible=True, value=""),
- gr.update(visible=False, value=None),
- gr.update(visible=False, value=""),
- )
- elif data_type == "SUBMIT DOCUMENT":
- return (
- gr.update(visible=False, value=""),
- gr.update(visible=True, value=None),
- gr.update(visible=False, value=""),
- )
- elif data_type == "Find Document Path":
- return (
- gr.update(visible=False, value=""),
- gr.update(visible=False, value=None),
- gr.update(visible=True, value=""),
- )
-
- text_docs = gr.Textbox(
- label="Text",
- value="This is an example",
- info="Write a text",
- placeholder="...",
- lines=5,
- visible=False,
- )
- input_docs = gr.File(
- label="Document", visible=True
- )
- directory_input_docs = gr.Textbox(
- visible=False,
- label="Document Path",
- info="Example: /home/my_doc.pdf",
- placeholder="Path goes here...",
- )
- input_doc_type.change(
- fn=swap_visibility,
- inputs=input_doc_type,
- outputs=[
- text_docs,
- input_docs,
- directory_input_docs,
- ],
- )
-
- gr.HTML()
-
- tts_documents = gr.Dropdown(
- list(
- filter(
- lambda x: x != "_XTTS_/AUTOMATIC.wav",
- SoniTr.tts_info.tts_list(),
- )
- ),
- value="en-US-EmmaMultilingualNeural-Female",
- label="TTS",
- visible=True,
- interactive=True,
- )
-
- gr.HTML()
-
- docs_SOURCE_LANGUAGE = gr.Dropdown(
- LANGUAGES_LIST[1:],
- value="English (en)",
- label=lg_conf["sl_label"],
- info=lg_conf["docs_source_info"],
- )
- docs_TRANSLATE_TO = gr.Dropdown(
- LANGUAGES_LIST[1:],
- value="English (en)",
- label=lg_conf["tat_label"],
- info=lg_conf["tat_info"],
- )
-
- with gr.Column():
- with gr.Accordion(
- lg_conf["extra_setting"], open=False
- ):
- docs_translate_process_dropdown = gr.Dropdown(
- DOCS_TRANSLATION_PROCESS_OPTIONS,
- value=DOCS_TRANSLATION_PROCESS_OPTIONS[
- 0
- ],
- label="Translation process",
- )
-
-                                gr.HTML("<hr>")
-
- docs_output_type = gr.Dropdown(
- DOCS_OUTPUT_TYPE_OPTIONS,
- value=DOCS_OUTPUT_TYPE_OPTIONS[2],
- label="Output type",
- )
- docs_OUTPUT_NAME = gr.Textbox(
- label="Final file name",
- value="",
- info=lg_conf["out_name_info"],
- )
- docs_chunk_size = gr.Number(
- label=lg_conf["chunk_size_label"],
- value=0,
- visible=True,
- interactive=True,
- info=lg_conf["chunk_size_info"],
- )
-                                gr.HTML("<hr>")
- start_page_gui = gr.Number(
- step=1,
- value=1,
- minimum=1,
- maximum=99999,
- label="Start page",
- )
- end_page_gui = gr.Number(
- step=1,
- value=99999,
- minimum=1,
- maximum=99999,
- label="End page",
- )
-                                gr.HTML("<hr>Videobook config")
- videobook_width_gui = gr.Number(
- step=1,
- value=1280,
- minimum=100,
- maximum=4096,
- label="Width",
- )
- videobook_height_gui = gr.Number(
- step=1,
- value=720,
- minimum=100,
- maximum=4096,
- label="Height",
- )
- videobook_bcolor_gui = gr.Dropdown(
- BORDER_COLORS,
- value=BORDER_COLORS[0],
- label="Border color",
- )
- docs_dummy_check = gr.Checkbox(
- True, visible=False
- )
-
- with gr.Row():
- docs_button = gr.Button(
- lg_conf["docs_button"],
- variant="primary",
- )
- with gr.Row():
- docs_output = gr.File(
- label="Result",
- interactive=False,
- )
-
- with gr.Tab("Custom voice R.V.C. (Optional)"):
-
- with gr.Column():
- with gr.Accordion("Get the R.V.C. Models", open=True):
- url_links = gr.Textbox(
- label="URLs",
- value="",
- info=lg_conf["cv_url_info"],
- placeholder="urls here...",
- lines=1,
- )
- download_finish = gr.HTML()
- download_button = gr.Button("DOWNLOAD MODELS")
-
- def update_models():
- models_path, index_path = upload_model_list()
-
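-                        # MAX_TTS voice configs plus one extra pair for the
-                        # "Test R.V.C." tab's model/index dropdowns.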
- dict_models = {
- f"fmodel{i:02d}": gr.update(
- choices=models_path
- )
- for i in range(MAX_TTS+1)
- }
- dict_index = {
- f"findex{i:02d}": gr.update(
- choices=index_path, value=None
- )
- for i in range(MAX_TTS+1)
- }
- dict_changes = {**dict_models, **dict_index}
-                        return list(dict_changes.values())
-
- with gr.Column():
- with gr.Accordion(lg_conf["replace_title"], open=False):
- with gr.Column(variant="compact"):
- with gr.Column():
- gr.Markdown(lg_conf["sec1_title"])
- enable_custom_voice = gr.Checkbox(
- False,
- label="ENABLE",
- info=lg_conf["enable_replace"]
- )
- workers_custom_voice = gr.Number(
- step=1,
- value=1,
- minimum=1,
- maximum=50,
- label="workers",
- visible=False,
- )
-
- gr.Markdown(lg_conf["sec2_title"])
- gr.Markdown(lg_conf["sec2_subtitle"])
-
- PITCH_ALGO_OPT = [
- "pm",
- "harvest",
- "crepe",
- "rmvpe",
- "rmvpe+",
- ]
-
- def model_conf():
- return gr.Dropdown(
- models_path,
- # value="",
- label="Model",
- visible=True,
- interactive=True,
- )
-
- def pitch_algo_conf():
- return gr.Dropdown(
- PITCH_ALGO_OPT,
- value=PITCH_ALGO_OPT[3],
- label="Pitch algorithm",
- visible=True,
- interactive=True,
- )
-
- def pitch_lvl_conf():
- return gr.Slider(
- label="Pitch level",
- minimum=-24,
- maximum=24,
- step=1,
- value=0,
- visible=True,
- interactive=True,
- )
-
- def index_conf():
- return gr.Dropdown(
- index_path,
- value=None,
- label="Index",
- visible=True,
- interactive=True,
- )
-
- def index_inf_conf():
- return gr.Slider(
- minimum=0,
- maximum=1,
- label="Index influence",
- value=0.75,
- )
-
- def respiration_filter_conf():
- return gr.Slider(
- minimum=0,
- maximum=7,
- label="Respiration median filtering",
- value=3,
- step=1,
- interactive=True,
- )
-
- def envelope_ratio_conf():
- return gr.Slider(
- minimum=0,
- maximum=1,
- label="Envelope ratio",
- value=0.25,
- interactive=True,
- )
-
- def consonant_protec_conf():
- return gr.Slider(
- minimum=0,
- maximum=0.5,
- label="Consonant breath protection",
- value=0.5,
- interactive=True,
- )
-
- def button_conf(tts_name):
- return gr.Button(
- lg_conf["cv_button_apply"]+" "+tts_name,
- variant="primary",
- )
-
- TTS_TABS = [
- 'TTS Speaker {:02d}'.format(i) for i in range(1, MAX_TTS+1)
- ]
-
- CV_SUBTITLES = [
- lg_conf["cv_tts1"],
- lg_conf["cv_tts2"],
- lg_conf["cv_tts3"],
- lg_conf["cv_tts4"],
- lg_conf["cv_tts5"],
- lg_conf["cv_tts6"],
- lg_conf["cv_tts7"],
- lg_conf["cv_tts8"],
- lg_conf["cv_tts9"],
- lg_conf["cv_tts10"],
- lg_conf["cv_tts11"],
- lg_conf["cv_tts12"],
- ]
-
- configs_storage = []
-
- for i in range(MAX_TTS): # Loop from 00 to 11
- with gr.Accordion(CV_SUBTITLES[i], open=False):
- gr.Markdown(TTS_TABS[i])
- with gr.Column():
- tag_gui = gr.Textbox(
- value=TTS_TABS[i], visible=False
- )
- model_gui = model_conf()
- pitch_algo_gui = pitch_algo_conf()
- pitch_lvl_gui = pitch_lvl_conf()
- index_gui = index_conf()
- index_inf_gui = index_inf_conf()
- rmf_gui = respiration_filter_conf()
- er_gui = envelope_ratio_conf()
- cbp_gui = consonant_protec_conf()
-
- with gr.Row(variant="compact"):
- button_config = button_conf(
- TTS_TABS[i]
- )
-
- confirm_conf = gr.HTML()
-
- button_config.click(
- SoniTr.vci.apply_conf,
- inputs=[
- tag_gui,
- model_gui,
- pitch_algo_gui,
- pitch_lvl_gui,
- index_gui,
- index_inf_gui,
- rmf_gui,
- er_gui,
- cbp_gui,
- ],
- outputs=[confirm_conf],
- )
-
- configs_storage.append({
- "tag": tag_gui,
- "model": model_gui,
- "index": index_gui,
- })
-
- with gr.Column():
- with gr.Accordion("Test R.V.C.", open=False):
- with gr.Row(variant="compact"):
- text_test = gr.Textbox(
- label="Text",
- value="This is an example",
- info="write a text",
- placeholder="...",
- lines=5,
- )
- with gr.Column():
- tts_test = gr.Dropdown(
- sorted(SoniTr.tts_info.list_edge),
- value="en-GB-ThomasNeural-Male",
- label="TTS",
- visible=True,
- interactive=True,
- )
- model_test = model_conf()
- index_test = index_conf()
- pitch_test = pitch_lvl_conf()
- pitch_alg_test = pitch_algo_conf()
- with gr.Row(variant="compact"):
- button_test = gr.Button("Test audio")
-
- with gr.Column():
- with gr.Row():
- original_ttsvoice = gr.Audio()
- ttsvoice = gr.Audio()
-
- button_test.click(
- SoniTr.vci.make_test,
- inputs=[
- text_test,
- tts_test,
- model_test,
- index_test,
- pitch_test,
- pitch_alg_test,
- ],
- outputs=[ttsvoice, original_ttsvoice],
- )
-
- download_button.click(
- download_list,
- [url_links],
- [download_finish],
- queue=False
- ).then(
- update_models,
- [],
- [
- elem["model"] for elem in configs_storage
- ] + [model_test] + [
- elem["index"] for elem in configs_storage
- ] + [index_test],
- )
-
- with gr.Tab(lg_conf["tab_help"]):
- gr.Markdown(lg_conf["tutorial"])
- gr.Markdown(news)
-
- def play_sound_alert(play_sound):
-
- if not play_sound:
- return None
-
- # silent_sound = "assets/empty_audio.mp3"
- sound_alert = "assets/sound_alert.mp3"
-
- time.sleep(0.25)
- # yield silent_sound
- yield None
-
- time.sleep(0.25)
- yield sound_alert
-
- sound_alert_notification = gr.Audio(
- value=None,
- type="filepath",
- format="mp3",
- autoplay=True,
- visible=False,
- )
-
- if logs_in_gui:
- logger.info("Logs in gui need public url")
-
- class Logger:
- def __init__(self, filename):
- self.terminal = sys.stdout
- self.log = open(filename, "w")
-
- def write(self, message):
- self.terminal.write(message)
- self.log.write(message)
-
- def flush(self):
- self.terminal.flush()
- self.log.flush()
-
- def isatty(self):
- return False
-
- sys.stdout = Logger("output.log")
-
- def read_logs():
- sys.stdout.flush()
- with open("output.log", "r") as f:
- return f.read()
-
- with gr.Accordion("Logs", open=False):
- logs = gr.Textbox(label=">>>")
- app.load(read_logs, None, logs, every=1)
-
- if SoniTr.tts_info.xtts_enabled:
- # Update tts list
- def update_tts_list():
- update_dict = {
- f"tts_voice{i:02d}": gr.update(choices=SoniTr.tts_info.tts_list())
- for i in range(MAX_TTS)
- }
- update_dict["tts_documents"] = gr.update(
- choices=list(
- filter(
- lambda x: x != "_XTTS_/AUTOMATIC.wav",
- SoniTr.tts_info.tts_list(),
- )
- )
- )
- return [value for value in update_dict.values()]
-
- create_xtts_wav.click(
- create_wav_file_vc,
- inputs=[
- wav_speaker_name,
- wav_speaker_file,
- wav_speaker_start,
- wav_speaker_end,
- wav_speaker_dir,
- wav_speaker_dereverb,
- ],
- outputs=[wav_speaker_output],
- ).then(
- update_tts_list,
- None,
- [
- tts_voice00,
- tts_voice01,
- tts_voice02,
- tts_voice03,
- tts_voice04,
- tts_voice05,
- tts_voice06,
- tts_voice07,
- tts_voice08,
- tts_voice09,
- tts_voice10,
- tts_voice11,
- tts_documents,
- ],
- )
-
- # Run translate text
- subs_button.click(
- SoniTr.batch_multilingual_media_conversion,
- inputs=[
- video_input,
- blink_input,
- directory_input,
- HFKEY,
- PREVIEW,
- WHISPER_MODEL_SIZE,
- batch_size,
- compute_type,
- SOURCE_LANGUAGE,
- TRANSLATE_AUDIO_TO,
- min_speakers,
- max_speakers,
- tts_voice00,
- tts_voice01,
- tts_voice02,
- tts_voice03,
- tts_voice04,
- tts_voice05,
- tts_voice06,
- tts_voice07,
- tts_voice08,
- tts_voice09,
- tts_voice10,
- tts_voice11,
- VIDEO_OUTPUT_NAME,
- AUDIO_MIX,
- audio_accelerate,
- acceleration_rate_regulation_gui,
- volume_original_mix,
- volume_translated_mix,
- sub_type_output,
- edit_sub_check, # TRUE BY DEFAULT
- dummy_false_check, # dummy false
- subs_edit_space,
- avoid_overlap_gui,
- vocal_refinement_gui,
- literalize_numbers_gui,
- segment_duration_limit_gui,
- diarization_process_dropdown,
- translate_process_dropdown,
- input_srt,
- main_output_type,
- main_voiceless_track,
- voice_imitation_gui,
- voice_imitation_max_segments_gui,
- voice_imitation_vocals_dereverb_gui,
- voice_imitation_remove_previous_gui,
- voice_imitation_method_gui,
- wav_speaker_dereverb,
- text_segmentation_scale_gui,
- divide_text_segments_by_gui,
- soft_subtitles_to_video_gui,
- burn_subtitles_to_video_gui,
- enable_cache_gui,
- enable_custom_voice,
- workers_custom_voice,
- is_gui_dummy_check,
- ],
- outputs=subs_edit_space,
- ).then(
- play_sound_alert, [play_sound_gui], [sound_alert_notification]
- )
-
- # Run translate tts and complete
- video_button.click(
- SoniTr.batch_multilingual_media_conversion,
- inputs=[
- video_input,
- blink_input,
- directory_input,
- HFKEY,
- PREVIEW,
- WHISPER_MODEL_SIZE,
- batch_size,
- compute_type,
- SOURCE_LANGUAGE,
- TRANSLATE_AUDIO_TO,
- min_speakers,
- max_speakers,
- tts_voice00,
- tts_voice01,
- tts_voice02,
- tts_voice03,
- tts_voice04,
- tts_voice05,
- tts_voice06,
- tts_voice07,
- tts_voice08,
- tts_voice09,
- tts_voice10,
- tts_voice11,
- VIDEO_OUTPUT_NAME,
- AUDIO_MIX,
- audio_accelerate,
- acceleration_rate_regulation_gui,
- volume_original_mix,
- volume_translated_mix,
- sub_type_output,
- dummy_false_check,
- edit_sub_check,
- subs_edit_space,
- avoid_overlap_gui,
- vocal_refinement_gui,
- literalize_numbers_gui,
- segment_duration_limit_gui,
- diarization_process_dropdown,
- translate_process_dropdown,
- input_srt,
- main_output_type,
- main_voiceless_track,
- voice_imitation_gui,
- voice_imitation_max_segments_gui,
- voice_imitation_vocals_dereverb_gui,
- voice_imitation_remove_previous_gui,
- voice_imitation_method_gui,
- wav_speaker_dereverb,
- text_segmentation_scale_gui,
- divide_text_segments_by_gui,
- soft_subtitles_to_video_gui,
- burn_subtitles_to_video_gui,
- enable_cache_gui,
- enable_custom_voice,
- workers_custom_voice,
- is_gui_dummy_check,
- ],
- outputs=video_output,
- trigger_mode="multiple",
- ).then(
- play_sound_alert, [play_sound_gui], [sound_alert_notification]
- )
-
- # Run docs process
- docs_button.click(
- SoniTr.multilingual_docs_conversion,
- inputs=[
- text_docs,
- input_docs,
- directory_input_docs,
- docs_SOURCE_LANGUAGE,
- docs_TRANSLATE_TO,
- tts_documents,
- docs_OUTPUT_NAME,
- docs_translate_process_dropdown,
- docs_output_type,
- docs_chunk_size,
- enable_custom_voice,
- workers_custom_voice,
- start_page_gui,
- end_page_gui,
- videobook_width_gui,
- videobook_height_gui,
- videobook_bcolor_gui,
- docs_dummy_check,
- ],
- outputs=docs_output,
- trigger_mode="multiple",
- ).then(
- play_sound_alert, [play_sound_gui], [sound_alert_notification]
- )
-
- return app
-
-
-def get_language_config(language_data, language=None, base_key="english"):
- base_lang = language_data.get(base_key)
-
- if language not in language_data:
- logger.error(
- f"Language {language} not found, defaulting to {base_key}"
- )
- return base_lang
-
- lg_conf = language_data.get(language, {})
- lg_conf.update((k, v) for k, v in base_lang.items() if k not in lg_conf)
-
- return lg_conf
-
-
-def create_parser():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
- parser.add_argument(
- "--theme",
- type=str,
- default="Taithrah/Minimal",
- help=(
- "Specify the theme; find themes in "
- "https://huggingface.co/spaces/gradio/theme-gallery;"
- " Example: --theme aliabid94/new-theme"
- ),
- )
- parser.add_argument(
- "--public_url",
- action="store_true",
- default=False,
- help="Enable public link",
- )
- parser.add_argument(
- "--logs_in_gui",
- action="store_true",
- default=False,
- help="Displays the operations performed in Logs",
- )
- parser.add_argument(
- "--verbosity_level",
- type=str,
- default="info",
- help=(
- "Set logger verbosity level: "
- "debug, info, warning, error, or critical"
- ),
- )
- parser.add_argument(
- "--language",
- type=str,
- default="english",
- help=" Select the language of the interface: english, spanish",
- )
- parser.add_argument(
- "--cpu_mode",
- action="store_true",
- default=False,
- help="Enable CPU mode to run the program without utilizing GPU acceleration.",
- )
- return parser
-
-
-if __name__ == "__main__":
-
- parser = create_parser()
-
- args = parser.parse_args()
- # Simulating command-line arguments
- # args_list = "--theme aliabid94/new-theme --public_url".split()
- # args = parser.parse_args(args_list)
-
- set_logging_level(args.verbosity_level)
-
- for id_model in UVR_MODELS:
- download_manager(
- os.path.join(MDX_DOWNLOAD_LINK, id_model), mdxnet_models_dir
- )
-
- models_path, index_path = upload_model_list()
-
- SoniTr = SoniTranslate(cpu_mode=args.cpu_mode if os.environ.get("ZERO_GPU") != "TRUE" else "cpu")
-
- lg_conf = get_language_config(language_data, language=args.language)
-
- app = create_gui(args.theme, logs_in_gui=args.logs_in_gui)
-
- app.queue()
-
- app.launch(
- max_threads=1,
- share=args.public_url,
- show_error=True,
- quiet=False,
- debug=(True if logger.isEnabledFor(logging.DEBUG) else False),
- )
+import gradio as gr
+from soni_translate.logging_setup import (
+ logger,
+ set_logging_level,
+ configure_logging_libs,
+); configure_logging_libs() # noqa
+import whisperx
+import torch
+import os
+from soni_translate.audio_segments import create_translated_audio
+from soni_translate.text_to_speech import (
+ audio_segmentation_to_voice,
+ edge_tts_voices_list,
+ coqui_xtts_voices_list,
+ piper_tts_voices_list,
+ create_wav_file_vc,
+ accelerate_segments,
+)
+from soni_translate.translate_segments import (
+ translate_text,
+ TRANSLATION_PROCESS_OPTIONS,
+ DOCS_TRANSLATION_PROCESS_OPTIONS
+)
+from soni_translate.preprocessor import (
+ audio_video_preprocessor,
+ audio_preprocessor,
+)
+from soni_translate.postprocessor import (
+ OUTPUT_TYPE_OPTIONS,
+ DOCS_OUTPUT_TYPE_OPTIONS,
+ sound_separate,
+ get_no_ext_filename,
+ media_out,
+ get_subtitle_speaker,
+)
+from soni_translate.language_configuration import (
+ LANGUAGES,
+ UNIDIRECTIONAL_L_LIST,
+ LANGUAGES_LIST,
+ BARK_VOICES_LIST,
+ VITS_VOICES_LIST,
+ OPENAI_TTS_MODELS,
+)
+from soni_translate.utils import (
+ remove_files,
+ download_list,
+ upload_model_list,
+ download_manager,
+ run_command,
+ is_audio_file,
+ is_subtitle_file,
+ copy_files,
+ get_valid_files,
+ get_link_list,
+ remove_directory_contents,
+)
+from soni_translate.mdx_net import (
+ UVR_MODELS,
+ MDX_DOWNLOAD_LINK,
+ mdxnet_models_dir,
+)
+from soni_translate.speech_segmentation import (
+ ASR_MODEL_OPTIONS,
+ COMPUTE_TYPE_GPU,
+ COMPUTE_TYPE_CPU,
+ find_whisper_models,
+ transcribe_speech,
+ align_speech,
+ diarize_speech,
+ diarization_models,
+)
+from soni_translate.text_multiformat_processor import (
+ BORDER_COLORS,
+ srt_file_to_segments,
+ document_preprocessor,
+ determine_chunk_size,
+ plain_text_to_segments,
+ segments_to_plain_text,
+ process_subtitles,
+ linguistic_level_segments,
+ break_aling_segments,
+ doc_to_txtximg_pages,
+ page_data_to_segments,
+ update_page_data,
+ fix_timestamps_docs,
+ create_video_from_images,
+ merge_video_and_audio,
+)
+from soni_translate.languages_gui import language_data, news
+import copy
+import logging
+import json
+from pydub import AudioSegment
+from voice_main import ClassVoices
+import argparse
+import time
+import hashlib
+import sys
+
+directories = [
+ "downloads",
+ "logs",
+ "weights",
+ "clean_song_output",
+ "_XTTS_",
+ f"audio2{os.sep}audio",
+ "audio",
+ "outputs",
+]
+for directory in directories:
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+
+class TTS_Info:
+ def __init__(self, piper_enabled, xtts_enabled):
+ self.list_edge = edge_tts_voices_list()
+ self.list_bark = list(BARK_VOICES_LIST.keys())
+ self.list_vits = list(VITS_VOICES_LIST.keys())
+ self.list_openai_tts = OPENAI_TTS_MODELS
+ self.piper_enabled = piper_enabled
+ self.list_vits_onnx = (
+ piper_tts_voices_list() if self.piper_enabled else []
+ )
+ self.xtts_enabled = xtts_enabled
+
+ def tts_list(self):
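+        """Return every enabled TTS voice, with Coqui XTTS entries first."""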
+ self.list_coqui_xtts = (
+ coqui_xtts_voices_list() if self.xtts_enabled else []
+ )
+ list_tts = self.list_coqui_xtts + sorted(
+ self.list_edge
+ + self.list_bark
+ + self.list_vits
+ + self.list_openai_tts
+ + self.list_vits_onnx
+ )
+ return list_tts
+
+
+def prog_disp(msg, percent, is_gui, progress=None):
+ logger.info(msg)
+ if is_gui:
+ progress(percent, desc=msg)
+
+
+def warn_disp(wrn_lang, is_gui):
+ logger.warning(wrn_lang)
+ if is_gui:
+ gr.Warning(wrn_lang)
+
+
+class SoniTrCache:
+ def __init__(self):
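+        # Keys are the ordered pipeline stages; invalidating one stage
+        # also flushes every stage after it (see task_in_cache).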
+ self.cache = {
+ 'media': [[]],
+ 'refine_vocals': [],
+ 'transcript_align': [],
+ 'break_align': [],
+ 'diarize': [],
+ 'translate': [],
+ 'subs_and_edit': [],
+ 'tts': [],
+ 'acc_and_vc': [],
+ 'mix_aud': [],
+ 'output': []
+ }
+
+ self.cache_data = {
+ 'media': [],
+ 'refine_vocals': [],
+ 'transcript_align': [],
+ 'break_align': [],
+ 'diarize': [],
+ 'translate': [],
+ 'subs_and_edit': [],
+ 'tts': [],
+ 'acc_and_vc': [],
+ 'mix_aud': [],
+ 'output': []
+ }
+
+ self.cache_keys = list(self.cache.keys())
+ self.first_task = self.cache_keys[0]
+ self.last_task = self.cache_keys[-1]
+
+ self.pre_step = None
+ self.pre_params = []
+
+ def set_variable(self, variable_name, value):
+ setattr(self, variable_name, value)
+
+ def task_in_cache(self, step: str, params: list, previous_step_data: dict):
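+        """Commit the previous step's results, then return True and restore
+        this step's cached outputs when `params` match the cached entry;
+        otherwise flush this step and all later steps and return False."""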
+
+ self.pre_step_cache = None
+
+ if step == self.first_task:
+ self.pre_step = None
+
+ if self.pre_step:
+ self.cache[self.pre_step] = self.pre_params
+
+ # Fill data in cache
+ self.cache_data[self.pre_step] = copy.deepcopy(previous_step_data)
+
+ self.pre_params = params
+ # logger.debug(f"Step: {str(step)}, Cache params: {str(self.cache)}")
+ if params == self.cache[step]:
+ logger.debug(f"In cache: {str(step)}")
+
+            # Restore this step's stored outputs from cache_data so the
+            # step itself can be skipped.
+ for key, value in self.cache_data[step].items():
+ self.set_variable(key, copy.deepcopy(value))
+ logger.debug(
+                    f"Cache load: {str(key)}"
+ )
+
+ self.pre_step = step
+ return True
+
+ else:
+            logger.debug(f"Flushing later steps and caching {str(step)}")
+ selected_index = self.cache_keys.index(step)
+
+ for idx, key in enumerate(self.cache.keys()):
+ if idx >= selected_index:
+ self.cache[key] = []
+ self.cache_data[key] = {}
+
+            # The current step becomes the previous one for the next call
+ self.pre_step = step
+ return False
+
+ def clear_cache(self, media, force=False):
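+        """Flush all cached steps when `media` differs from the cached
+        media entry or when `force` is set."""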
+
+ self.cache["media"] = (
+ self.cache["media"] if len(self.cache["media"]) else [[]]
+ )
+
+ if media != self.cache["media"][0] or force:
+
+ # Clear cache
+ self.cache = {key: [] for key in self.cache}
+ self.cache["media"] = [[]]
+
+ logger.info("Cache flushed")
+
+
+def get_hash(filepath):
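+    """Hash the file in 8 KiB chunks with BLAKE2b and return the first
+    18 hex digits of the digest, used as a lightweight cache key."""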
+ with open(filepath, 'rb') as f:
+ file_hash = hashlib.blake2b()
+ while chunk := f.read(8192):
+ file_hash.update(chunk)
+
+ return file_hash.hexdigest()[:18]
+
+
+def check_openai_api_key():
+ if not os.environ.get("OPENAI_API_KEY"):
+ raise ValueError(
+            "To use GPT for translation, set the OPENAI_API_KEY "
+            "environment variable (e.g. on Linux: "
+            "export OPENAI_API_KEY='your-api-key-here'), or choose a "
+            "different translation process in Advanced settings."
+ )
+
+
+class SoniTranslate(SoniTrCache):
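+    """Top-level translation and dubbing pipeline, with step-level
+    caching inherited from SoniTrCache."""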
+ def __init__(self, cpu_mode=False):
+ super().__init__()
+ if cpu_mode:
+ os.environ["SONITR_DEVICE"] = "cpu"
+ else:
+ os.environ["SONITR_DEVICE"] = (
+ "cuda" if torch.cuda.is_available() else "cpu"
+ )
+
+ self.device = os.environ.get("SONITR_DEVICE")
+ self.result_diarize = None
+ self.align_language = None
+ self.result_source_lang = None
+ self.edit_subs_complete = False
+ self.voiceless_id = None
+ self.burn_subs_id = None
+
+ self.vci = ClassVoices(only_cpu=cpu_mode)
+
+ self.tts_voices = self.get_tts_voice_list()
+
+ logger.info(f"Working in: {self.device}")
+
+ def get_tts_voice_list(self):
+ try:
+ from piper import PiperVoice # noqa
+
+ piper_enabled = True
+ logger.info("PIPER TTS enabled")
+ except Exception as error:
+ logger.debug(str(error))
+ piper_enabled = False
+ logger.info("PIPER TTS disabled")
+ try:
+ from TTS.api import TTS # noqa
+
+ xtts_enabled = True
+ logger.info("Coqui XTTS enabled")
+ logger.info(
+ "In this app, by using Coqui TTS (text-to-speech), you "
+ "acknowledge and agree to the license.\n"
+ "You confirm that you have read, understood, and agreed "
+ "to the Terms and Conditions specified at the following "
+ "link:\nhttps://coqui.ai/cpml.txt."
+ )
+ os.environ["COQUI_TOS_AGREED"] = "1"
+ except Exception as error:
+ logger.debug(str(error))
+ xtts_enabled = False
+ logger.info("Coqui XTTS disabled")
+
+ self.tts_info = TTS_Info(piper_enabled, xtts_enabled)
+
+ return self.tts_info.tts_list()
+
+ def batch_multilingual_media_conversion(self, *kwargs):
+ # logger.debug(str(kwargs))
+
+ media_file_arg = kwargs[0] if kwargs[0] is not None else []
+
+ link_media_arg = kwargs[1]
+ link_media_arg = [x.strip() for x in link_media_arg.split(',')]
+ link_media_arg = get_link_list(link_media_arg)
+
+ path_arg = kwargs[2]
+ path_arg = [x.strip() for x in path_arg.split(',')]
+ path_arg = get_valid_files(path_arg)
+
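+        # Positional indices mirror the GUI input list wired to this
+        # callback: 31 and 32 carry the get_translated_text and
+        # get_video_from_text_json flags of multilingual_media_conversion.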
+ edit_text_arg = kwargs[31]
+ get_text_arg = kwargs[32]
+
+ is_gui_arg = kwargs[-1]
+
+ kwargs = kwargs[3:]
+
+ media_batch = media_file_arg + link_media_arg + path_arg
+ media_batch = list(filter(lambda x: x != "", media_batch))
+ media_batch = media_batch if media_batch else [None]
+ logger.debug(str(media_batch))
+
+ remove_directory_contents("outputs")
+
+ if edit_text_arg or get_text_arg:
+ return self.multilingual_media_conversion(
+ media_batch[0], "", "", *kwargs
+ )
+
+ if "SET_LIMIT" == os.getenv("DEMO"):
+ media_batch = [media_batch[0]]
+
+ result = []
+ for media in media_batch:
+ # Call the nested function with the parameters
+ output_file = self.multilingual_media_conversion(
+ media, "", "", *kwargs
+ )
+
+ if isinstance(output_file, str):
+ output_file = [output_file]
+ result.extend(output_file)
+
+ if is_gui_arg and len(media_batch) > 1:
+ gr.Info(f"Done: {os.path.basename(output_file[0])}")
+
+ return result
+
+ def multilingual_media_conversion(
+ self,
+ media_file=None,
+ link_media="",
+ directory_input="",
+ YOUR_HF_TOKEN="",
+ preview=False,
+ transcriber_model="large-v3",
+ batch_size=4,
+ compute_type="auto",
+ origin_language="Automatic detection",
+ target_language="English (en)",
+ min_speakers=1,
+ max_speakers=1,
+ tts_voice00="en-US-EmmaMultilingualNeural-Female",
+ tts_voice01="en-US-AndrewMultilingualNeural-Male",
+ tts_voice02="en-US-AvaMultilingualNeural-Female",
+ tts_voice03="en-US-BrianMultilingualNeural-Male",
+ tts_voice04="de-DE-SeraphinaMultilingualNeural-Female",
+ tts_voice05="de-DE-FlorianMultilingualNeural-Male",
+ tts_voice06="fr-FR-VivienneMultilingualNeural-Female",
+ tts_voice07="fr-FR-RemyMultilingualNeural-Male",
+ tts_voice08="en-US-EmmaMultilingualNeural-Female",
+ tts_voice09="en-US-AndrewMultilingualNeural-Male",
+ tts_voice10="en-US-EmmaMultilingualNeural-Female",
+ tts_voice11="en-US-AndrewMultilingualNeural-Male",
+ video_output_name="",
+ mix_method_audio="Adjusting volumes and mixing audio",
+ max_accelerate_audio=2.1,
+ acceleration_rate_regulation=False,
+ volume_original_audio=0.25,
+ volume_translated_audio=1.80,
+ output_format_subtitle="srt",
+ get_translated_text=False,
+ get_video_from_text_json=False,
+ text_json="{}",
+ avoid_overlap=False,
+ vocal_refinement=False,
+ literalize_numbers=True,
+ segment_duration_limit=15,
+ diarization_model="pyannote_2.1",
+ translate_process="google_translator_batch",
+ subtitle_file=None,
+ output_type="video (mp4)",
+ voiceless_track=False,
+ voice_imitation=False,
+ voice_imitation_max_segments=3,
+ voice_imitation_vocals_dereverb=False,
+ voice_imitation_remove_previous=True,
+ voice_imitation_method="freevc",
+ dereverb_automatic_xtts=True,
+ text_segmentation_scale="sentence",
+ divide_text_segments_by="",
+ soft_subtitles_to_video=True,
+ burn_subtitles_to_video=False,
+ enable_cache=True,
+ custom_voices=False,
+ custom_voices_workers=1,
+ is_gui=False,
+ progress=gr.Progress(),
+ ):
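+        """Run the full dubbing pipeline for one media item: preprocess,
+        transcribe, align, diarize, translate, synthesize speech, mix,
+        and mux. Returns the requested output path(s), or the editable
+        JSON string when `get_translated_text` is set."""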
+ if not YOUR_HF_TOKEN:
+ YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")
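+        # A Hugging Face token is only required when diarization runs
+        # for more than one speaker.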
+ if diarization_model == "disable" or max_speakers == 1:
+ if YOUR_HF_TOKEN is None:
+ YOUR_HF_TOKEN = ""
+ elif not YOUR_HF_TOKEN:
+ raise ValueError("No valid Hugging Face token")
+ else:
+ os.environ["YOUR_HF_TOKEN"] = YOUR_HF_TOKEN
+
+ if (
+ "gpt" in translate_process
+ or transcriber_model == "OpenAI_API_Whisper"
+ or "OpenAI-TTS" in tts_voice00
+ ):
+ check_openai_api_key()
+
+ if media_file is None:
+ media_file = (
+ directory_input
+ if os.path.exists(directory_input)
+ else link_media
+ )
+ media_file = (
+ media_file if isinstance(media_file, str) else media_file.name
+ )
+
+ if is_subtitle_file(media_file):
+ subtitle_file = media_file
+ media_file = ""
+
+ if media_file is None:
+ media_file = ""
+
+ if not origin_language:
+ origin_language = "Automatic detection"
+
+ if origin_language in UNIDIRECTIONAL_L_LIST and not subtitle_file:
+ raise ValueError(
+ f"The language '{origin_language}' "
+ "is not supported for transcription (ASR)."
+ )
+
+ if get_translated_text:
+ self.edit_subs_complete = False
+ if get_video_from_text_json:
+ if not self.edit_subs_complete:
+ raise ValueError("Generate the transcription first.")
+
+ if (
+ ("sound" in output_type or output_type == "raw media")
+ and (get_translated_text or get_video_from_text_json)
+ ):
+ raise ValueError(
+ "Please disable 'edit generate subtitles' "
+ f"first to acquire the {output_type}."
+ )
+
+ TRANSLATE_AUDIO_TO = LANGUAGES[target_language]
+ SOURCE_LANGUAGE = LANGUAGES[origin_language]
+
+ if (
+ transcriber_model == "OpenAI_API_Whisper"
+ and SOURCE_LANGUAGE == "zh-TW"
+ ):
+ logger.warning(
+ "OpenAI API Whisper only supports Chinese (Simplified)."
+ )
+ SOURCE_LANGUAGE = "zh"
+
+ if (
+ text_segmentation_scale in ["word", "character"]
+ and "subtitle" not in output_type
+ ):
+ wrn_lang = (
+ "Text segmentation by words or characters is typically"
+ " used for generating subtitles. If subtitles are not the"
+                " intended output, consider selecting the 'sentence'"
+                " segmentation method to ensure optimal results."
+            )
+ warn_disp(wrn_lang, is_gui)
+
+ if tts_voice00[:2].lower() != TRANSLATE_AUDIO_TO[:2].lower():
+ wrn_lang = (
+ "Make sure to select a 'TTS Speaker' suitable for"
+ " the translation language to avoid errors with the TTS."
+ )
+ warn_disp(wrn_lang, is_gui)
+
+ if "_XTTS_" in tts_voice00 and voice_imitation:
+ wrn_lang = (
+ "When you select XTTS, it is advisable "
+ "to disable Voice Imitation."
+ )
+ warn_disp(wrn_lang, is_gui)
+
+ if custom_voices and voice_imitation:
+ wrn_lang = (
+ "When you use R.V.C. models, it is advisable"
+ " to disable Voice Imitation."
+ )
+ warn_disp(wrn_lang, is_gui)
+
+ if not media_file and not subtitle_file:
+ raise ValueError(
+                "Specify a media or SRT file in advanced settings"
+ )
+
+ if subtitle_file:
+ subtitle_file = (
+ subtitle_file
+ if isinstance(subtitle_file, str)
+ else subtitle_file.name
+ )
+
+ if subtitle_file and SOURCE_LANGUAGE == "Automatic detection":
+ raise Exception(
+ "To use an SRT file, you need to specify its "
+ "original language (Source language)"
+ )
+
+ if not media_file and subtitle_file:
+ diarization_model = "disable"
+ media_file = "audio_support.wav"
+ if not get_video_from_text_json:
+ remove_files(media_file)
+ srt_data = srt_file_to_segments(subtitle_file)
+ total_duration = srt_data["segments"][-1]["end"] + 30.
+ support_audio = AudioSegment.silent(
+ duration=int(total_duration * 1000)
+ )
+ support_audio.export(
+ media_file, format="wav"
+ )
+                logger.info("Created supporting audio for the SRT file.")
+
+ if "SET_LIMIT" == os.getenv("DEMO"):
+ preview = True
+ mix_method_audio = "Adjusting volumes and mixing audio"
+ transcriber_model = "medium"
+ logger.info(
+ "DEMO; set preview=True; Generation is limited to "
+ "10 seconds to prevent CPU errors. No limitations with GPU.\n"
+ "DEMO; set Adjusting volumes and mixing audio\n"
+ "DEMO; set whisper model to medium"
+ )
+
+        # On CPU, fall back to a supported compute type
+ if self.device == "cpu" and compute_type not in COMPUTE_TYPE_CPU:
+ logger.info("Compute type changed to float32")
+ compute_type = "float32"
+
+ base_video_file = "Video.mp4"
+ base_audio_wav = "audio.wav"
+ dub_audio_file = "audio_dub_solo.ogg"
+ vocals_audio_file = "audio_Vocals_DeReverb.wav"
+ voiceless_audio_file = "audio_Voiceless.wav"
+ mix_audio_file = "audio_mix.mp3"
+ vid_subs = "video_subs_file.mp4"
+ video_output_file = "video_dub.mp4"
+
+ if os.path.exists(media_file):
+ media_base_hash = get_hash(media_file)
+ else:
+ media_base_hash = media_file
+ self.clear_cache(media_base_hash, force=(not enable_cache))
+
+ if not get_video_from_text_json:
+ self.result_diarize = (
+ self.align_language
+ ) = self.result_source_lang = None
+ if not self.task_in_cache("media", [media_base_hash, preview], {}):
+ if is_audio_file(media_file):
+ prog_disp(
+ "Processing audio...", 0.15, is_gui, progress=progress
+ )
+ audio_preprocessor(preview, media_file, base_audio_wav)
+ else:
+ prog_disp(
+ "Processing video...", 0.15, is_gui, progress=progress
+ )
+ audio_video_preprocessor(
+ preview, media_file, base_video_file, base_audio_wav
+ )
+ logger.debug("Set file complete.")
+
+ if "sound" in output_type:
+ prog_disp(
+ "Separating sounds in the file...",
+ 0.50,
+ is_gui,
+ progress=progress
+ )
+ separate_out = sound_separate(base_audio_wav, output_type)
+ final_outputs = []
+ for out in separate_out:
+ final_name = media_out(
+ media_file,
+ f"{get_no_ext_filename(out)}",
+ video_output_name,
+ "wav",
+ file_obj=out,
+ )
+ final_outputs.append(final_name)
+ logger.info(f"Done: {str(final_outputs)}")
+ return final_outputs
+
+ if output_type == "raw media":
+ output = media_out(
+ media_file,
+ "raw_media",
+ video_output_name,
+ "wav" if is_audio_file(media_file) else "mp4",
+ file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file,
+ )
+ logger.info(f"Done: {output}")
+ return output
+
+ if not self.task_in_cache("refine_vocals", [vocal_refinement], {}):
+ self.vocals = None
+ if vocal_refinement:
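+                # Extract dereverbed vocals with a UVR model so transcription
+                # and diarization run on cleaner speech.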
+ try:
+ from soni_translate.mdx_net import process_uvr_task
+ _, _, _, _, file_vocals = process_uvr_task(
+ orig_song_path=base_audio_wav,
+ main_vocals=False,
+ dereverb=True,
+ remove_files_output_dir=True,
+ )
+ remove_files(vocals_audio_file)
+ copy_files(file_vocals, ".")
+ self.vocals = vocals_audio_file
+ except Exception as error:
+ logger.error(str(error))
+
+ if not self.task_in_cache("transcript_align", [
+ subtitle_file,
+ SOURCE_LANGUAGE,
+ transcriber_model,
+ compute_type,
+ batch_size,
+ literalize_numbers,
+ segment_duration_limit,
+ (
+ "l_unit"
+ if text_segmentation_scale in ["word", "character"]
+ and subtitle_file
+ else "sentence"
+ )
+ ], {"vocals": self.vocals}):
+ if subtitle_file:
+ prog_disp(
+ "From SRT file...", 0.30, is_gui, progress=progress
+ )
+ audio = whisperx.load_audio(
+ base_audio_wav if not self.vocals else self.vocals
+ )
+ self.result = srt_file_to_segments(subtitle_file)
+ self.result["language"] = SOURCE_LANGUAGE
+ else:
+ prog_disp(
+ "Transcribing...", 0.30, is_gui, progress=progress
+ )
+ SOURCE_LANGUAGE = (
+ None
+ if SOURCE_LANGUAGE == "Automatic detection"
+ else SOURCE_LANGUAGE
+ )
+ audio, self.result = transcribe_speech(
+ base_audio_wav if not self.vocals else self.vocals,
+ transcriber_model,
+ compute_type,
+ batch_size,
+ SOURCE_LANGUAGE,
+ literalize_numbers,
+ segment_duration_limit,
+ )
+ logger.debug(
+ "Transcript complete, "
+ f"segments count {len(self.result['segments'])}"
+ )
+
+ self.align_language = self.result["language"]
+ if (
+ not subtitle_file
+ or text_segmentation_scale in ["word", "character"]
+ ):
+ prog_disp("Aligning...", 0.45, is_gui, progress=progress)
+ try:
+ if self.align_language in ["vi"]:
+ logger.info(
+ "Deficient alignment for the "
+ f"{self.align_language} language, skipping the"
+ " process. It is suggested to reduce the "
+ "duration of the segments as an alternative."
+ )
+ else:
+ self.result = align_speech(audio, self.result)
+ logger.debug(
+ "Align complete, "
+ f"segments count {len(self.result['segments'])}"
+ )
+ except Exception as error:
+ logger.error(str(error))
+
+ if self.result["segments"] == []:
+ raise ValueError("No active speech found in audio")
+
+ if not self.task_in_cache("break_align", [
+ divide_text_segments_by,
+ text_segmentation_scale,
+ self.align_language
+ ], {
+ "result": self.result,
+ "align_language": self.align_language
+ }):
+ if self.align_language in ["ja", "zh", "zh-TW"]:
+ divide_text_segments_by += "|!|?|...|。"
+ if text_segmentation_scale in ["word", "character"]:
+ self.result = linguistic_level_segments(
+ self.result,
+ text_segmentation_scale,
+ )
+ elif divide_text_segments_by:
+ try:
+ self.result = break_aling_segments(
+ self.result,
+ break_characters=divide_text_segments_by,
+ )
+ except Exception as error:
+ logger.error(str(error))
+
+ if not self.task_in_cache("diarize", [
+ min_speakers,
+ max_speakers,
+ YOUR_HF_TOKEN[:len(YOUR_HF_TOKEN)//2],
+ diarization_model
+ ], {
+ "result": self.result
+ }):
+ prog_disp("Diarizing...", 0.60, is_gui, progress=progress)
+ diarize_model_select = diarization_models[diarization_model]
+ self.result_diarize = diarize_speech(
+ base_audio_wav if not self.vocals else self.vocals,
+ self.result,
+ min_speakers,
+ max_speakers,
+ YOUR_HF_TOKEN,
+ diarize_model_select,
+ )
+ logger.debug("Diarize complete")
+ self.result_source_lang = copy.deepcopy(self.result_diarize)
+
+ if not self.task_in_cache("translate", [
+ TRANSLATE_AUDIO_TO,
+ translate_process
+ ], {
+ "result_diarize": self.result_diarize
+ }):
+ prog_disp("Translating...", 0.70, is_gui, progress=progress)
+ lang_source = (
+ self.align_language
+ if self.align_language
+ else SOURCE_LANGUAGE
+ )
+ self.result_diarize["segments"] = translate_text(
+ self.result_diarize["segments"],
+ TRANSLATE_AUDIO_TO,
+ translate_process,
+ chunk_size=1800,
+ source=lang_source,
+ )
+ logger.debug("Translation complete")
+ logger.debug(self.result_diarize)
+
+ if get_translated_text:
+
+ json_data = []
+ for segment in self.result_diarize["segments"]:
+ start = segment["start"]
+ text = segment["text"]
+ speaker = int(segment.get("speaker", "SPEAKER_00")[-2:]) + 1
+ json_data.append(
+ {"start": start, "text": text, "speaker": speaker}
+ )
+
+ # Convert list of dictionaries to a JSON string with indentation
+ json_string = json.dumps(json_data, indent=2)
+ logger.info("Done")
+ self.edit_subs_complete = True
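+            # json.dumps escapes non-ASCII as \uXXXX; round-tripping through
+            # unicode_escape makes the text readable in the GUI editor.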
+ return json_string.encode().decode("unicode_escape")
+
+ if get_video_from_text_json:
+
+ if self.result_diarize is None:
+ raise ValueError("Generate the transcription first.")
+ # with open('text_json.json', 'r') as file:
+ text_json_loaded = json.loads(text_json)
+ for i, segment in enumerate(self.result_diarize["segments"]):
+ segment["text"] = text_json_loaded[i]["text"]
+ segment["speaker"] = "SPEAKER_{:02d}".format(
+ int(text_json_loaded[i]["speaker"]) - 1
+ )
+
+ # Write subtitle
+ if not self.task_in_cache("subs_and_edit", [
+ copy.deepcopy(self.result_diarize),
+ output_format_subtitle,
+ TRANSLATE_AUDIO_TO
+ ], {
+ "result_diarize": self.result_diarize
+ }):
+ if output_format_subtitle == "disable":
+ self.sub_file = "sub_tra.srt"
+ elif output_format_subtitle != "ass":
+ self.sub_file = process_subtitles(
+ self.result_source_lang,
+ self.align_language,
+ self.result_diarize,
+ output_format_subtitle,
+ TRANSLATE_AUDIO_TO,
+ )
+
+            # An SRT version is also needed by later steps (ass conversion
+            # and subtitle burning read sub_tra.srt)
+ if output_format_subtitle != "srt":
+ _ = process_subtitles(
+ self.result_source_lang,
+ self.align_language,
+ self.result_diarize,
+ "srt",
+ TRANSLATE_AUDIO_TO,
+ )
+
+ if output_format_subtitle == "ass":
+ convert_ori = "ffmpeg -i sub_ori.srt sub_ori.ass -y"
+ convert_tra = "ffmpeg -i sub_tra.srt sub_tra.ass -y"
+ self.sub_file = "sub_tra.ass"
+ run_command(convert_ori)
+ run_command(convert_tra)
+
+ format_sub = (
+ output_format_subtitle
+ if output_format_subtitle != "disable"
+ else "srt"
+ )
+
+ if output_type == "subtitle":
+
+ out_subs = []
+ tra_subs = media_out(
+ media_file,
+ TRANSLATE_AUDIO_TO,
+ video_output_name,
+ format_sub,
+ file_obj=self.sub_file,
+ )
+ out_subs.append(tra_subs)
+
+ ori_subs = media_out(
+ media_file,
+ self.align_language,
+ video_output_name,
+ format_sub,
+ file_obj=f"sub_ori.{format_sub}",
+ )
+ out_subs.append(ori_subs)
+ logger.info(f"Done: {out_subs}")
+ return out_subs
+
+ if output_type == "subtitle [by speaker]":
+ output = get_subtitle_speaker(
+ media_file,
+ result=self.result_diarize,
+ language=TRANSLATE_AUDIO_TO,
+ extension=format_sub,
+ base_name=video_output_name,
+ )
+ logger.info(f"Done: {str(output)}")
+ return output
+
+ if "video [subtitled]" in output_type:
+ output = media_out(
+ media_file,
+ TRANSLATE_AUDIO_TO + "_subtitled",
+ video_output_name,
+ "wav" if is_audio_file(media_file) else (
+ "mkv" if "mkv" in output_type else "mp4"
+ ),
+ file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file,
+ soft_subtitles=False if is_audio_file(media_file) else True,
+ subtitle_files=output_format_subtitle,
+ )
+ msg_out = output[0] if isinstance(output, list) else output
+ logger.info(f"Done: {msg_out}")
+ return output
+
+ if not self.task_in_cache("tts", [
+ TRANSLATE_AUDIO_TO,
+ tts_voice00,
+ tts_voice01,
+ tts_voice02,
+ tts_voice03,
+ tts_voice04,
+ tts_voice05,
+ tts_voice06,
+ tts_voice07,
+ tts_voice08,
+ tts_voice09,
+ tts_voice10,
+ tts_voice11,
+ dereverb_automatic_xtts
+ ], {
+ "sub_file": self.sub_file
+ }):
+ prog_disp("Text to speech...", 0.80, is_gui, progress=progress)
+ self.valid_speakers = audio_segmentation_to_voice(
+ self.result_diarize,
+ TRANSLATE_AUDIO_TO,
+ is_gui,
+ tts_voice00,
+ tts_voice01,
+ tts_voice02,
+ tts_voice03,
+ tts_voice04,
+ tts_voice05,
+ tts_voice06,
+ tts_voice07,
+ tts_voice08,
+ tts_voice09,
+ tts_voice10,
+ tts_voice11,
+ dereverb_automatic_xtts,
+ )
+
+ if not self.task_in_cache("acc_and_vc", [
+ max_accelerate_audio,
+ acceleration_rate_regulation,
+ voice_imitation,
+ voice_imitation_max_segments,
+ voice_imitation_remove_previous,
+ voice_imitation_vocals_dereverb,
+ voice_imitation_method,
+ custom_voices,
+ custom_voices_workers,
+ copy.deepcopy(self.vci.model_config),
+ avoid_overlap
+ ], {
+ "valid_speakers": self.valid_speakers
+ }):
+ audio_files, speakers_list = accelerate_segments(
+ self.result_diarize,
+ max_accelerate_audio,
+ self.valid_speakers,
+ acceleration_rate_regulation,
+ )
+
+ # Voice Imitation (Tone color converter)
+ if voice_imitation:
+ prog_disp(
+ "Voice Imitation...", 0.85, is_gui, progress=progress
+ )
+ from soni_translate.text_to_speech import toneconverter
+
+ try:
+ toneconverter(
+ copy.deepcopy(self.result_diarize),
+ voice_imitation_max_segments,
+ voice_imitation_remove_previous,
+ voice_imitation_vocals_dereverb,
+ voice_imitation_method,
+ )
+ except Exception as error:
+ logger.error(str(error))
+
+ # custom voice
+ if custom_voices:
+ prog_disp(
+ "Applying customized voices...",
+ 0.90,
+ is_gui,
+ progress=progress,
+ )
+
+ try:
+ self.vci(
+ audio_files,
+ speakers_list,
+ overwrite=True,
+ parallel_workers=custom_voices_workers,
+ )
+ self.vci.unload_models()
+ except Exception as error:
+ logger.error(str(error))
+
+ prog_disp(
+ "Creating final translated video...",
+ 0.95,
+ is_gui,
+ progress=progress,
+ )
+ remove_files(dub_audio_file)
+ create_translated_audio(
+ self.result_diarize,
+ audio_files,
+ dub_audio_file,
+ False,
+ avoid_overlap,
+ )
+
+        # Voiceless track: swap the base audio for a vocals-removed version
+ hash_base_audio_wav = get_hash(base_audio_wav)
+ if voiceless_track:
+ if self.voiceless_id != hash_base_audio_wav:
+ from soni_translate.mdx_net import process_uvr_task
+
+ try:
+ # voiceless_audio_file_dir = "clean_song_output/voiceless"
+ remove_files(voiceless_audio_file)
+ uvr_voiceless_audio_wav, _ = process_uvr_task(
+ orig_song_path=base_audio_wav,
+ song_id="voiceless",
+ only_voiceless=True,
+ remove_files_output_dir=False,
+ )
+ copy_files(uvr_voiceless_audio_wav, ".")
+ base_audio_wav = voiceless_audio_file
+ self.voiceless_id = hash_base_audio_wav
+
+ except Exception as error:
+ logger.error(str(error))
+ else:
+ base_audio_wav = voiceless_audio_file
+
+ if not self.task_in_cache("mix_aud", [
+ mix_method_audio,
+ volume_original_audio,
+ volume_translated_audio,
+ voiceless_track
+ ], {}):
+            # Mix the original and dubbed audio tracks
+ remove_files(mix_audio_file)
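+            # Two strategies: scale each track and blend them with amix, or
+            # duck the original audio under the dub via sidechaincompress.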
+ command_volume_mix = f'ffmpeg -y -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[0:0]volume={volume_original_audio}[a];[1:0]volume={volume_translated_audio}[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio_file}'
+ command_background_mix = f'ffmpeg -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio_file}'
+ if mix_method_audio == "Adjusting volumes and mixing audio":
+ # volume mix
+ run_command(command_volume_mix)
+ else:
+ try:
+ # background mix
+ run_command(command_background_mix)
+ except Exception as error_mix:
+ # volume mix except
+ logger.error(str(error_mix))
+ run_command(command_volume_mix)
+
+ if "audio" in output_type or is_audio_file(media_file):
+ output = media_out(
+ media_file,
+ TRANSLATE_AUDIO_TO,
+ video_output_name,
+ "wav" if "wav" in output_type else (
+ "ogg" if "ogg" in output_type else "mp3"
+ ),
+ file_obj=mix_audio_file,
+ subtitle_files=output_format_subtitle,
+ )
+ msg_out = output[0] if isinstance(output, list) else output
+ logger.info(f"Done: {msg_out}")
+ return output
+
+ hash_base_video_file = get_hash(base_video_file)
+
+ if burn_subtitles_to_video:
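+            # Burn again only if the video hash or the subtitle text changed
+            # since the last run.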
+ hashvideo_text = [
+ hash_base_video_file,
+ [seg["text"] for seg in self.result_diarize["segments"]]
+ ]
+ if self.burn_subs_id != hashvideo_text:
+ try:
+ logger.info("Burn subtitles")
+ remove_files(vid_subs)
+ command = f"ffmpeg -i {base_video_file} -y -vf subtitles=sub_tra.srt -max_muxing_queue_size 9999 {vid_subs}"
+ run_command(command)
+ base_video_file = vid_subs
+ self.burn_subs_id = hashvideo_text
+ except Exception as error:
+ logger.error(str(error))
+ else:
+ base_video_file = vid_subs
+
+ if not self.task_in_cache("output", [
+ hash_base_video_file,
+ hash_base_audio_wav,
+ burn_subtitles_to_video
+ ], {}):
+ # Merge new audio + video
+ remove_files(video_output_file)
+ run_command(
+ f"ffmpeg -i {base_video_file} -i {mix_audio_file} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output_file}"
+ )
+
+ output = media_out(
+ media_file,
+ TRANSLATE_AUDIO_TO,
+ video_output_name,
+ "mkv" if "mkv" in output_type else "mp4",
+ file_obj=video_output_file,
+ soft_subtitles=soft_subtitles_to_video,
+ subtitle_files=output_format_subtitle,
+ )
+ msg_out = output[0] if isinstance(output, list) else output
+ logger.info(f"Done: {msg_out}")
+
+ return output
+
+ def hook_beta_processor(
+ self,
+ document,
+ tgt_lang,
+ translate_process,
+ ori_lang,
+ tts,
+ name_final_file,
+ custom_voices,
+ custom_voices_workers,
+ output_type,
+ chunk_size,
+ width,
+ height,
+ start_page,
+ end_page,
+ bcolor,
+ is_gui,
+ progress
+ ):
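+        """Build a narrated 'videobook' from a PDF: render pages to images,
+        translate the extracted text, synthesize speech, and merge the
+        page video with the audio track."""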
+ prog_disp("Processing pages...", 0.10, is_gui, progress=progress)
+ doc_data = doc_to_txtximg_pages(document, width, height, start_page, end_page, bcolor)
+ result_diarize = page_data_to_segments(doc_data, 1700)
+
+ prog_disp("Translating...", 0.20, is_gui, progress=progress)
+ result_diarize["segments"] = translate_text(
+ result_diarize["segments"],
+ tgt_lang,
+ translate_process,
+ chunk_size=0,
+ source=ori_lang,
+ )
+ chunk_size = (
+ chunk_size if chunk_size else determine_chunk_size(tts)
+ )
+ doc_data = update_page_data(result_diarize, doc_data)
+
+ prog_disp("Text to speech...", 0.30, is_gui, progress=progress)
+ result_diarize = page_data_to_segments(doc_data, chunk_size)
+ valid_speakers = audio_segmentation_to_voice(
+ result_diarize,
+ tgt_lang,
+ is_gui,
+ tts,
+ )
+
+ # fix format and set folder output
+ audio_files, speakers_list = accelerate_segments(
+ result_diarize,
+ 1.0,
+ valid_speakers,
+ )
+
+ # custom voice
+ if custom_voices:
+ prog_disp(
+ "Applying customized voices...",
+ 0.60,
+ is_gui,
+ progress=progress,
+ )
+ self.vci(
+ audio_files,
+ speakers_list,
+ overwrite=True,
+ parallel_workers=custom_voices_workers,
+ )
+ self.vci.unload_models()
+
+        # Update the time segments without concatenating the audio
+ result_diarize = fix_timestamps_docs(result_diarize, audio_files)
+ final_wav_file = "audio_book.wav"
+ remove_files(final_wav_file)
+
+ prog_disp("Creating audio file...", 0.70, is_gui, progress=progress)
+ create_translated_audio(
+ result_diarize, audio_files, final_wav_file, False
+ )
+
+ prog_disp("Creating video file...", 0.80, is_gui, progress=progress)
+ video_doc = create_video_from_images(
+ doc_data,
+ result_diarize
+ )
+
+ # Merge video and audio
+ prog_disp("Merging...", 0.90, is_gui, progress=progress)
+ vid_out = merge_video_and_audio(video_doc, final_wav_file)
+
+ # End
+ output = media_out(
+ document,
+ tgt_lang,
+ name_final_file,
+ "mkv" if "mkv" in output_type else "mp4",
+ file_obj=vid_out,
+ )
+ logger.info(f"Done: {output}")
+ return output
+
+ def multilingual_docs_conversion(
+ self,
+ string_text="", # string
+ document=None, # doc path gui
+ directory_input="", # doc path
+ origin_language="English (en)",
+ target_language="English (en)",
+ tts_voice00="en-US-EmmaMultilingualNeural-Female",
+ name_final_file="",
+ translate_process="google_translator",
+ output_type="audio",
+ chunk_size=None,
+ custom_voices=False,
+ custom_voices_workers=1,
+ start_page=1,
+ end_page=99999,
+ width=1280,
+ height=720,
+ bcolor="dynamic",
+ is_gui=False,
+ progress=gr.Progress(),
+ ):
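+        """Translate a document or plain text and synthesize it as an
+        audiobook; 'videobook' output types are delegated to
+        hook_beta_processor."""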
+ if "gpt" in translate_process:
+ check_openai_api_key()
+
+ SOURCE_LANGUAGE = LANGUAGES[origin_language]
+ if translate_process != "disable_translation":
+ TRANSLATE_AUDIO_TO = LANGUAGES[target_language]
+ else:
+ TRANSLATE_AUDIO_TO = SOURCE_LANGUAGE
+ logger.info("No translation")
+ if tts_voice00[:2].lower() != TRANSLATE_AUDIO_TO[:2].lower():
+ logger.debug(
+ "Make sure to select a 'TTS Speaker' suitable for the "
+ "translation language to avoid errors with the TTS."
+ )
+
+ self.clear_cache(string_text, force=True)
+
+ is_string = False
+ if document is None:
+ if os.path.exists(directory_input):
+ document = directory_input
+ else:
+ document = string_text
+ is_string = True
+ document = document if isinstance(document, str) else document.name
+ if not document:
+ raise Exception("No data found")
+
+ if "videobook" in output_type:
+ if not document.lower().endswith(".pdf"):
+ raise ValueError(
+ "Videobooks are only compatible with PDF files."
+ )
+
+ return self.hook_beta_processor(
+ document,
+ TRANSLATE_AUDIO_TO,
+ translate_process,
+ SOURCE_LANGUAGE,
+ tts_voice00,
+ name_final_file,
+ custom_voices,
+ custom_voices_workers,
+ output_type,
+ chunk_size,
+ width,
+ height,
+ start_page,
+ end_page,
+ bcolor,
+ is_gui,
+ progress
+ )
+
+ # audio_wav = "audio.wav"
+ final_wav_file = "audio_book.wav"
+
+ prog_disp("Processing text...", 0.15, is_gui, progress=progress)
+ result_file_path, result_text = document_preprocessor(
+ document, is_string, start_page, end_page
+ )
+
+ if (
+ output_type == "book (txt)"
+ and translate_process == "disable_translation"
+ ):
+ return result_file_path
+
+ if "SET_LIMIT" == os.getenv("DEMO"):
+ result_text = result_text[:50]
+ logger.info(
+ "DEMO; Generation is limited to 50 characters to prevent "
+ "CPU errors. No limitations with GPU.\n"
+ )
+
+ if translate_process != "disable_translation":
+ # chunks text for translation
+ result_diarize = plain_text_to_segments(result_text, 1700)
+ prog_disp("Translating...", 0.30, is_gui, progress=progress)
+            # segments are already ~1700 chars; translate_text gets chunk_size=0
+ result_diarize["segments"] = translate_text(
+ result_diarize["segments"],
+ TRANSLATE_AUDIO_TO,
+ translate_process,
+ chunk_size=0,
+ source=SOURCE_LANGUAGE,
+ )
+
+ txt_file_path, result_text = segments_to_plain_text(result_diarize)
+
+ if output_type == "book (txt)":
+ return media_out(
+ result_file_path if is_string else document,
+ TRANSLATE_AUDIO_TO,
+ name_final_file,
+ "txt",
+ file_obj=txt_file_path,
+ )
+
+ # (TTS limits) plain text to result_diarize
+ chunk_size = (
+ chunk_size if chunk_size else determine_chunk_size(tts_voice00)
+ )
+ result_diarize = plain_text_to_segments(result_text, chunk_size)
+ logger.debug(result_diarize)
+
+ prog_disp("Text to speech...", 0.45, is_gui, progress=progress)
+ valid_speakers = audio_segmentation_to_voice(
+ result_diarize,
+ TRANSLATE_AUDIO_TO,
+ is_gui,
+ tts_voice00,
+ )
+
+ # fix format and set folder output
+ audio_files, speakers_list = accelerate_segments(
+ result_diarize,
+ 1.0,
+ valid_speakers,
+ )
+
+ # custom voice
+ if custom_voices:
+ prog_disp(
+ "Applying customized voices...",
+ 0.80,
+ is_gui,
+ progress=progress,
+ )
+ self.vci(
+ audio_files,
+ speakers_list,
+ overwrite=True,
+ parallel_workers=custom_voices_workers,
+ )
+ self.vci.unload_models()
+
+ prog_disp(
+ "Creating final audio file...", 0.90, is_gui, progress=progress
+ )
+ remove_files(final_wav_file)
+ create_translated_audio(
+ result_diarize, audio_files, final_wav_file, True
+ )
+
+ output = media_out(
+ result_file_path if is_string else document,
+ TRANSLATE_AUDIO_TO,
+ name_final_file,
+ "mp3" if "mp3" in output_type else (
+ "ogg" if "ogg" in output_type else "wav"
+ ),
+ file_obj=final_wav_file,
+ )
+
+ logger.info(f"Done: {output}")
+
+ return output
+
+
+title = "📽️ SoniTranslate 🈷️"
+
+
+def create_gui(theme, logs_in_gui=False):
+ with gr.Blocks(theme=theme) as app:
+ gr.Markdown(title)
+ gr.Markdown(lg_conf["description"])
+
+ with gr.Tab(lg_conf["tab_translate"]):
+ with gr.Row():
+ with gr.Column():
+ input_data_type = gr.Dropdown(
+ ["SUBMIT VIDEO", "URL", "Find Video Path"],
+ value="SUBMIT VIDEO",
+ label=lg_conf["video_source"],
+ )
+
+ def swap_visibility(data_type):
+ if data_type == "URL":
+ return (
+ gr.update(visible=False, value=None),
+ gr.update(visible=True, value=""),
+ gr.update(visible=False, value=""),
+ )
+ elif data_type == "SUBMIT VIDEO":
+ return (
+ gr.update(visible=True, value=None),
+ gr.update(visible=False, value=""),
+ gr.update(visible=False, value=""),
+ )
+ elif data_type == "Find Video Path":
+ return (
+ gr.update(visible=False, value=None),
+ gr.update(visible=False, value=""),
+ gr.update(visible=True, value=""),
+ )
+
+ video_input = gr.File(
+ label="VIDEO",
+ file_count="multiple",
+ type="filepath",
+ )
+ blink_input = gr.Textbox(
+ visible=False,
+ label=lg_conf["link_label"],
+ info=lg_conf["link_info"],
+ placeholder=lg_conf["link_ph"],
+ )
+ directory_input = gr.Textbox(
+ visible=False,
+ label=lg_conf["dir_label"],
+ info=lg_conf["dir_info"],
+ placeholder=lg_conf["dir_ph"],
+ )
+ input_data_type.change(
+ fn=swap_visibility,
+ inputs=input_data_type,
+ outputs=[video_input, blink_input, directory_input],
+ )
+
+ gr.HTML()
+
+ SOURCE_LANGUAGE = gr.Dropdown(
+ LANGUAGES_LIST,
+ value=LANGUAGES_LIST[0],
+ label=lg_conf["sl_label"],
+ info=lg_conf["sl_info"],
+ )
+ TRANSLATE_AUDIO_TO = gr.Dropdown(
+ LANGUAGES_LIST[1:],
+ value="English (en)",
+ label=lg_conf["tat_label"],
+ info=lg_conf["tat_info"],
+ )
+
+                    gr.HTML("<hr></hr>")
+
+ gr.Markdown(lg_conf["num_speakers"])
+ MAX_TTS = 12
+ min_speakers = gr.Slider(
+ 1,
+ MAX_TTS,
+ value=1,
+ label=lg_conf["min_sk"],
+ step=1,
+ visible=False,
+ )
+ max_speakers = gr.Slider(
+ 1,
+ MAX_TTS,
+ value=2,
+ step=1,
+ label=lg_conf["max_sk"],
+ )
+ gr.Markdown(lg_conf["tts_select"])
+
+ def submit(value):
+ visibility_dict = {
+ f"tts_voice{i:02d}": gr.update(visible=i < value)
+ for i in range(MAX_TTS)
+ }
+ return [value for value in visibility_dict.values()]
+
+ tts_voice00 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="en-US-EmmaMultilingualNeural-Female",
+ label=lg_conf["sk1"],
+ visible=True,
+ interactive=True,
+ )
+ tts_voice01 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="en-US-AndrewMultilingualNeural-Male",
+ label=lg_conf["sk2"],
+ visible=True,
+ interactive=True,
+ )
+ tts_voice02 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="en-US-AvaMultilingualNeural-Female",
+ label=lg_conf["sk3"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice03 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="en-US-BrianMultilingualNeural-Male",
+ label=lg_conf["sk4"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice04 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="de-DE-SeraphinaMultilingualNeural-Female",
+                        label=lg_conf["sk5"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice05 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="de-DE-FlorianMultilingualNeural-Male",
+ label=lg_conf["sk6"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice06 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="fr-FR-VivienneMultilingualNeural-Female",
+ label=lg_conf["sk7"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice07 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="fr-FR-RemyMultilingualNeural-Male",
+ label=lg_conf["sk8"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice08 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="en-US-EmmaMultilingualNeural-Female",
+ label=lg_conf["sk9"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice09 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="en-US-AndrewMultilingualNeural-Male",
+ label=lg_conf["sk10"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice10 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="en-US-EmmaMultilingualNeural-Female",
+ label=lg_conf["sk11"],
+ visible=False,
+ interactive=True,
+ )
+ tts_voice11 = gr.Dropdown(
+ SoniTr.tts_info.tts_list(),
+ value="en-US-AndrewMultilingualNeural-Male",
+ label=lg_conf["sk12"],
+ visible=False,
+ interactive=True,
+ )
+ max_speakers.change(
+ submit,
+ max_speakers,
+ [
+ tts_voice00,
+ tts_voice01,
+ tts_voice02,
+ tts_voice03,
+ tts_voice04,
+ tts_voice05,
+ tts_voice06,
+ tts_voice07,
+ tts_voice08,
+ tts_voice09,
+ tts_voice10,
+ tts_voice11,
+ ],
+ )
+
+ with gr.Column():
+ with gr.Accordion(
+ lg_conf["vc_title"],
+ open=False,
+ ):
+ gr.Markdown(lg_conf["vc_subtitle"])
+ voice_imitation_gui = gr.Checkbox(
+ False,
+ label=lg_conf["vc_active_label"],
+ info=lg_conf["vc_active_info"],
+ )
+ openvoice_models = ["openvoice", "openvoice_v2"]
+ voice_imitation_method_options = (
+ ["freevc"] + openvoice_models
+ if SoniTr.tts_info.xtts_enabled
+ else openvoice_models
+ )
+ voice_imitation_method_gui = gr.Dropdown(
+ voice_imitation_method_options,
+ value=voice_imitation_method_options[0],
+ label=lg_conf["vc_method_label"],
+ info=lg_conf["vc_method_info"],
+ )
+ voice_imitation_max_segments_gui = gr.Slider(
+ label=lg_conf["vc_segments_label"],
+ info=lg_conf["vc_segments_info"],
+ value=3,
+ step=1,
+ minimum=1,
+ maximum=10,
+ visible=True,
+ interactive=True,
+ )
+ voice_imitation_vocals_dereverb_gui = gr.Checkbox(
+ False,
+ label=lg_conf["vc_dereverb_label"],
+ info=lg_conf["vc_dereverb_info"],
+ )
+ voice_imitation_remove_previous_gui = gr.Checkbox(
+ True,
+ label=lg_conf["vc_remove_label"],
+ info=lg_conf["vc_remove_info"],
+ )
+
+ if SoniTr.tts_info.xtts_enabled:
+ with gr.Column():
+ with gr.Accordion(
+ lg_conf["xtts_title"],
+ open=False,
+ ):
+ gr.Markdown(lg_conf["xtts_subtitle"])
+ wav_speaker_file = gr.File(
+ label=lg_conf["xtts_file_label"]
+ )
+ wav_speaker_name = gr.Textbox(
+ label=lg_conf["xtts_name_label"],
+ value="",
+ info=lg_conf["xtts_name_info"],
+ placeholder="default_name",
+ lines=1,
+ )
+ wav_speaker_start = gr.Number(
+ label="Time audio start",
+ value=0,
+ visible=False,
+ )
+ wav_speaker_end = gr.Number(
+ label="Time audio end",
+ value=0,
+ visible=False,
+ )
+ wav_speaker_dir = gr.Textbox(
+ label="Directory save",
+ value="_XTTS_",
+ visible=False,
+ )
+ wav_speaker_dereverb = gr.Checkbox(
+ True,
+ label=lg_conf["xtts_dereverb_label"],
+ info=lg_conf["xtts_dereverb_info"]
+ )
+ wav_speaker_output = gr.HTML()
+ create_xtts_wav = gr.Button(
+ lg_conf["xtts_button"]
+ )
+ gr.Markdown(lg_conf["xtts_footer"])
+ else:
+ wav_speaker_dereverb = gr.Checkbox(
+ False,
+ label=lg_conf["xtts_dereverb_label"],
+ info=lg_conf["xtts_dereverb_info"],
+ visible=False
+ )
+
+ with gr.Column():
+ with gr.Accordion(
+ lg_conf["extra_setting"], open=False
+ ):
+ audio_accelerate = gr.Slider(
+ label=lg_conf["acc_max_label"],
+ value=1.9,
+ step=0.1,
+ minimum=1.0,
+ maximum=2.5,
+ visible=True,
+ interactive=True,
+ info=lg_conf["acc_max_info"],
+ )
+ acceleration_rate_regulation_gui = gr.Checkbox(
+ False,
+ label=lg_conf["acc_rate_label"],
+ info=lg_conf["acc_rate_info"],
+ )
+ avoid_overlap_gui = gr.Checkbox(
+ False,
+ label=lg_conf["or_label"],
+ info=lg_conf["or_info"],
+ )
+
+                        gr.HTML("<hr></hr>")
+
+ audio_mix_options = [
+ "Mixing audio with sidechain compression",
+ "Adjusting volumes and mixing audio",
+ ]
+ AUDIO_MIX = gr.Dropdown(
+ audio_mix_options,
+ value=audio_mix_options[1],
+ label=lg_conf["aud_mix_label"],
+ info=lg_conf["aud_mix_info"],
+ )
+ volume_original_mix = gr.Slider(
+ label=lg_conf["vol_ori"],
+ info="for Adjusting volumes and mixing audio",
+ value=0.25,
+ step=0.05,
+ minimum=0.0,
+ maximum=2.50,
+ visible=True,
+ interactive=True,
+ )
+ volume_translated_mix = gr.Slider(
+ label=lg_conf["vol_tra"],
+ info="for Adjusting volumes and mixing audio",
+ value=1.80,
+ step=0.05,
+ minimum=0.0,
+ maximum=2.50,
+ visible=True,
+ interactive=True,
+ )
+ main_voiceless_track = gr.Checkbox(
+ label=lg_conf["voiceless_tk_label"],
+ info=lg_conf["voiceless_tk_info"],
+ )
+
+                        gr.HTML("<hr></hr>")
+ sub_type_options = [
+ "disable",
+ "srt",
+ "vtt",
+ "ass",
+ "txt",
+ "tsv",
+ "json",
+ "aud",
+ ]
+
+ sub_type_output = gr.Dropdown(
+ sub_type_options,
+ value=sub_type_options[1],
+ label=lg_conf["sub_type"],
+ )
+ soft_subtitles_to_video_gui = gr.Checkbox(
+ label=lg_conf["soft_subs_label"],
+ info=lg_conf["soft_subs_info"],
+ )
+ burn_subtitles_to_video_gui = gr.Checkbox(
+ label=lg_conf["burn_subs_label"],
+ info=lg_conf["burn_subs_info"],
+ )
+
+                        gr.HTML("<hr></hr>")
+ gr.Markdown(lg_conf["whisper_title"])
+ literalize_numbers_gui = gr.Checkbox(
+ True,
+ label=lg_conf["lnum_label"],
+ info=lg_conf["lnum_info"],
+ )
+ vocal_refinement_gui = gr.Checkbox(
+ False,
+ label=lg_conf["scle_label"],
+ info=lg_conf["scle_info"],
+ )
+ segment_duration_limit_gui = gr.Slider(
+ label=lg_conf["sd_limit_label"],
+ info=lg_conf["sd_limit_info"],
+ value=15,
+ step=1,
+ minimum=1,
+ maximum=30,
+ )
+ whisper_model_default = (
+ "large-v3"
+ if SoniTr.device == "cuda"
+ else "medium"
+ )
+
+ WHISPER_MODEL_SIZE = gr.Dropdown(
+ ASR_MODEL_OPTIONS + find_whisper_models(),
+ value=whisper_model_default,
+ label="Whisper ASR model",
+ info=lg_conf["asr_model_info"],
+ allow_custom_value=True,
+ )
+ com_t_opt, com_t_default = (
+ [COMPUTE_TYPE_GPU, "float16"]
+ if SoniTr.device == "cuda"
+ else [COMPUTE_TYPE_CPU, "float32"]
+ )
+ compute_type = gr.Dropdown(
+ com_t_opt,
+ value=com_t_default,
+ label=lg_conf["ctype_label"],
+ info=lg_conf["ctype_info"],
+ )
+ batch_size = gr.Slider(
+ minimum=1,
+ maximum=32,
+ value=8,
+ label=lg_conf["batchz_label"],
+ info=lg_conf["batchz_info"],
+ step=1,
+ )
+ input_srt = gr.File(
+ label=lg_conf["srt_file_label"],
+ file_types=[".srt", ".ass", ".vtt"],
+ height=130,
+ )
+
+ gr.HTML("
")
+ text_segmentation_options = [
+ "sentence",
+ "word",
+ "character"
+ ]
+ text_segmentation_scale_gui = gr.Dropdown(
+ text_segmentation_options,
+ value=text_segmentation_options[0],
+ label=lg_conf["tsscale_label"],
+ info=lg_conf["tsscale_info"],
+ )
+ divide_text_segments_by_gui = gr.Textbox(
+ label=lg_conf["divide_text_label"],
+ value="",
+ info=lg_conf["divide_text_info"],
+ )
+
+ gr.HTML("
")
+ pyannote_models_list = list(
+ diarization_models.keys()
+ )
+ diarization_process_dropdown = gr.Dropdown(
+ pyannote_models_list,
+ value=pyannote_models_list[1],
+ label=lg_conf["diarization_label"],
+ )
+ translate_process_dropdown = gr.Dropdown(
+ TRANSLATION_PROCESS_OPTIONS,
+ value=TRANSLATION_PROCESS_OPTIONS[0],
+ label=lg_conf["tr_process_label"],
+ )
+
+ gr.HTML("
")
+ main_output_type = gr.Dropdown(
+ OUTPUT_TYPE_OPTIONS,
+ value=OUTPUT_TYPE_OPTIONS[0],
+ label=lg_conf["out_type_label"],
+ )
+ VIDEO_OUTPUT_NAME = gr.Textbox(
+ label=lg_conf["out_name_label"],
+ value="",
+ info=lg_conf["out_name_info"],
+ )
+ play_sound_gui = gr.Checkbox(
+ True,
+ label=lg_conf["task_sound_label"],
+ info=lg_conf["task_sound_info"],
+ )
+ enable_cache_gui = gr.Checkbox(
+ True,
+ label=lg_conf["cache_label"],
+ info=lg_conf["cache_info"],
+ )
+ PREVIEW = gr.Checkbox(
+ label="Preview", info=lg_conf["preview_info"]
+ )
+ is_gui_dummy_check = gr.Checkbox(
+ True, visible=False
+ )
+
+ with gr.Column(variant="compact"):
+ edit_sub_check = gr.Checkbox(
+ label=lg_conf["edit_sub_label"],
+ info=lg_conf["edit_sub_info"],
+ )
+ dummy_false_check = gr.Checkbox(
+ False,
+ visible=False,
+ )
+
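+ # Show or hide the subtitle editor button and textbox as a pair.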
+ def visible_component_subs(input_bool):
+ if input_bool:
+ return gr.update(visible=True), gr.update(
+ visible=True
+ )
+ else:
+ return gr.update(visible=False), gr.update(
+ visible=False
+ )
+
+ subs_button = gr.Button(
+ lg_conf["button_subs"],
+ variant="primary",
+ visible=False,
+ )
+ subs_edit_space = gr.Textbox(
+ visible=False,
+ lines=10,
+ label=lg_conf["editor_sub_label"],
+ info=lg_conf["editor_sub_info"],
+ placeholder=lg_conf["editor_sub_ph"],
+ )
+ edit_sub_check.change(
+ visible_component_subs,
+ [edit_sub_check],
+ [subs_button, subs_edit_space],
+ )
+
+ with gr.Row():
+ video_button = gr.Button(
+ lg_conf["button_translate"],
+ variant="primary",
+ )
+ with gr.Row():
+ video_output = gr.File(
+ label=lg_conf["output_result_label"],
+ file_count="multiple",
+ interactive=False,
+ )  # gr.Video()
+
+ gr.HTML("
")
+
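+ # Ask for the Hugging Face token in the UI only when it is not
+ # already provided via the YOUR_HF_TOKEN environment variable.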
+ if (
+ os.getenv("YOUR_HF_TOKEN") is None
+ or os.getenv("YOUR_HF_TOKEN") == ""
+ ):
+ HFKEY = gr.Textbox(
+ visible=True,
+ label="HF Token",
+ info=lg_conf["ht_token_info"],
+ placeholder=lg_conf["ht_token_ph"],
+ )
+ else:
+ HFKEY = gr.Textbox(
+ visible=False,
+ label="HF Token",
+ info=lg_conf["ht_token_info"],
+ placeholder=lg_conf["ht_token_ph"],
+ )
+
+ gr.Examples(
+ examples=[
+ [
+ ["./assets/Video_main.mp4"],
+ "",
+ "",
+ "",
+ False,
+ whisper_model_default,
+ 4,
+ com_t_default,
+ "Spanish (es)",
+ "English (en)",
+ 1,
+ 2,
+ "en-CA-ClaraNeural-Female",
+ "en-AU-WilliamNeural-Male",
+ ],
+ ], # no update
+ fn=SoniTr.batch_multilingual_media_conversion,
+ inputs=[
+ video_input,
+ blink_input,
+ directory_input,
+ HFKEY,
+ PREVIEW,
+ WHISPER_MODEL_SIZE,
+ batch_size,
+ compute_type,
+ SOURCE_LANGUAGE,
+ TRANSLATE_AUDIO_TO,
+ min_speakers,
+ max_speakers,
+ tts_voice00,
+ tts_voice01,
+ ],
+ outputs=[video_output],
+ cache_examples=False,
+ )
+
+ with gr.Tab(lg_conf["tab_docs"]):
+ with gr.Column():
+ with gr.Accordion("Docs", open=True):
+ with gr.Column(variant="compact"):
+ with gr.Column():
+ input_doc_type = gr.Dropdown(
+ [
+ "WRITE TEXT",
+ "SUBMIT DOCUMENT",
+ "Find Document Path",
+ ],
+ value="SUBMIT DOCUMENT",
+ label=lg_conf["docs_input_label"],
+ info=lg_conf["docs_input_info"],
+ )
+
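+ # Reveal only the input widget that matches the selected source type.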
+ def swap_visibility(data_type):
+ if data_type == "WRITE TEXT":
+ return (
+ gr.update(visible=True, value=""),
+ gr.update(visible=False, value=None),
+ gr.update(visible=False, value=""),
+ )
+ elif data_type == "SUBMIT DOCUMENT":
+ return (
+ gr.update(visible=False, value=""),
+ gr.update(visible=True, value=None),
+ gr.update(visible=False, value=""),
+ )
+ elif data_type == "Find Document Path":
+ return (
+ gr.update(visible=False, value=""),
+ gr.update(visible=False, value=None),
+ gr.update(visible=True, value=""),
+ )
+
+ text_docs = gr.Textbox(
+ label="Text",
+ value="This is an example",
+ info="Write a text",
+ placeholder="...",
+ lines=5,
+ visible=False,
+ )
+ input_docs = gr.File(
+ label="Document", visible=True
+ )
+ directory_input_docs = gr.Textbox(
+ visible=False,
+ label="Document Path",
+ info="Example: /home/my_doc.pdf",
+ placeholder="Path goes here...",
+ )
+ input_doc_type.change(
+ fn=swap_visibility,
+ inputs=input_doc_type,
+ outputs=[
+ text_docs,
+ input_docs,
+ directory_input_docs,
+ ],
+ )
+
+ gr.HTML()
+
+ tts_documents = gr.Dropdown(
+ list(
+ filter(
+ lambda x: x != "_XTTS_/AUTOMATIC.wav",
+ SoniTr.tts_info.tts_list(),
+ )
+ ),
+ value="en-US-EmmaMultilingualNeural-Female",
+ label="TTS",
+ visible=True,
+ interactive=True,
+ )
+
+ gr.HTML()
+
+ docs_SOURCE_LANGUAGE = gr.Dropdown(
+ LANGUAGES_LIST[1:],
+ value="English (en)",
+ label=lg_conf["sl_label"],
+ info=lg_conf["docs_source_info"],
+ )
+ docs_TRANSLATE_TO = gr.Dropdown(
+ LANGUAGES_LIST[1:],
+ value="English (en)",
+ label=lg_conf["tat_label"],
+ info=lg_conf["tat_info"],
+ )
+
+ with gr.Column():
+ with gr.Accordion(
+ lg_conf["extra_setting"], open=False
+ ):
+ docs_translate_process_dropdown = gr.Dropdown(
+ DOCS_TRANSLATION_PROCESS_OPTIONS,
+ value=DOCS_TRANSLATION_PROCESS_OPTIONS[
+ 0
+ ],
+ label="Translation process",
+ )
+
+ gr.HTML("
")
+
+ docs_output_type = gr.Dropdown(
+ DOCS_OUTPUT_TYPE_OPTIONS,
+ value=DOCS_OUTPUT_TYPE_OPTIONS[2],
+ label="Output type",
+ )
+ docs_OUTPUT_NAME = gr.Textbox(
+ label="Final file name",
+ value="",
+ info=lg_conf["out_name_info"],
+ )
+ docs_chunk_size = gr.Number(
+ label=lg_conf["chunk_size_label"],
+ value=0,
+ visible=True,
+ interactive=True,
+ info=lg_conf["chunk_size_info"],
+ )
+ gr.HTML("
")
+ start_page_gui = gr.Number(
+ step=1,
+ value=1,
+ minimum=1,
+ maximum=99999,
+ label="Start page",
+ )
+ end_page_gui = gr.Number(
+ step=1,
+ value=99999,
+ minimum=1,
+ maximum=99999,
+ label="End page",
+ )
+ gr.HTML("
Videobook config")
+ videobook_width_gui = gr.Number(
+ step=1,
+ value=1280,
+ minimum=100,
+ maximum=4096,
+ label="Width",
+ )
+ videobook_height_gui = gr.Number(
+ step=1,
+ value=720,
+ minimum=100,
+ maximum=4096,
+ label="Height",
+ )
+ videobook_bcolor_gui = gr.Dropdown(
+ BORDER_COLORS,
+ value=BORDER_COLORS[0],
+ label="Border color",
+ )
+ docs_dummy_check = gr.Checkbox(
+ True, visible=False
+ )
+
+ with gr.Row():
+ docs_button = gr.Button(
+ lg_conf["docs_button"],
+ variant="primary",
+ )
+ with gr.Row():
+ docs_output = gr.File(
+ label="Result",
+ interactive=False,
+ )
+
+ with gr.Tab("Custom voice R.V.C. (Optional)"):
+
+ with gr.Column():
+ with gr.Accordion("Get the R.V.C. Models", open=True):
+ url_links = gr.Textbox(
+ label="URLs",
+ value="",
+ info=lg_conf["cv_url_info"],
+ placeholder="urls here...",
+ lines=1,
+ )
+ download_finish = gr.HTML()
+ download_button = gr.Button("DOWNLOAD MODELS")
+
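+ # Refresh every R.V.C. model and index dropdown after new files
+ # are downloaded; the extra slot (MAX_TTS+1) covers the dropdowns
+ # in the "Test R.V.C." section.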
+ def update_models():
+ models_path, index_path = upload_model_list()
+
+ dict_models = {
+ f"fmodel{i:02d}": gr.update(
+ choices=models_path
+ )
+ for i in range(MAX_TTS+1)
+ }
+ dict_index = {
+ f"findex{i:02d}": gr.update(
+ choices=index_path, value=None
+ )
+ for i in range(MAX_TTS+1)
+ }
+ dict_changes = {**dict_models, **dict_index}
+ return list(dict_changes.values())
+
+ with gr.Column():
+ with gr.Accordion(lg_conf["replace_title"], open=False):
+ with gr.Column(variant="compact"):
+ with gr.Column():
+ gr.Markdown(lg_conf["sec1_title"])
+ enable_custom_voice = gr.Checkbox(
+ False,
+ label="ENABLE",
+ info=lg_conf["enable_replace"]
+ )
+ workers_custom_voice = gr.Number(
+ step=1,
+ value=1,
+ minimum=1,
+ maximum=50,
+ label="workers",
+ visible=False,
+ )
+
+ gr.Markdown(lg_conf["sec2_title"])
+ gr.Markdown(lg_conf["sec2_subtitle"])
+
+ PITCH_ALGO_OPT = [
+ "pm",
+ "harvest",
+ "crepe",
+ "rmvpe",
+ "rmvpe+",
+ ]
+
+ def model_conf():
+ return gr.Dropdown(
+ models_path,
+ # value="",
+ label="Model",
+ visible=True,
+ interactive=True,
+ )
+
+ def pitch_algo_conf():
+ return gr.Dropdown(
+ PITCH_ALGO_OPT,
+ value=PITCH_ALGO_OPT[3],
+ label="Pitch algorithm",
+ visible=True,
+ interactive=True,
+ )
+
+ def pitch_lvl_conf():
+ return gr.Slider(
+ label="Pitch level",
+ minimum=-24,
+ maximum=24,
+ step=1,
+ value=0,
+ visible=True,
+ interactive=True,
+ )
+
+ def index_conf():
+ return gr.Dropdown(
+ index_path,
+ value=None,
+ label="Index",
+ visible=True,
+ interactive=True,
+ )
+
+ def index_inf_conf():
+ return gr.Slider(
+ minimum=0,
+ maximum=1,
+ label="Index influence",
+ value=0.75,
+ )
+
+ def respiration_filter_conf():
+ return gr.Slider(
+ minimum=0,
+ maximum=7,
+ label="Respiration median filtering",
+ value=3,
+ step=1,
+ interactive=True,
+ )
+
+ def envelope_ratio_conf():
+ return gr.Slider(
+ minimum=0,
+ maximum=1,
+ label="Envelope ratio",
+ value=0.25,
+ interactive=True,
+ )
+
+ def consonant_protec_conf():
+ return gr.Slider(
+ minimum=0,
+ maximum=0.5,
+ label="Consonant breath protection",
+ value=0.5,
+ interactive=True,
+ )
+
+ def button_conf(tts_name):
+ return gr.Button(
+ lg_conf["cv_button_apply"]+" "+tts_name,
+ variant="primary",
+ )
+
+ TTS_TABS = [
+ 'TTS Speaker {:02d}'.format(i) for i in range(1, MAX_TTS+1)
+ ]
+
+ CV_SUBTITLES = [
+ lg_conf["cv_tts1"],
+ lg_conf["cv_tts2"],
+ lg_conf["cv_tts3"],
+ lg_conf["cv_tts4"],
+ lg_conf["cv_tts5"],
+ lg_conf["cv_tts6"],
+ lg_conf["cv_tts7"],
+ lg_conf["cv_tts8"],
+ lg_conf["cv_tts9"],
+ lg_conf["cv_tts10"],
+ lg_conf["cv_tts11"],
+ lg_conf["cv_tts12"],
+ ]
+
+ configs_storage = []
+
+ for i in range(MAX_TTS):  # one R.V.C. config accordion per TTS speaker
+ with gr.Accordion(CV_SUBTITLES[i], open=False):
+ gr.Markdown(TTS_TABS[i])
+ with gr.Column():
+ tag_gui = gr.Textbox(
+ value=TTS_TABS[i], visible=False
+ )
+ model_gui = model_conf()
+ pitch_algo_gui = pitch_algo_conf()
+ pitch_lvl_gui = pitch_lvl_conf()
+ index_gui = index_conf()
+ index_inf_gui = index_inf_conf()
+ rmf_gui = respiration_filter_conf()
+ er_gui = envelope_ratio_conf()
+ cbp_gui = consonant_protec_conf()
+
+ with gr.Row(variant="compact"):
+ button_config = button_conf(
+ TTS_TABS[i]
+ )
+
+ confirm_conf = gr.HTML()
+
+ button_config.click(
+ SoniTr.vci.apply_conf,
+ inputs=[
+ tag_gui,
+ model_gui,
+ pitch_algo_gui,
+ pitch_lvl_gui,
+ index_gui,
+ index_inf_gui,
+ rmf_gui,
+ er_gui,
+ cbp_gui,
+ ],
+ outputs=[confirm_conf],
+ )
+
+ configs_storage.append({
+ "tag": tag_gui,
+ "model": model_gui,
+ "index": index_gui,
+ })
+
+ with gr.Column():
+ with gr.Accordion("Test R.V.C.", open=False):
+ with gr.Row(variant="compact"):
+ text_test = gr.Textbox(
+ label="Text",
+ value="This is an example",
+ info="write a text",
+ placeholder="...",
+ lines=5,
+ )
+ with gr.Column():
+ tts_test = gr.Dropdown(
+ sorted(SoniTr.tts_info.list_edge),
+ value="en-GB-ThomasNeural-Male",
+ label="TTS",
+ visible=True,
+ interactive=True,
+ )
+ model_test = model_conf()
+ index_test = index_conf()
+ pitch_test = pitch_lvl_conf()
+ pitch_alg_test = pitch_algo_conf()
+ with gr.Row(variant="compact"):
+ button_test = gr.Button("Test audio")
+
+ with gr.Column():
+ with gr.Row():
+ original_ttsvoice = gr.Audio()
+ ttsvoice = gr.Audio()
+
+ button_test.click(
+ SoniTr.vci.make_test,
+ inputs=[
+ text_test,
+ tts_test,
+ model_test,
+ index_test,
+ pitch_test,
+ pitch_alg_test,
+ ],
+ outputs=[ttsvoice, original_ttsvoice],
+ )
+
+ download_button.click(
+ download_list,
+ [url_links],
+ [download_finish],
+ queue=False
+ ).then(
+ update_models,
+ [],
+ [
+ elem["model"] for elem in configs_storage
+ ] + [model_test] + [
+ elem["index"] for elem in configs_storage
+ ] + [index_test],
+ )
+
+ with gr.Tab(lg_conf["tab_help"]):
+ gr.Markdown(lg_conf["tutorial"])
+ gr.Markdown(news)
+
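+ # Play a short alert when a task finishes; yielding None first
+ # resets the hidden audio component before the alert autoplays.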
+ def play_sound_alert(play_sound):
+
+ if not play_sound:
+ return None
+
+ # silent_sound = "assets/empty_audio.mp3"
+ sound_alert = "assets/sound_alert.mp3"
+
+ time.sleep(0.25)
+ # yield silent_sound
+ yield None
+
+ time.sleep(0.25)
+ yield sound_alert
+
+ sound_alert_notification = gr.Audio(
+ value=None,
+ type="filepath",
+ format="mp3",
+ autoplay=True,
+ visible=False,
+ )
+
+ if logs_in_gui:
+ logger.info("Logs in gui need public url")
+
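+ # Tee stdout to output.log so the GUI can display it.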
+ class Logger:
+ def __init__(self, filename):
+ self.terminal = sys.stdout
+ self.log = open(filename, "w")
+
+ def write(self, message):
+ self.terminal.write(message)
+ self.log.write(message)
+
+ def flush(self):
+ self.terminal.flush()
+ self.log.flush()
+
+ def isatty(self):
+ return False
+
+ sys.stdout = Logger("output.log")
+
+ def read_logs():
+ sys.stdout.flush()
+ with open("output.log", "r") as f:
+ return f.read()
+
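+ # Poll output.log once per second and mirror it in the Logs box.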
+ with gr.Accordion("Logs", open=False):
+ logs = gr.Textbox(label=">>>")
+ app.load(read_logs, None, logs, every=1)
+
+ if SoniTr.tts_info.xtts_enabled:
+ # Update tts list
+ def update_tts_list():
+ update_dict = {
+ f"tts_voice{i:02d}": gr.update(choices=SoniTr.tts_info.tts_list())
+ for i in range(MAX_TTS)
+ }
+ update_dict["tts_documents"] = gr.update(
+ choices=list(
+ filter(
+ lambda x: x != "_XTTS_/AUTOMATIC.wav",
+ SoniTr.tts_info.tts_list(),
+ )
+ )
+ )
+ return list(update_dict.values())
+
+ create_xtts_wav.click(
+ create_wav_file_vc,
+ inputs=[
+ wav_speaker_name,
+ wav_speaker_file,
+ wav_speaker_start,
+ wav_speaker_end,
+ wav_speaker_dir,
+ wav_speaker_dereverb,
+ ],
+ outputs=[wav_speaker_output],
+ ).then(
+ update_tts_list,
+ None,
+ [
+ tts_voice00,
+ tts_voice01,
+ tts_voice02,
+ tts_voice03,
+ tts_voice04,
+ tts_voice05,
+ tts_voice06,
+ tts_voice07,
+ tts_voice08,
+ tts_voice09,
+ tts_voice10,
+ tts_voice11,
+ tts_documents,
+ ],
+ )
+
+ # Run translate text
+ subs_button.click(
+ SoniTr.batch_multilingual_media_conversion,
+ inputs=[
+ video_input,
+ blink_input,
+ directory_input,
+ HFKEY,
+ PREVIEW,
+ WHISPER_MODEL_SIZE,
+ batch_size,
+ compute_type,
+ SOURCE_LANGUAGE,
+ TRANSLATE_AUDIO_TO,
+ min_speakers,
+ max_speakers,
+ tts_voice00,
+ tts_voice01,
+ tts_voice02,
+ tts_voice03,
+ tts_voice04,
+ tts_voice05,
+ tts_voice06,
+ tts_voice07,
+ tts_voice08,
+ tts_voice09,
+ tts_voice10,
+ tts_voice11,
+ VIDEO_OUTPUT_NAME,
+ AUDIO_MIX,
+ audio_accelerate,
+ acceleration_rate_regulation_gui,
+ volume_original_mix,
+ volume_translated_mix,
+ sub_type_output,
+ edit_sub_check, # True whenever this button is visible
+ dummy_false_check, # always False here
+ subs_edit_space,
+ avoid_overlap_gui,
+ vocal_refinement_gui,
+ literalize_numbers_gui,
+ segment_duration_limit_gui,
+ diarization_process_dropdown,
+ translate_process_dropdown,
+ input_srt,
+ main_output_type,
+ main_voiceless_track,
+ voice_imitation_gui,
+ voice_imitation_max_segments_gui,
+ voice_imitation_vocals_dereverb_gui,
+ voice_imitation_remove_previous_gui,
+ voice_imitation_method_gui,
+ wav_speaker_dereverb,
+ text_segmentation_scale_gui,
+ divide_text_segments_by_gui,
+ soft_subtitles_to_video_gui,
+ burn_subtitles_to_video_gui,
+ enable_cache_gui,
+ enable_custom_voice,
+ workers_custom_voice,
+ is_gui_dummy_check,
+ ],
+ outputs=subs_edit_space,
+ ).then(
+ play_sound_alert, [play_sound_gui], [sound_alert_notification]
+ )
+
+ # Run translate tts and complete
+ video_button.click(
+ SoniTr.batch_multilingual_media_conversion,
+ inputs=[
+ video_input,
+ blink_input,
+ directory_input,
+ HFKEY,
+ PREVIEW,
+ WHISPER_MODEL_SIZE,
+ batch_size,
+ compute_type,
+ SOURCE_LANGUAGE,
+ TRANSLATE_AUDIO_TO,
+ min_speakers,
+ max_speakers,
+ tts_voice00,
+ tts_voice01,
+ tts_voice02,
+ tts_voice03,
+ tts_voice04,
+ tts_voice05,
+ tts_voice06,
+ tts_voice07,
+ tts_voice08,
+ tts_voice09,
+ tts_voice10,
+ tts_voice11,
+ VIDEO_OUTPUT_NAME,
+ AUDIO_MIX,
+ audio_accelerate,
+ acceleration_rate_regulation_gui,
+ volume_original_mix,
+ volume_translated_mix,
+ sub_type_output,
+ dummy_false_check,
+ edit_sub_check,
+ subs_edit_space,
+ avoid_overlap_gui,
+ vocal_refinement_gui,
+ literalize_numbers_gui,
+ segment_duration_limit_gui,
+ diarization_process_dropdown,
+ translate_process_dropdown,
+ input_srt,
+ main_output_type,
+ main_voiceless_track,
+ voice_imitation_gui,
+ voice_imitation_max_segments_gui,
+ voice_imitation_vocals_dereverb_gui,
+ voice_imitation_remove_previous_gui,
+ voice_imitation_method_gui,
+ wav_speaker_dereverb,
+ text_segmentation_scale_gui,
+ divide_text_segments_by_gui,
+ soft_subtitles_to_video_gui,
+ burn_subtitles_to_video_gui,
+ enable_cache_gui,
+ enable_custom_voice,
+ workers_custom_voice,
+ is_gui_dummy_check,
+ ],
+ outputs=video_output,
+ trigger_mode="multiple",
+ ).then(
+ play_sound_alert, [play_sound_gui], [sound_alert_notification]
+ )
+
+ # Run docs process
+ docs_button.click(
+ SoniTr.multilingual_docs_conversion,
+ inputs=[
+ text_docs,
+ input_docs,
+ directory_input_docs,
+ docs_SOURCE_LANGUAGE,
+ docs_TRANSLATE_TO,
+ tts_documents,
+ docs_OUTPUT_NAME,
+ docs_translate_process_dropdown,
+ docs_output_type,
+ docs_chunk_size,
+ enable_custom_voice,
+ workers_custom_voice,
+ start_page_gui,
+ end_page_gui,
+ videobook_width_gui,
+ videobook_height_gui,
+ videobook_bcolor_gui,
+ docs_dummy_check,
+ ],
+ outputs=docs_output,
+ trigger_mode="multiple",
+ ).then(
+ play_sound_alert, [play_sound_gui], [sound_alert_notification]
+ )
+
+ return app
+
+
+def get_language_config(language_data, language=None, base_key="english"):
+ base_lang = language_data.get(base_key)
+
+ if language not in language_data:
+ logger.error(
+ f"Language {language} not found, defaulting to {base_key}"
+ )
+ return base_lang
+
+ lg_conf = language_data.get(language, {})
+ lg_conf.update((k, v) for k, v in base_lang.items() if k not in lg_conf)
+
+ return lg_conf
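+
+ # Example: keys missing from a language entry fall back to the
+ # English base config:
+ #   lg_conf = get_language_config(language_data, language="spanish")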
+
+
+def create_parser():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
+ parser.add_argument(
+ "--theme",
+ type=str,
+ default="Taithrah/Minimal",
+ help=(
+ "Specify the theme; find themes in "
+ "https://huggingface.co/spaces/gradio/theme-gallery;"
+ " Example: --theme aliabid94/new-theme"
+ ),
+ )
+ parser.add_argument(
+ "--public_url",
+ action="store_true",
+ default=False,
+ help="Enable public link",
+ )
+ parser.add_argument(
+ "--logs_in_gui",
+ action="store_true",
+ default=False,
+ help="Displays the operations performed in Logs",
+ )
+ parser.add_argument(
+ "--verbosity_level",
+ type=str,
+ default="info",
+ help=(
+ "Set logger verbosity level: "
+ "debug, info, warning, error, or critical"
+ ),
+ )
+ parser.add_argument(
+ "--language",
+ type=str,
+ default="english",
+ help=" Select the language of the interface: english, spanish",
+ )
+ parser.add_argument(
+ "--cpu_mode",
+ action="store_true",
+ default=False,
+ help="Enable CPU mode to run the program without utilizing GPU acceleration.",
+ )
+ return parser
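+
+ # Example CLI usage (all flags optional):
+ #   python app_rvc.py --theme Taithrah/Minimal --language english --public_url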
+
+
+if __name__ == "__main__":
+
+ parser = create_parser()
+
+ args = parser.parse_args()
+ # Simulating command-line arguments
+ # args_list = "--theme aliabid94/new-theme --public_url".split()
+ # args = parser.parse_args(args_list)
+
+ set_logging_level(args.verbosity_level)
+
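+ # Download the MDX-Net (UVR) audio separation models before startup.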
+ for id_model in UVR_MODELS:
+ download_manager(
+ os.path.join(MDX_DOWNLOAD_LINK, id_model), mdxnet_models_dir
+ )
+
+ models_path, index_path = upload_model_list()
+
+ SoniTr = SoniTranslate(cpu_mode=args.cpu_mode)
+
+ lg_conf = get_language_config(language_data, language=args.language)
+
+ app = create_gui(args.theme, logs_in_gui=args.logs_in_gui)
+
+ app.queue()
+
+ app.launch(
+ max_threads=1,
+ share=args.public_url,
+ show_error=True,
+ quiet=False,
+ debug=logger.isEnabledFor(logging.DEBUG),
+ )