from gtts import gTTS import edge_tts, asyncio, json, glob # noqa from tqdm import tqdm import librosa, os, re, torch, gc, subprocess # noqa from .language_configuration import ( fix_code_language, BARK_VOICES_LIST, VITS_VOICES_LIST, ) from .utils import ( download_manager, create_directories, copy_files, rename_file, remove_directory_contents, remove_files, run_command, write_chunked, ) import numpy as np from typing import Any, Dict from pathlib import Path import soundfile as sf import platform import logging import traceback from .logging_setup import logger class TTS_OperationError(Exception): def __init__(self, message="The operation did not complete successfully."): self.message = message super().__init__(self.message) def verify_saved_file_and_size(filename): if not os.path.exists(filename): raise TTS_OperationError(f"File '{filename}' was not saved.") if os.path.getsize(filename) == 0: raise TTS_OperationError( f"File '{filename}' has a zero size. " "Related to incorrect TTS for the target language" ) def error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename): traceback.print_exc() logger.error(f"Error: {str(error)}") try: from tempfile import TemporaryFile tts = gTTS(segment["text"], lang=fix_code_language(TRANSLATE_AUDIO_TO)) # tts.save(filename) f = TemporaryFile() tts.write_to_fp(f) # Reset the file pointer to the beginning of the file f.seek(0) # Read audio data from the TemporaryFile using soundfile audio_data, samplerate = sf.read(f) f.close() # Close the TemporaryFile write_chunked( filename, audio_data, samplerate, format="ogg", subtype="vorbis" ) logger.warning( 'TTS auxiliary will be utilized ' f'rather than TTS: {segment["tts_name"]}' ) verify_saved_file_and_size(filename) except Exception as error: logger.critical(f"Error: {str(error)}") sample_rate_aux = 22050 duration = float(segment["end"]) - float(segment["start"]) data = np.zeros(int(sample_rate_aux * duration)).astype(np.float32) write_chunked( filename, data, sample_rate_aux, format="ogg", subtype="vorbis" ) logger.error("Audio will be replaced -> [silent audio].") verify_saved_file_and_size(filename) def pad_array(array, sr): if isinstance(array, list): array = np.array(array) if not array.shape[0]: raise ValueError("The generated audio does not contain any data") valid_indices = np.where(np.abs(array) > 0.001)[0] if len(valid_indices) == 0: logger.debug(f"No valid indices: {array}") return array try: pad_indice = int(0.1 * sr) start_pad = max(0, valid_indices[0] - pad_indice) end_pad = min(len(array), valid_indices[-1] + 1 + pad_indice) padded_array = array[start_pad:end_pad] return padded_array except Exception as error: logger.error(str(error)) return array # ===================================== # EDGE TTS # ===================================== def edge_tts_voices_list(): try: completed_process = subprocess.run( ["edge-tts", "--list-voices"], capture_output=True, text=True ) lines = completed_process.stdout.strip().split("\n") except Exception as error: logger.debug(str(error)) lines = [] voices = [] for line in lines: if line.startswith("Name: "): voice_entry = {} voice_entry["Name"] = line.split(": ")[1] elif line.startswith("Gender: "): voice_entry["Gender"] = line.split(": ")[1] voices.append(voice_entry) formatted_voices = [ f"{entry['Name']}-{entry['Gender']}" for entry in voices ] if not formatted_voices: logger.warning( "The list of Edge TTS voices could not be obtained, " "switching to an alternative method" ) tts_voice_list = asyncio.new_event_loop().run_until_complete( 
edge_tts.list_voices() ) formatted_voices = sorted( [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list] ) if not formatted_voices: logger.error("Can't get EDGE TTS - list voices") return formatted_voices def segments_egde_tts(filtered_edge_segments, TRANSLATE_AUDIO_TO, is_gui): for segment in tqdm(filtered_edge_segments["segments"]): speaker = segment["speaker"] # noqa text = segment["text"] start = segment["start"] tts_name = segment["tts_name"] # make the tts audio filename = f"audio/{start}.ogg" temp_file = filename[:-3] + "mp3" logger.info(f"{text} >> {filename}") try: if is_gui: asyncio.run( edge_tts.Communicate( text, "-".join(tts_name.split("-")[:-1]) ).save(temp_file) ) else: # nest_asyncio.apply() if not is_gui else None command = f'edge-tts -t "{text}" -v "{tts_name.replace("-Male", "").replace("-Female", "")}" --write-media "{temp_file}"' run_command(command) verify_saved_file_and_size(temp_file) data, sample_rate = sf.read(temp_file) data = pad_array(data, sample_rate) # os.remove(temp_file) # Save file write_chunked( file=filename, samplerate=sample_rate, data=data, format="ogg", subtype="vorbis", ) verify_saved_file_and_size(filename) except Exception as error: error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename) # ===================================== # BARK TTS # ===================================== def segments_bark_tts( filtered_bark_segments, TRANSLATE_AUDIO_TO, model_id_bark="suno/bark-small" ): from transformers import AutoProcessor, BarkModel from optimum.bettertransformer import BetterTransformer device = os.environ.get("SONITR_DEVICE") torch_dtype_env = torch.float16 if device == "cuda" else torch.float32 # load model bark model = BarkModel.from_pretrained( model_id_bark, torch_dtype=torch_dtype_env ).to(device) model = model.to(device) processor = AutoProcessor.from_pretrained( model_id_bark, return_tensors="pt" ) # , padding=True if device == "cuda": # convert to bettertransformer model = BetterTransformer.transform(model, keep_original_model=False) # enable CPU offload # model.enable_cpu_offload() sampling_rate = model.generation_config.sample_rate # filtered_segments = filtered_bark_segments['segments'] # Sorting the segments by 'tts_name' # sorted_segments = sorted(filtered_segments, key=lambda x: x['tts_name']) # logger.debug(sorted_segments) for segment in tqdm(filtered_bark_segments["segments"]): speaker = segment["speaker"] # noqa text = segment["text"] start = segment["start"] tts_name = segment["tts_name"] inputs = processor(text, voice_preset=BARK_VOICES_LIST[tts_name]).to( device ) # make the tts audio filename = f"audio/{start}.ogg" logger.info(f"{text} >> {filename}") try: # Infer with torch.inference_mode(): speech_output = model.generate( **inputs, do_sample=True, fine_temperature=0.4, coarse_temperature=0.8, pad_token_id=processor.tokenizer.pad_token_id, ) # Save file data_tts = pad_array( speech_output.cpu().numpy().squeeze().astype(np.float32), sampling_rate, ) write_chunked( file=filename, samplerate=sampling_rate, data=data_tts, format="ogg", subtype="vorbis", ) verify_saved_file_and_size(filename) except Exception as error: error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename) gc.collect() torch.cuda.empty_cache() try: del processor del model gc.collect() torch.cuda.empty_cache() except Exception as error: logger.error(str(error)) gc.collect() torch.cuda.empty_cache() # ===================================== # VITS TTS # ===================================== def uromanize(input_string): """Convert non-Roman 
    strings to Roman using the `uroman` perl package."""
    # script_path = os.path.join(uroman_path, "bin", "uroman.pl")

    if not os.path.exists("./uroman"):
        logger.info(
            "Cloning the uroman repository "
            "https://github.com/isi-nlp/uroman.git to romanize the text"
        )
        process = subprocess.Popen(
            ["git", "clone", "https://github.com/isi-nlp/uroman.git"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout, stderr = process.communicate()
    script_path = os.path.join("./uroman", "uroman", "uroman.pl")

    command = ["perl", script_path]
    process = subprocess.Popen(
        command,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Execute the perl command
    stdout, stderr = process.communicate(input=input_string.encode())

    if process.returncode != 0:
        raise ValueError(f"Error {process.returncode}: {stderr.decode()}")

    # Return the output as a string and skip the new-line character at the end
    return stdout.decode()[:-1]


def segments_vits_tts(filtered_vits_segments, TRANSLATE_AUDIO_TO):
    from transformers import VitsModel, AutoTokenizer

    filtered_segments = filtered_vits_segments["segments"]
    # Sorting the segments by 'tts_name'
    sorted_segments = sorted(filtered_segments, key=lambda x: x["tts_name"])
    logger.debug(sorted_segments)

    model_name_key = None
    for segment in tqdm(sorted_segments):
        speaker = segment["speaker"]  # noqa
        text = segment["text"]
        start = segment["start"]
        tts_name = segment["tts_name"]

        if tts_name != model_name_key:
            model_name_key = tts_name
            model = VitsModel.from_pretrained(VITS_VOICES_LIST[tts_name])
            tokenizer = AutoTokenizer.from_pretrained(
                VITS_VOICES_LIST[tts_name]
            )
            sampling_rate = model.config.sampling_rate

        if tokenizer.is_uroman:
            romanize_text = uromanize(text)
            logger.debug(f"Romanized text: {romanize_text}")
            inputs = tokenizer(romanize_text, return_tensors="pt")
        else:
            inputs = tokenizer(text, return_tensors="pt")

        # make the tts audio
        filename = f"audio/{start}.ogg"
        logger.info(f"{text} >> {filename}")

        try:
            # Infer
            with torch.no_grad():
                speech_output = model(**inputs).waveform

            data_tts = pad_array(
                speech_output.cpu().numpy().squeeze().astype(np.float32),
                sampling_rate,
            )
            # Save file
            write_chunked(
                file=filename,
                samplerate=sampling_rate,
                data=data_tts,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)
        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)

    gc.collect()
    torch.cuda.empty_cache()
    try:
        del tokenizer
        del model
        gc.collect()
        torch.cuda.empty_cache()
    except Exception as error:
        logger.error(str(error))
        gc.collect()
        torch.cuda.empty_cache()


# =====================================
# Coqui XTTS
# =====================================
def coqui_xtts_voices_list():
    main_folder = "_XTTS_"
    pattern_coqui = re.compile(r".+\.(wav|mp3|ogg|m4a)$")
    pattern_automatic_speaker = re.compile(r"AUTOMATIC_SPEAKER_\d+\.wav$")

    # List only files in the directory matching the pattern but not matching
    # AUTOMATIC_SPEAKER_00.wav, AUTOMATIC_SPEAKER_01.wav, etc.
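    # For example, with the patterns above:
    #   pattern_coqui.match("my_voice.wav")                         -> kept
    #   pattern_automatic_speaker.match("AUTOMATIC_SPEAKER_00.wav") -> excluded
    # re.match() anchors at the start of the filename, so only files that
    # begin with the AUTOMATIC_SPEAKER_ prefix are filtered out.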
wav_voices = [ "_XTTS_/" + f for f in os.listdir(main_folder) if os.path.isfile(os.path.join(main_folder, f)) and pattern_coqui.match(f) and not pattern_automatic_speaker.match(f) ] return ["_XTTS_/AUTOMATIC.wav"] + wav_voices def seconds_to_hhmmss_ms(seconds): hours = seconds // 3600 minutes = (seconds % 3600) // 60 seconds = seconds % 60 milliseconds = int((seconds - int(seconds)) * 1000) return "%02d:%02d:%02d.%03d" % (hours, minutes, int(seconds), milliseconds) def audio_trimming(audio_path, destination, start, end): if isinstance(start, (int, float)): start = seconds_to_hhmmss_ms(start) if isinstance(end, (int, float)): end = seconds_to_hhmmss_ms(end) if destination: file_directory = destination else: file_directory = os.path.dirname(audio_path) file_name = os.path.splitext(os.path.basename(audio_path))[0] file_ = f"{file_name}_trim.wav" # file_ = f'{os.path.splitext(audio_path)[0]}_trim.wav' output_path = os.path.join(file_directory, file_) # -t (duration from -ss) | -to (time stop) | -af silenceremove=1:0:-50dB (remove silence) command = f'ffmpeg -y -loglevel error -i "{audio_path}" -ss {start} -to {end} -acodec pcm_s16le -f wav "{output_path}"' run_command(command) return output_path def convert_to_xtts_good_sample(audio_path: str = "", destination: str = ""): if destination: file_directory = destination else: file_directory = os.path.dirname(audio_path) file_name = os.path.splitext(os.path.basename(audio_path))[0] file_ = f"{file_name}_good_sample.wav" # file_ = f'{os.path.splitext(audio_path)[0]}_good_sample.wav' mono_path = os.path.join(file_directory, file_) # get root command = f'ffmpeg -y -loglevel error -i "{audio_path}" -ac 1 -ar 22050 -sample_fmt s16 -f wav "{mono_path}"' run_command(command) return mono_path def sanitize_file_name(file_name): import unicodedata # Normalize the string to NFKD form to separate combined characters into # base characters and diacritics normalized_name = unicodedata.normalize("NFKD", file_name) # Replace any non-ASCII characters or special symbols with an underscore sanitized_name = re.sub(r"[^\w\s.-]", "_", normalized_name) return sanitized_name def create_wav_file_vc( sample_name="", # name final file audio_wav="", # path start=None, # trim start end=None, # trim end output_final_path="_XTTS_", get_vocals_dereverb=True, ): sample_name = sample_name if sample_name else "default_name" sample_name = sanitize_file_name(sample_name) audio_wav = audio_wav if isinstance(audio_wav, str) else audio_wav.name BASE_DIR = ( "." 
# os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ) output_dir = os.path.join(BASE_DIR, "clean_song_output") # remove content # remove_directory_contents(output_dir) if start or end: # Cut file audio_segment = audio_trimming(audio_wav, output_dir, start, end) else: # Complete file audio_segment = audio_wav from .mdx_net import process_uvr_task try: _, _, _, _, audio_segment = process_uvr_task( orig_song_path=audio_segment, main_vocals=True, dereverb=get_vocals_dereverb, ) except Exception as error: logger.error(str(error)) sample = convert_to_xtts_good_sample(audio_segment) sample_name = f"{sample_name}.wav" sample_rename = rename_file(sample, sample_name) copy_files(sample_rename, output_final_path) final_sample = os.path.join(output_final_path, sample_name) if os.path.exists(final_sample): logger.info(final_sample) return final_sample else: raise Exception(f"Error wav: {final_sample}") def create_new_files_for_vc( speakers_coqui, segments_base, dereverb_automatic=True ): # before function delete automatic delete_previous_automatic output_dir = os.path.join(".", "clean_song_output") # remove content remove_directory_contents(output_dir) for speaker in speakers_coqui: filtered_speaker = [ segment for segment in segments_base if segment["speaker"] == speaker ] if len(filtered_speaker) > 4: filtered_speaker = filtered_speaker[1:] if filtered_speaker[0]["tts_name"] == "_XTTS_/AUTOMATIC.wav": name_automatic_wav = f"AUTOMATIC_{speaker}" if os.path.exists(f"_XTTS_/{name_automatic_wav}.wav"): logger.info(f"WAV automatic {speaker} exists") # path_wav = path_automatic_wav pass else: # create wav wav_ok = False for seg in filtered_speaker: duration = float(seg["end"]) - float(seg["start"]) if duration > 7.0 and duration < 12.0: logger.info( f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {duration}, {seg["text"]}' ) create_wav_file_vc( sample_name=name_automatic_wav, audio_wav="audio.wav", start=(float(seg["start"]) + 1.0), end=(float(seg["end"]) - 1.0), get_vocals_dereverb=dereverb_automatic, ) wav_ok = True break if not wav_ok: logger.info("Taking the first segment") seg = filtered_speaker[0] logger.info( f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {seg["text"]}' ) max_duration = float(seg["end"]) - float(seg["start"]) max_duration = max(2.0, min(max_duration, 9.0)) create_wav_file_vc( sample_name=name_automatic_wav, audio_wav="audio.wav", start=(float(seg["start"])), end=(float(seg["start"]) + max_duration), get_vocals_dereverb=dereverb_automatic, ) def segments_coqui_tts( filtered_coqui_segments, TRANSLATE_AUDIO_TO, model_id_coqui="tts_models/multilingual/multi-dataset/xtts_v2", speakers_coqui=None, delete_previous_automatic=True, dereverb_automatic=True, emotion=None, ): """XTTS Install: pip install -q TTS==0.21.1 pip install -q numpy==1.23.5 Notes: - tts_name is the wav|mp3|ogg|m4a file for VC """ from TTS.api import TTS TRANSLATE_AUDIO_TO = fix_code_language(TRANSLATE_AUDIO_TO, syntax="coqui") supported_lang_coqui = [ "zh-cn", "en", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "es", "hu", "ko", "ja", ] if TRANSLATE_AUDIO_TO not in supported_lang_coqui: raise TTS_OperationError( f"'{TRANSLATE_AUDIO_TO}' is not a supported language for Coqui XTTS" ) # Emotion and speed can only be used with Coqui Studio models. 
discontinued # emotions = ["Neutral", "Happy", "Sad", "Angry", "Dull"] if delete_previous_automatic: for spk in speakers_coqui: remove_files(f"_XTTS_/AUTOMATIC_{spk}.wav") directory_audios_vc = "_XTTS_" create_directories(directory_audios_vc) create_new_files_for_vc( speakers_coqui, filtered_coqui_segments["segments"], dereverb_automatic, ) # Init TTS device = os.environ.get("SONITR_DEVICE") model = TTS(model_id_coqui).to(device) sampling_rate = 24000 # filtered_segments = filtered_coqui_segments['segments'] # Sorting the segments by 'tts_name' # sorted_segments = sorted(filtered_segments, key=lambda x: x['tts_name']) # logger.debug(sorted_segments) for segment in tqdm(filtered_coqui_segments["segments"]): speaker = segment["speaker"] text = segment["text"] start = segment["start"] tts_name = segment["tts_name"] if tts_name == "_XTTS_/AUTOMATIC.wav": tts_name = f"_XTTS_/AUTOMATIC_{speaker}.wav" # make the tts audio filename = f"audio/{start}.ogg" logger.info(f"{text} >> {filename}") try: # Infer wav = model.tts( text=text, speaker_wav=tts_name, language=TRANSLATE_AUDIO_TO ) data_tts = pad_array( wav, sampling_rate, ) # Save file write_chunked( file=filename, samplerate=sampling_rate, data=data_tts, format="ogg", subtype="vorbis", ) verify_saved_file_and_size(filename) except Exception as error: error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename) gc.collect() torch.cuda.empty_cache() try: del model gc.collect() torch.cuda.empty_cache() except Exception as error: logger.error(str(error)) gc.collect() torch.cuda.empty_cache() # ===================================== # PIPER TTS # ===================================== def piper_tts_voices_list(): file_path = download_manager( url="https://huggingface.co/rhasspy/piper-voices/resolve/main/voices.json", path="./PIPER_MODELS", ) with open(file_path, "r", encoding="utf8") as file: data = json.load(file) piper_id_models = [key + " VITS-onnx" for key in data.keys()] return piper_id_models def replace_text_in_json(file_path, key_to_replace, new_text, condition=None): # Read the JSON file with open(file_path, "r", encoding="utf-8") as file: data = json.load(file) # Modify the specified key's value with the new text if key_to_replace in data: if condition: value_condition = condition else: value_condition = data[key_to_replace] if data[key_to_replace] == value_condition: data[key_to_replace] = new_text # Write the modified content back to the JSON file with open(file_path, "w") as file: json.dump( data, file, indent=2 ) # Write the modified data back to the file with indentation for readability def load_piper_model( model: str, data_dir: list, download_dir: str = "", update_voices: bool = False, ): from piper import PiperVoice from piper.download import ensure_voice_exists, find_voice, get_voices try: import onnxruntime as rt if rt.get_device() == "GPU" and os.environ.get("SONITR_DEVICE") == "cuda": logger.debug("onnxruntime device > GPU") cuda = True else: logger.info( "onnxruntime device > CPU" ) # try pip install onnxruntime-gpu cuda = False except Exception as error: raise TTS_OperationError(f"onnxruntime error: {str(error)}") # Disable CUDA in Windows if platform.system() == "Windows": logger.info("Employing CPU exclusivity with Piper TTS") cuda = False if not download_dir: # Download to first data directory by default download_dir = data_dir[0] else: data_dir = [os.path.join(data_dir[0], download_dir)] # Download voice if file doesn't exist model_path = Path(model) if not model_path.exists(): # Load voice info voices_info = 
get_voices(download_dir, update_voices=update_voices) # Resolve aliases for backwards compatibility with old voice names aliases_info: Dict[str, Any] = {} for voice_info in voices_info.values(): for voice_alias in voice_info.get("aliases", []): aliases_info[voice_alias] = {"_is_alias": True, **voice_info} voices_info.update(aliases_info) ensure_voice_exists(model, data_dir, download_dir, voices_info) model, config = find_voice(model, data_dir) replace_text_in_json( config, "phoneme_type", "espeak", "PhonemeType.ESPEAK" ) # Load voice voice = PiperVoice.load(model, config_path=config, use_cuda=cuda) return voice def synthesize_text_to_audio_np_array(voice, text, synthesize_args): audio_stream = voice.synthesize_stream_raw(text, **synthesize_args) # Collect the audio bytes into a single NumPy array audio_data = b"" for audio_bytes in audio_stream: audio_data += audio_bytes # Ensure correct data type and convert audio bytes to NumPy array audio_np = np.frombuffer(audio_data, dtype=np.int16) return audio_np def segments_vits_onnx_tts(filtered_onnx_vits_segments, TRANSLATE_AUDIO_TO): """ Install: pip install -q piper-tts==1.2.0 onnxruntime-gpu # for cuda118 """ data_dir = [ str(Path.cwd()) ] # "Data directory to check for downloaded models (default: current directory)" download_dir = "PIPER_MODELS" # model_name = "en_US-lessac-medium" tts_name in a dict like VITS update_voices = True # "Download latest voices.json during startup", synthesize_args = { "speaker_id": None, "length_scale": 1.0, "noise_scale": 0.667, "noise_w": 0.8, "sentence_silence": 0.0, } filtered_segments = filtered_onnx_vits_segments["segments"] # Sorting the segments by 'tts_name' sorted_segments = sorted(filtered_segments, key=lambda x: x["tts_name"]) logger.debug(sorted_segments) model_name_key = None for segment in tqdm(sorted_segments): speaker = segment["speaker"] # noqa text = segment["text"] start = segment["start"] tts_name = segment["tts_name"].replace(" VITS-onnx", "") if tts_name != model_name_key: model_name_key = tts_name model = load_piper_model( tts_name, data_dir, download_dir, update_voices ) sampling_rate = model.config.sample_rate # make the tts audio filename = f"audio/{start}.ogg" logger.info(f"{text} >> {filename}") try: # Infer speech_output = synthesize_text_to_audio_np_array( model, text, synthesize_args ) data_tts = pad_array( speech_output, # .cpu().numpy().squeeze().astype(np.float32), sampling_rate, ) # Save file write_chunked( file=filename, samplerate=sampling_rate, data=data_tts, format="ogg", subtype="vorbis", ) verify_saved_file_and_size(filename) except Exception as error: error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename) gc.collect() torch.cuda.empty_cache() try: del model gc.collect() torch.cuda.empty_cache() except Exception as error: logger.error(str(error)) gc.collect() torch.cuda.empty_cache() # ===================================== # CLOSEAI TTS # ===================================== def segments_openai_tts( filtered_openai_tts_segments, TRANSLATE_AUDIO_TO ): from openai import OpenAI client = OpenAI() sampling_rate = 24000 # filtered_segments = filtered_openai_tts_segments['segments'] # Sorting the segments by 'tts_name' # sorted_segments = sorted(filtered_segments, key=lambda x: x['tts_name']) for segment in tqdm(filtered_openai_tts_segments["segments"]): speaker = segment["speaker"] # noqa text = segment["text"].strip() start = segment["start"] tts_name = segment["tts_name"] # make the tts audio filename = f"audio/{start}.ogg" logger.info(f"{text} >> {filename}") 
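        # Note on the request below: the voice id is taken from the label by
        # dropping a one-character prefix from its first token
        # (e.g. ">alloy OpenAI-TTS" -> "alloy"); the exact label format is an
        # assumption here. The WAV bytes returned by the API are interpreted
        # as 16-bit PCM at 24 kHz, and the first 240 samples (~10 ms) are
        # skipped, which discards the RIFF/WAV header plus a short lead-in.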
try: # Request response = client.audio.speech.create( model="tts-1-hd" if "HD" in tts_name else "tts-1", voice=tts_name.split()[0][1:], response_format="wav", input=text ) audio_bytes = b'' for data in response.iter_bytes(chunk_size=4096): audio_bytes += data speech_output = np.frombuffer(audio_bytes, dtype=np.int16) # Save file data_tts = pad_array( speech_output[240:], sampling_rate, ) write_chunked( file=filename, samplerate=sampling_rate, data=data_tts, format="ogg", subtype="vorbis", ) verify_saved_file_and_size(filename) except Exception as error: error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename) # ===================================== # Select task TTS # ===================================== def find_spkr(pattern, speaker_to_voice, segments): return [ speaker for speaker, voice in speaker_to_voice.items() if pattern.match(voice) and any( segment["speaker"] == speaker for segment in segments ) ] def filter_by_speaker(speakers, segments): return { "segments": [ segment for segment in segments if segment["speaker"] in speakers ] } def audio_segmentation_to_voice( result_diarize, TRANSLATE_AUDIO_TO, is_gui, tts_voice00, tts_voice01="", tts_voice02="", tts_voice03="", tts_voice04="", tts_voice05="", tts_voice06="", tts_voice07="", tts_voice08="", tts_voice09="", tts_voice10="", tts_voice11="", dereverb_automatic=True, model_id_bark="suno/bark-small", model_id_coqui="tts_models/multilingual/multi-dataset/xtts_v2", delete_previous_automatic=True, ): remove_directory_contents("audio") # Mapping speakers to voice variables speaker_to_voice = { "SPEAKER_00": tts_voice00, "SPEAKER_01": tts_voice01, "SPEAKER_02": tts_voice02, "SPEAKER_03": tts_voice03, "SPEAKER_04": tts_voice04, "SPEAKER_05": tts_voice05, "SPEAKER_06": tts_voice06, "SPEAKER_07": tts_voice07, "SPEAKER_08": tts_voice08, "SPEAKER_09": tts_voice09, "SPEAKER_10": tts_voice10, "SPEAKER_11": tts_voice11, } # Assign 'SPEAKER_00' to segments without a 'speaker' key for segment in result_diarize["segments"]: if "speaker" not in segment: segment["speaker"] = "SPEAKER_00" logger.warning( "NO SPEAKER DETECT IN SEGMENT: First TTS will be used in the" f" segment time {segment['start'], segment['text']}" ) # Assign the TTS name segment["tts_name"] = speaker_to_voice[segment["speaker"]] # Find TTS method pattern_edge = re.compile(r".*-(Male|Female)$") pattern_bark = re.compile(r".* BARK$") pattern_vits = re.compile(r".* VITS$") pattern_coqui = re.compile(r".+\.(wav|mp3|ogg|m4a)$") pattern_vits_onnx = re.compile(r".* VITS-onnx$") pattern_openai_tts = re.compile(r".* OpenAI-TTS$") all_segments = result_diarize["segments"] speakers_edge = find_spkr(pattern_edge, speaker_to_voice, all_segments) speakers_bark = find_spkr(pattern_bark, speaker_to_voice, all_segments) speakers_vits = find_spkr(pattern_vits, speaker_to_voice, all_segments) speakers_coqui = find_spkr(pattern_coqui, speaker_to_voice, all_segments) speakers_vits_onnx = find_spkr( pattern_vits_onnx, speaker_to_voice, all_segments ) speakers_openai_tts = find_spkr( pattern_openai_tts, speaker_to_voice, all_segments ) # Filter method in segments filtered_edge = filter_by_speaker(speakers_edge, all_segments) filtered_bark = filter_by_speaker(speakers_bark, all_segments) filtered_vits = filter_by_speaker(speakers_vits, all_segments) filtered_coqui = filter_by_speaker(speakers_coqui, all_segments) filtered_vits_onnx = filter_by_speaker(speakers_vits_onnx, all_segments) filtered_openai_tts = filter_by_speaker(speakers_openai_tts, all_segments) # Infer if 
filtered_edge["segments"]: logger.info(f"EDGE TTS: {speakers_edge}") segments_egde_tts(filtered_edge, TRANSLATE_AUDIO_TO, is_gui) # mp3 if filtered_bark["segments"]: logger.info(f"BARK TTS: {speakers_bark}") segments_bark_tts( filtered_bark, TRANSLATE_AUDIO_TO, model_id_bark ) # wav if filtered_vits["segments"]: logger.info(f"VITS TTS: {speakers_vits}") segments_vits_tts(filtered_vits, TRANSLATE_AUDIO_TO) # wav if filtered_coqui["segments"]: logger.info(f"Coqui TTS: {speakers_coqui}") segments_coqui_tts( filtered_coqui, TRANSLATE_AUDIO_TO, model_id_coqui, speakers_coqui, delete_previous_automatic, dereverb_automatic, ) # wav if filtered_vits_onnx["segments"]: logger.info(f"PIPER TTS: {speakers_vits_onnx}") segments_vits_onnx_tts(filtered_vits_onnx, TRANSLATE_AUDIO_TO) # wav if filtered_openai_tts["segments"]: logger.info(f"OpenAI TTS: {speakers_openai_tts}") segments_openai_tts(filtered_openai_tts, TRANSLATE_AUDIO_TO) # wav [result.pop("tts_name", None) for result in result_diarize["segments"]] return [ speakers_edge, speakers_bark, speakers_vits, speakers_coqui, speakers_vits_onnx, speakers_openai_tts ] def accelerate_segments( result_diarize, max_accelerate_audio, valid_speakers, acceleration_rate_regulation=False, folder_output="audio2", ): logger.info("Apply acceleration") ( speakers_edge, speakers_bark, speakers_vits, speakers_coqui, speakers_vits_onnx, speakers_openai_tts ) = valid_speakers create_directories(f"{folder_output}/audio/") remove_directory_contents(f"{folder_output}/audio/") audio_files = [] speakers_list = [] max_count_segments_idx = len(result_diarize["segments"]) - 1 for i, segment in tqdm(enumerate(result_diarize["segments"])): text = segment["text"] # noqa start = segment["start"] end = segment["end"] speaker = segment["speaker"] # find name audio # if speaker in speakers_edge: filename = f"audio/{start}.ogg" # elif speaker in speakers_bark + speakers_vits + speakers_coqui + speakers_vits_onnx: # filename = f"audio/{start}.wav" # wav # duration duration_true = end - start duration_tts = librosa.get_duration(filename=filename) # Accelerate percentage acc_percentage = duration_tts / duration_true # Smoth if acceleration_rate_regulation and acc_percentage >= 1.3: try: next_segment = result_diarize["segments"][ min(max_count_segments_idx, i + 1) ] next_start = next_segment["start"] next_speaker = next_segment["speaker"] duration_with_next_start = next_start - start if duration_with_next_start > duration_true: extra_time = duration_with_next_start - duration_true if speaker == next_speaker: # half smoth_duration = duration_true + (extra_time * 0.5) else: # 7/10 smoth_duration = duration_true + (extra_time * 0.7) logger.debug( f"Base acc: {acc_percentage}, " f"smoth acc: {duration_tts / smoth_duration}" ) acc_percentage = max(1.2, (duration_tts / smoth_duration)) except Exception as error: logger.error(str(error)) if acc_percentage > max_accelerate_audio: acc_percentage = max_accelerate_audio elif acc_percentage <= 1.15 and acc_percentage >= 0.8: acc_percentage = 1.0 elif acc_percentage <= 0.79: acc_percentage = 0.8 # Round acc_percentage = round(acc_percentage + 0.0, 1) # Format read if need if speaker in speakers_edge: info_enc = sf.info(filename).format else: info_enc = "OGG" # Apply aceleration or opposite to the audio file in folder_output folder if acc_percentage == 1.0 and info_enc == "OGG": copy_files(filename, f"{folder_output}{os.sep}audio") else: os.system( f"ffmpeg -y -loglevel panic -i {filename} -filter:a atempo={acc_percentage} {folder_output}/{filename}" ) 
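            # The atempo factor used above has already been clamped to
            # roughly [0.8, max_accelerate_audio] and rounded to one decimal.
            # ffmpeg's atempo filter only accepts a limited range
            # (0.5-2.0 on older builds, wider on recent ones), so factors
            # above 2.0 may need a recent ffmpeg or a chained filter such as
            # "atempo=2.0,atempo=1.05".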
if logger.isEnabledFor(logging.DEBUG): duration_create = librosa.get_duration( filename=f"{folder_output}/{filename}" ) logger.debug( f"acc_percen is {acc_percentage}, tts duration " f"is {duration_tts}, new duration is {duration_create}" f", for {filename}" ) audio_files.append(f"{folder_output}/{filename}") speaker = "TTS Speaker {:02d}".format(int(speaker[-2:]) + 1) speakers_list.append(speaker) return audio_files, speakers_list # ===================================== # Tone color converter # ===================================== def se_process_audio_segments( source_seg, tone_color_converter, device, remove_previous_processed=True ): # list wav seg source_audio_segs = glob.glob(f"{source_seg}/*.wav") if not source_audio_segs: raise ValueError( f"No audio segments found in {str(source_audio_segs)}" ) source_se_path = os.path.join(source_seg, "se.pth") # if exist not create wav if os.path.isfile(source_se_path): se = torch.load(source_se_path).to(device) logger.debug(f"Previous created {source_se_path}") else: se = tone_color_converter.extract_se(source_audio_segs, source_se_path) return se def create_wav_vc( valid_speakers, segments_base, audio_name, max_segments=10, target_dir="processed", get_vocals_dereverb=False, ): # valid_speakers = list({item['speaker'] for item in segments_base}) # Before function delete automatic delete_previous_automatic output_dir = os.path.join(".", target_dir) # remove content # remove_directory_contents(output_dir) path_source_segments = [] path_target_segments = [] for speaker in valid_speakers: filtered_speaker = [ segment for segment in segments_base if segment["speaker"] == speaker ] if len(filtered_speaker) > 4: filtered_speaker = filtered_speaker[1:] dir_name_speaker = speaker + audio_name dir_name_speaker_tts = "tts" + speaker + audio_name dir_path_speaker = os.path.join(output_dir, dir_name_speaker) dir_path_speaker_tts = os.path.join(output_dir, dir_name_speaker_tts) create_directories([dir_path_speaker, dir_path_speaker_tts]) path_target_segments.append(dir_path_speaker) path_source_segments.append(dir_path_speaker_tts) # create wav max_segments_count = 0 for seg in filtered_speaker: duration = float(seg["end"]) - float(seg["start"]) if duration > 3.0 and duration < 18.0: logger.info( f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {duration}, {seg["text"]}' ) name_new_wav = str(seg["start"]) check_segment_audio_target_file = os.path.join( dir_path_speaker, f"{name_new_wav}.wav" ) if os.path.exists(check_segment_audio_target_file): logger.debug( "Segment vc source exists: " f"{check_segment_audio_target_file}" ) pass else: create_wav_file_vc( sample_name=name_new_wav, audio_wav="audio.wav", start=(float(seg["start"]) + 1.0), end=(float(seg["end"]) - 1.0), output_final_path=dir_path_speaker, get_vocals_dereverb=get_vocals_dereverb, ) file_name_tts = f"audio2/audio/{str(seg['start'])}.ogg" # copy_files(file_name_tts, os.path.join(output_dir, dir_name_speaker_tts) convert_to_xtts_good_sample( file_name_tts, dir_path_speaker_tts ) max_segments_count += 1 if max_segments_count == max_segments: break if max_segments_count == 0: logger.info("Taking the first segment") seg = filtered_speaker[0] logger.info( f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {seg["text"]}' ) max_duration = float(seg["end"]) - float(seg["start"]) max_duration = max(1.0, min(max_duration, 18.0)) name_new_wav = str(seg["start"]) create_wav_file_vc( sample_name=name_new_wav, audio_wav="audio.wav", start=(float(seg["start"])), 
end=(float(seg["start"]) + max_duration), output_final_path=dir_path_speaker, get_vocals_dereverb=get_vocals_dereverb, ) file_name_tts = f"audio2/audio/{str(seg['start'])}.ogg" # copy_files(file_name_tts, os.path.join(output_dir, dir_name_speaker_tts) convert_to_xtts_good_sample(file_name_tts, dir_path_speaker_tts) logger.debug(f"Base: {str(path_source_segments)}") logger.debug(f"Target: {str(path_target_segments)}") return path_source_segments, path_target_segments def toneconverter_openvoice( result_diarize, preprocessor_max_segments, remove_previous_process=True, get_vocals_dereverb=False, model="openvoice", ): audio_path = "audio.wav" # se_path = "se.pth" target_dir = "processed" create_directories(target_dir) from openvoice import se_extractor from openvoice.api import ToneColorConverter audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{se_extractor.hash_numpy_array(audio_path)}" # se_path = os.path.join(target_dir, audio_name, 'se.pth') # create wav seg original and target valid_speakers = list( {item["speaker"] for item in result_diarize["segments"]} ) logger.info("Openvoice preprocessor...") if remove_previous_process: remove_directory_contents(target_dir) path_source_segments, path_target_segments = create_wav_vc( valid_speakers, result_diarize["segments"], audio_name, max_segments=preprocessor_max_segments, get_vocals_dereverb=get_vocals_dereverb, ) logger.info("Openvoice loading model...") model_path_openvoice = "./OPENVOICE_MODELS" url_model_openvoice = "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter" if "v2" in model: model_path = os.path.join(model_path_openvoice, "v2") url_model_openvoice = url_model_openvoice.replace( "OpenVoice", "OpenVoiceV2" ).replace("checkpoints/", "") else: model_path = os.path.join(model_path_openvoice, "v1") create_directories(model_path) config_url = f"{url_model_openvoice}/config.json" checkpoint_url = f"{url_model_openvoice}/checkpoint.pth" config_path = download_manager(url=config_url, path=model_path) checkpoint_path = download_manager( url=checkpoint_url, path=model_path ) device = os.environ.get("SONITR_DEVICE") tone_color_converter = ToneColorConverter(config_path, device=device) tone_color_converter.load_ckpt(checkpoint_path) logger.info("Openvoice tone color converter:") global_progress_bar = tqdm(total=len(result_diarize["segments"]), desc="Progress") for source_seg, target_seg, speaker in zip( path_source_segments, path_target_segments, valid_speakers ): # source_se_path = os.path.join(source_seg, 'se.pth') source_se = se_process_audio_segments(source_seg, tone_color_converter, device) # target_se_path = os.path.join(target_seg, 'se.pth') target_se = se_process_audio_segments(target_seg, tone_color_converter, device) # Iterate throw segments encode_message = "@MyShell" filtered_speaker = [ segment for segment in result_diarize["segments"] if segment["speaker"] == speaker ] for seg in filtered_speaker: src_path = ( save_path ) = f"audio2/audio/{str(seg['start'])}.ogg" # overwrite logger.debug(f"{src_path}") tone_color_converter.convert( audio_src_path=src_path, src_se=source_se, tgt_se=target_se, output_path=save_path, message=encode_message, ) global_progress_bar.update(1) global_progress_bar.close() try: del tone_color_converter gc.collect() torch.cuda.empty_cache() except Exception as error: logger.error(str(error)) gc.collect() torch.cuda.empty_cache() def toneconverter_freevc( result_diarize, remove_previous_process=True, get_vocals_dereverb=False, ): audio_path = "audio.wav" target_dir 
= "processed"
    create_directories(target_dir)

    from openvoice import se_extractor

    audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{se_extractor.hash_numpy_array(audio_path)}"

    # create wav seg; original is target and dubbing is source
    valid_speakers = list(
        {item["speaker"] for item in result_diarize["segments"]}
    )

    logger.info("FreeVC preprocessor...")

    if remove_previous_process:
        remove_directory_contents(target_dir)

    path_source_segments, path_target_segments = create_wav_vc(
        valid_speakers,
        result_diarize["segments"],
        audio_name,
        max_segments=1,
        get_vocals_dereverb=get_vocals_dereverb,
    )

    logger.info("FreeVC loading model...")
    device_id = os.environ.get("SONITR_DEVICE")
    device = None if device_id == "cpu" else device_id
    try:
        from TTS.api import TTS

        tts = TTS(
            model_name="voice_conversion_models/multilingual/vctk/freevc24",
            progress_bar=False
        ).to(device)
    except Exception as error:
        logger.error(str(error))
        logger.error("Error loading the FreeVC model.")
        return

    logger.info("FreeVC process:")
    global_progress_bar = tqdm(total=len(result_diarize["segments"]), desc="Progress")

    for source_seg, target_seg, speaker in zip(
        path_source_segments, path_target_segments, valid_speakers
    ):
        filtered_speaker = [
            segment
            for segment in result_diarize["segments"]
            if segment["speaker"] == speaker
        ]

        files_and_directories = os.listdir(target_seg)
        wav_files = [file for file in files_and_directories if file.endswith(".wav")]
        original_wav_audio_segment = os.path.join(target_seg, wav_files[0])

        for seg in filtered_speaker:
            src_path = (
                save_path
            ) = f"audio2/audio/{str(seg['start'])}.ogg"  # overwrite
            logger.debug(f"{src_path} - {original_wav_audio_segment}")

            wav = tts.voice_conversion(
                source_wav=src_path,
                target_wav=original_wav_audio_segment,
            )
            write_chunked(
                file=save_path,
                samplerate=tts.voice_converter.vc_config.audio.output_sample_rate,
                data=wav,
                format="ogg",
                subtype="vorbis",
            )
            global_progress_bar.update(1)

    global_progress_bar.close()

    try:
        del tts
        gc.collect()
        torch.cuda.empty_cache()
    except Exception as error:
        logger.error(str(error))
        gc.collect()
        torch.cuda.empty_cache()


def toneconverter(
    result_diarize,
    preprocessor_max_segments,
    remove_previous_process=True,
    get_vocals_dereverb=False,
    method_vc="freevc"
):
    if method_vc == "freevc":
        if preprocessor_max_segments > 1:
            logger.info("FreeVC uses only one segment.")

        return toneconverter_freevc(
            result_diarize,
            remove_previous_process=remove_previous_process,
            get_vocals_dereverb=get_vocals_dereverb,
        )
    elif "openvoice" in method_vc:
        return toneconverter_openvoice(
            result_diarize,
            preprocessor_max_segments,
            remove_previous_process=remove_previous_process,
            get_vocals_dereverb=get_vocals_dereverb,
            model=method_vc,
        )


if __name__ == "__main__":
    from segments import result_diarize

    audio_segmentation_to_voice(
        result_diarize,
        TRANSLATE_AUDIO_TO="en",
        is_gui=True,
        tts_voice00="en-facebook-mms VITS",
        tts_voice01="en-CA-ClaraNeural-Female",
        tts_voice02="en-GB-ThomasNeural-Male",
        tts_voice03="en-GB-SoniaNeural-Female",
        tts_voice04="en-NZ-MitchellNeural-Male",
        tts_voice05="en-GB-MaisieNeural-Female",
    )
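
    # Rough sketch (illustrative values) of the downstream helpers defined in
    # this module that would typically run after audio_segmentation_to_voice;
    # kept as comments because they expect "audio.wav" and the generated
    # "audio/" segments to already exist on disk:
    #
    #   valid_speakers = audio_segmentation_to_voice(...)  # as above
    #   audio_files, speakers_list = accelerate_segments(
    #       result_diarize,
    #       max_accelerate_audio=2.1,
    #       valid_speakers=valid_speakers,
    #   )
    #   toneconverter(result_diarize, preprocessor_max_segments=3)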