Aesthetic_RVC_Inference_HF

Running

File size: 16,047 Bytes

3b7b011

import sys

sys.path.append("..")
import os

now_dir = os.getcwd()

from dotenv import load_dotenv
from lib.infer.modules.vc.modules import VC
from assets.configs.config import Config

# Pull settings from a .env file (python-dotenv); use_tts below checks the
# DEMO variable through os.getenv.
load_dotenv()
# Module-wide configuration and voice-conversion engine; `vc` is the object
# that custom_voice and use_tts call get_vc / vc_single_dont_save on.
config = Config()
vc = VC(config)

import shutil
import numpy as np
import torch

import soundfile as sf
from gtts import gTTS
import edge_tts
import asyncio
import scipy.io.wavfile as wavfile
import nltk

nltk.download("punkt", quiet=True)
from nltk.tokenize import sent_tokenize
from bark import SAMPLE_RATE

import json
import ssl
from typing import Any, Dict, List, Optional
import asyncio
import aiohttp
import certifi

# URL requested by list_voices() to obtain the Edge read-aloud voice
# catalogue; the trusted client token travels as a query-string parameter.
VOICE_LIST = (
    "https://speech.platform.bing.com/consumer/speech/synthesize/"
    + "readaloud/voices/list?trustedclienttoken="
    + "6A5AA1D4EAFF4E9FB37E23D68491D6F4"
)
def get_bark_voice():
    """
    Build the list of Bark voice presets offered to the user.

    The presets live in an embedded table with one voice per line:
    ``<preset id> <language> <gender>``. Each entry is returned as
    ``"<preset id>-<gender>"``, e.g. ``"v2/en_speaker_0-Male"``.

    Returns:
        list[str]: One formatted entry per table row, in table order.
    """
    voice_table = """
v2/en_speaker_0	English	Male
v2/en_speaker_1	English	Male
v2/en_speaker_2	English	Male
v2/en_speaker_3	English	Male
v2/en_speaker_4	English	Male
v2/en_speaker_5	English	Male
v2/en_speaker_6	English	Male
v2/en_speaker_7	English	Male
v2/en_speaker_8	English	Male
v2/en_speaker_9	English	Female
v2/zh_speaker_0	Chinese (Simplified)	Male
v2/zh_speaker_1	Chinese (Simplified)	Male
v2/zh_speaker_2	Chinese (Simplified)	Male
v2/zh_speaker_3	Chinese (Simplified)	Male
v2/zh_speaker_4	Chinese (Simplified)	Female
v2/zh_speaker_5	Chinese (Simplified)	Male
v2/zh_speaker_6	Chinese (Simplified)	Female
v2/zh_speaker_7	Chinese (Simplified)	Female
v2/zh_speaker_8	Chinese (Simplified)	Male
v2/zh_speaker_9	Chinese (Simplified)	Female
v2/fr_speaker_0	French	Male
v2/fr_speaker_1	French	Female
v2/fr_speaker_2	French	Female
v2/fr_speaker_3	French	Male
v2/fr_speaker_4	French	Male
v2/fr_speaker_5	French	Female
v2/fr_speaker_6	French	Male
v2/fr_speaker_7	French	Male
v2/fr_speaker_8	French	Male
v2/fr_speaker_9	French	Male
v2/de_speaker_0	German	Male
v2/de_speaker_1	German	Male
v2/de_speaker_2	German	Male
v2/de_speaker_3	German	Female
v2/de_speaker_4	German	Male
v2/de_speaker_5	German	Male
v2/de_speaker_6	German	Male
v2/de_speaker_7	German	Male
v2/de_speaker_8	German	Female
v2/de_speaker_9	German	Male
v2/hi_speaker_0	Hindi	Female
v2/hi_speaker_1	Hindi	Female
v2/hi_speaker_2	Hindi	Male
v2/hi_speaker_3	Hindi	Female
v2/hi_speaker_4	Hindi	Female
v2/hi_speaker_5	Hindi	Male
v2/hi_speaker_6	Hindi	Male
v2/hi_speaker_7	Hindi	Male
v2/hi_speaker_8	Hindi	Male
v2/hi_speaker_9	Hindi	Female
v2/it_speaker_0	Italian	Male
v2/it_speaker_1	Italian	Male
v2/it_speaker_2	Italian	Female
v2/it_speaker_3	Italian	Male
v2/it_speaker_4	Italian	Male
v2/it_speaker_5	Italian	Male
v2/it_speaker_6	Italian	Male
v2/it_speaker_7	Italian	Female
v2/it_speaker_8	Italian	Male
v2/it_speaker_9	Italian	Female
v2/ja_speaker_0	Japanese	Female
v2/ja_speaker_1	Japanese	Female
v2/ja_speaker_2	Japanese	Male
v2/ja_speaker_3	Japanese	Female
v2/ja_speaker_4	Japanese	Female
v2/ja_speaker_5	Japanese	Female
v2/ja_speaker_6	Japanese	Male
v2/ja_speaker_7	Japanese	Female
v2/ja_speaker_8	Japanese	Female
v2/ja_speaker_9	Japanese	Female
v2/ko_speaker_0	Korean	Female
v2/ko_speaker_1	Korean	Male
v2/ko_speaker_2	Korean	Male
v2/ko_speaker_3	Korean	Male
v2/ko_speaker_4	Korean	Male
v2/ko_speaker_5	Korean	Male
v2/ko_speaker_6	Korean	Male
v2/ko_speaker_7	Korean	Male
v2/ko_speaker_8	Korean	Male
v2/ko_speaker_9	Korean	Male
v2/pl_speaker_0	Polish	Male
v2/pl_speaker_1	Polish	Male
v2/pl_speaker_2	Polish	Male
v2/pl_speaker_3	Polish	Male
v2/pl_speaker_4	Polish	Female
v2/pl_speaker_5	Polish	Male
v2/pl_speaker_6	Polish	Female
v2/pl_speaker_7	Polish	Male
v2/pl_speaker_8	Polish	Male
v2/pl_speaker_9	Polish	Female
v2/pt_speaker_0	Portuguese	Male
v2/pt_speaker_1	Portuguese	Male
v2/pt_speaker_2	Portuguese	Male
v2/pt_speaker_3	Portuguese	Male
v2/pt_speaker_4	Portuguese	Male
v2/pt_speaker_5	Portuguese	Male
v2/pt_speaker_6	Portuguese	Male
v2/pt_speaker_7	Portuguese	Male
v2/pt_speaker_8	Portuguese	Male
v2/pt_speaker_9	Portuguese	Male
v2/ru_speaker_0	Russian	Male
v2/ru_speaker_1	Russian	Male
v2/ru_speaker_2	Russian	Male
v2/ru_speaker_3	Russian	Male
v2/ru_speaker_4	Russian	Male
v2/ru_speaker_5	Russian	Female
v2/ru_speaker_6	Russian	Female
v2/ru_speaker_7	Russian	Male
v2/ru_speaker_8	Russian	Male
v2/ru_speaker_9	Russian	Female
v2/es_speaker_0	Spanish	Male
v2/es_speaker_1	Spanish	Male
v2/es_speaker_2	Spanish	Male
v2/es_speaker_3	Spanish	Male
v2/es_speaker_4	Spanish	Male
v2/es_speaker_5	Spanish	Male
v2/es_speaker_6	Spanish	Male
v2/es_speaker_7	Spanish	Male
v2/es_speaker_8	Spanish	Female
v2/es_speaker_9	Spanish	Female
v2/tr_speaker_0	Turkish	Male
v2/tr_speaker_1	Turkish	Male
v2/tr_speaker_2	Turkish	Male
v2/tr_speaker_3	Turkish	Male
v2/tr_speaker_4	Turkish	Female
v2/tr_speaker_5	Turkish	Female
v2/tr_speaker_6	Turkish	Male
v2/tr_speaker_7	Turkish	Male
v2/tr_speaker_8	Turkish	Male
v2/tr_speaker_9	Turkish	Male
    """
    formatted_voices = []
    for row in voice_table.splitlines():
        # Split on any whitespace rather than on "\t" only: if an editor or
        # formatter converts the table's tabs to spaces, a tab-only split
        # would silently produce an empty voice list. The preset id is the
        # first field and the gender the last; the language name in between
        # may itself contain spaces ("Chinese (Simplified)").
        fields = row.split()
        if len(fields) < 3:
            # Blank leading/trailing lines of the literal.
            continue
        preset_id, gender = fields[0], fields[-1]
        formatted_voices.append(f"{preset_id}-{gender}")

    return formatted_voices

# ||-----------------------------------------------------------------------------------||
# ||                         Obtained from dependency edge_tts                         ||
# ||-----------------------------------------------------------------------------------||

async def list_voices(*, proxy: Optional[str] = None) -> Any:
    """
    Fetch the voice catalogue that Microsoft Edge's read-aloud feature uses.

    Issues a GET request to ``VOICE_LIST`` with browser-like headers and
    decodes the response body as JSON.

    Args:
        proxy: Optional proxy URL forwarded to the HTTP request.

    Returns:
        The decoded JSON payload. ``create`` in this file iterates it as a
        sequence of voice dicts.
    """
    # Headers imitating the Edge browser's own request.
    request_headers = {
        "Authority": "speech.platform.bing.com",
        "Sec-CH-UA": '" Not;A Brand";v="99", "Microsoft Edge";v="91", "Chromium";v="91"',
        "Sec-CH-UA-Mobile": "?0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
        "Accept": "*/*",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
    }
    # Verify TLS against certifi's CA bundle.
    tls_context = ssl.create_default_context(cafile=certifi.where())

    async with aiohttp.ClientSession(trust_env=True) as http:
        request = http.get(
            VOICE_LIST,
            headers=request_headers,
            proxy=proxy,
            ssl=tls_context,
        )
        async with request as response:
            payload = json.loads(await response.text())
    return payload
async def create(custom_voices: Optional[List[Dict[str, Any]]] = None) -> List[Dict[str, Any]]:
    """
    Return the available voices reduced to their short name and gender.

    Args:
        custom_voices: Voice dicts to use instead of querying the service.
            When ``None``, the list is fetched with ``list_voices()``.
            Each dict must provide ``"ShortName"`` and ``"Gender"``.

    Returns:
        A list of ``{"ShortName": ..., "Gender": ...}`` dicts, one per voice,
        in input order.

    Raises:
        KeyError: If a voice lacks ``"ShortName"`` or ``"Gender"``.
    """
    voices = await list_voices() if custom_voices is None else custom_voices
    # The former intermediate pass that added a "Language" key was discarded
    # by this projection; it only cost a copy of every dict and made a
    # "Locale" key mandatory for no benefit, so it has been dropped.
    return [
        {"ShortName": voice["ShortName"], "Gender": voice["Gender"]}
        for voice in voices
    ]

async def loop_main():
    """Return the simplified voice list from ``create()`` as a JSON string."""
    return json.dumps(await create())

def get_edge_voice():
    """
    Synchronously fetch the Edge TTS voices as ``"<ShortName>-<Gender>"``.

    Runs the ``loop_main()`` coroutine to completion on a private event loop
    and formats each voice it reports.

    Returns:
        list[str]: One ``"<ShortName>-<Gender>"`` entry per voice.
    """
    # asyncio.get_event_loop() is deprecated when no loop is running and
    # fails on recent Python versions. A dedicated loop works everywhere and,
    # unlike asyncio.run(), does not reset the thread's current-loop state
    # that other libraries may rely on after import.
    loop = asyncio.new_event_loop()
    try:
        voices_json = loop.run_until_complete(loop_main())
    finally:
        loop.close()

    return [
        f"{voice['ShortName']}-{voice['Gender']}"
        for voice in json.loads(voices_json)
    ]

# Voice lists are computed once, at import time, and reused by
# update_tts_methods_voice. get_edge_voice() performs an HTTP request (via
# list_voices), so a network failure here surfaces while importing the module.
set_bark_voice = get_bark_voice()
set_edge_voice = get_edge_voice()

def update_tts_methods_voice(select_value):
    """
    Build the voice-selector update for the chosen TTS method.

    Args:
        select_value: Name of the TTS method; ``"Edge-tts"`` and
            ``"Bark-tts"`` are handled.

    Returns:
        An update dict (``choices``, ``value``, ``__type__``) listing the
        voices of the selected method with an empty current value, or
        ``None`` for any other method name (e.g. ``"RVG-tts"``).
    """
    if select_value == "Edge-tts":
        voice_choices = set_edge_voice
    elif select_value == "Bark-tts":
        voice_choices = set_bark_voice
    else:
        # No voice list is associated with other methods.
        return None

    return {"choices": voice_choices, "value": "", "__type__": "update"}


def custom_voice(
    _values,  # filter indices
    audio_files,  # all audio files
    model_voice_path="",
    transpose=0,
    f0method="pm",
    index_rate_=float(0.66),
    crepe_hop_length_=float(64),
    f0_autotune=False,
    file_index="",
    file_index2="",
):
    """
    Convert the selected audio files in place with the given voice model.

    For every entry of ``_values`` the corresponding file is run through
    ``vc.vc_single_dont_save`` and the result is written back to the same
    path, overwriting the input.

    Args:
        _values: Iterable of selectors into ``audio_files``. The special
            value ``"converted_tts"`` selects ``audio_files[0]`` as-is.
        audio_files: Indexable collection of audio file names/paths.
        model_voice_path: Voice model passed to ``vc.get_vc``.
        transpose: Pitch shift forwarded as ``f0_up_key``.
        f0method: Pitch-extraction method forwarded as ``f0_method``.
        index_rate_: Forwarded as ``index_rate``.
        crepe_hop_length_: Forwarded as ``crepe_hop_length``.
        f0_autotune: Forwarded as ``f0_autotune``.
        file_index: Forwarded as ``file_index``.
        file_index2: Forwarded as ``file_index2``.
    """
    vc.get_vc(model_voice_path)

    for _value_item in _values:
        if _value_item != "converted_tts":
            # NOTE(review): the prefix has no trailing separator and is
            # spelled "audio_outputs", whereas use_tts writes to
            # "assets/audios/audio-outputs" — confirm what audio_files
            # entries look like before changing this.
            filename = "assets/audios/audio_outputs" + audio_files[_value_item]
        else:
            filename = audio_files[0]

        # Best-effort progress log. Indexing with the "converted_tts"
        # sentinel is expected to fail for sequences, so only indexing
        # errors are ignored (a bare except would also swallow
        # KeyboardInterrupt/SystemExit).
        try:
            print(audio_files[_value_item], model_voice_path)
        except (TypeError, KeyError, IndexError):
            pass

        _, (sample_, audio_output_) = vc.vc_single_dont_save(
            sid=0,
            input_audio_path1=filename,
            f0_up_key=transpose,  # transpose for m to f and reverse 0 12
            f0_file=None,
            f0_method=f0method,
            file_index=file_index,
            file_index2=file_index2,
            index_rate=index_rate_,
            filter_radius=int(3),
            resample_sr=int(0),
            rms_mix_rate=float(0.25),
            protect=float(0.33),
            crepe_hop_length=crepe_hop_length_,
            f0_autotune=f0_autotune,
            f0_min=50,
            note_min=50,
            f0_max=1100,
            note_max=1100,
        )

        # Overwrite the source file with the converted audio.
        sf.write(
            file=filename,
            samplerate=sample_,
            data=audio_output_,
        )


def cast_to_device(tensor, device):
    """
    Return ``tensor.to(device)``, falling back to ``tensor`` on failure.

    Any exception raised by the move is printed and the original object is
    returned unchanged.
    """
    try:
        moved = tensor.to(device)
    except Exception as err:
        print(err)
        return tensor
    return moved


def __bark__(text, voice_preset):
    """
    Synthesize ``text`` with the ``suno/bark`` model and ``voice_preset``.

    The processor and model are loaded with ``from_pretrained`` on every
    call, using ``<now_dir>/tts/suno/bark`` as cache directory. CUDA with
    float16 is used when available, otherwise CPU with float32.

    Returns:
        tuple: ``(speech, sampling_rate)`` — the generated output moved to
        CPU, converted to a NumPy array and squeezed, plus the sample rate
        taken from the model's generation config.
    """
    os.makedirs(os.path.join(now_dir, "tts"), exist_ok=True)
    from transformers import AutoProcessor, BarkModel

    if torch.cuda.is_available():
        device, dtype = "cuda:0", torch.float16
    else:
        device, dtype = "cpu", torch.float32

    model_cache = os.path.join(now_dir, "tts", "suno/bark")
    processor = AutoProcessor.from_pretrained(
        "suno/bark", cache_dir=model_cache, torch_dtype=dtype
    )
    model = BarkModel.from_pretrained(
        "suno/bark", cache_dir=model_cache, torch_dtype=dtype
    )
    model = model.to(device)

    encoded = processor(text=[text], return_tensors="pt", voice_preset=voice_preset)
    # Move everything that supports .to() onto the target device; other
    # values are passed through untouched.
    model_inputs = {}
    for key, value in encoded.items():
        if hasattr(value, "to"):
            model_inputs[key] = cast_to_device(value, device)
        else:
            model_inputs[key] = value

    generated = model.generate(**model_inputs, do_sample=True)
    sampling_rate = model.generation_config.sample_rate
    speech = generated.cpu().numpy().squeeze()
    return speech, sampling_rate


def _convert_tts_with_rvc(
    input_path,
    output_path,
    model_path,
    index_path,
    transpose,
    f0_method,
    index_rate,
    crepe_hop_length,
    f0_autotune,
):
    """Run the RVC model over ``input_path`` and write the result to ``output_path``."""
    vc.get_vc(model_path)
    _, (sample_rate, converted_audio) = vc.vc_single_dont_save(
        sid=0,
        input_audio_path1=input_path,
        f0_up_key=transpose,  # transpose for m to f and reverse 0 12
        f0_file=None,
        f0_method=f0_method,
        file_index="",
        file_index2=index_path,
        index_rate=index_rate,
        filter_radius=int(3),
        resample_sr=int(0),
        rms_mix_rate=float(0.25),
        protect=float(0.33),
        crepe_hop_length=crepe_hop_length,
        f0_autotune=f0_autotune,
        f0_min=50,
        note_min=50,
        f0_max=1100,
        note_max=1100,
    )
    wavfile.write(output_path, rate=sample_rate, data=converted_audio)


def use_tts(
    tts_text,
    tts_voice,
    model_path,
    index_path,
    transpose,
    f0_method,
    index_rate,
    crepe_hop_length,
    f0_autotune,
    tts_method,
):
    """
    Synthesize ``tts_text`` and convert it with the selected voice model.

    The text is rendered to a WAV file with Edge TTS (falling back to gTTS)
    or Bark, then passed through the RVC model. Both files are written to
    ``assets/audios/audio-outputs`` under a numeric suffix not yet in use.

    Args:
        tts_text: Text to speak. Truncated to 60 characters when the
            ``DEMO`` environment variable equals ``"SET_LIMIT"``.
        tts_voice: Voice entry formatted ``"<voice id>-<gender>"``.
        model_path: Voice model passed to ``vc.get_vc``.
        index_path: Forwarded as ``file_index2``.
        transpose: Pitch shift forwarded as ``f0_up_key``.
        f0_method: Pitch-extraction method.
        index_rate: Forwarded as ``index_rate``.
        crepe_hop_length: Forwarded as ``crepe_hop_length``.
        f0_autotune: Forwarded as ``f0_autotune``.
        tts_method: ``"Edge-tts"`` or ``"Bark-tts"``.

    Returns:
        ``(converted_path, tts_path)`` on success, ``(None, None)`` when
        synthesis/conversion fails, and ``None`` when ``tts_voice`` is
        ``None`` or ``tts_method`` is not recognised.
    """
    if tts_voice is None:
        return

    output_folder = "assets/audios/audio-outputs"
    os.makedirs(output_folder, exist_ok=True)

    # Find the first counter for which neither raw TTS file exists yet.
    output_count = 1
    while True:
        converted_tts_filename = os.path.join(output_folder, f"tts_out_{output_count}.wav")
        bark_out_filename = os.path.join(output_folder, f"bark_out_{output_count}.wav")
        if not os.path.exists(converted_tts_filename) and not os.path.exists(bark_out_filename):
            break
        output_count += 1

    if "SET_LIMIT" == os.getenv("DEMO"):
        if len(tts_text) > 60:
            tts_text = tts_text[:60]
            print("DEMO; limit to 60 characters")

    language = tts_voice[:2]
    if tts_method == "Edge-tts":
        try:
            # Strip the trailing "-<gender>" to recover the Edge voice name.
            edge_voice_name = "-".join(tts_voice.split("-")[:-1])
            asyncio.run(
                edge_tts.Communicate(tts_text, edge_voice_name).save(converted_tts_filename)
            )
        except Exception:
            # Edge TTS failed: fall back to gTTS in the voice's language.
            try:
                tts = gTTS(tts_text, lang=language)
                tts.save(converted_tts_filename)
                print(
                    f"No audio was received. Please change the tts voice for {tts_voice}. USING gTTS."
                )
            except Exception:
                # Last resort: a placeholder clip so conversion has input.
                tts = gTTS("a", lang=language)
                tts.save(converted_tts_filename)
                print("Error: Audio will be replaced.")

        try:
            vc_output_filename = os.path.join(output_folder, f"converted_tts_{output_count}.wav")
            _convert_tts_with_rvc(
                converted_tts_filename,
                vc_output_filename,
                model_path,
                index_path,
                transpose,
                f0_method,
                index_rate,
                crepe_hop_length,
                f0_autotune,
            )
            return vc_output_filename, converted_tts_filename
        except Exception as e:
            print(f"{e}")
            return None, None

    elif tts_method == "Bark-tts":
        try:
            script = tts_text.replace("\n", " ").strip()
            sentences = sent_tokenize(script)
            print(sentences)
            # Quarter-second pause inserted after every sentence.
            silence = np.zeros(int(0.25 * SAMPLE_RATE))
            pieces = []
            for sentence in sentences:
                audio_array, _ = __bark__(sentence, tts_voice.split("-")[0])
                pieces += [audio_array, silence.copy()]

            sf.write(
                file=bark_out_filename, samplerate=SAMPLE_RATE, data=np.concatenate(pieces)
            )

            vc_output_filename = os.path.join(output_folder, f"converted_bark_{output_count}.wav")
            # Convert the file that was just written. The previous code read
            # a fixed "bark_out.wav", which is never the numbered file
            # produced above.
            _convert_tts_with_rvc(
                bark_out_filename,
                vc_output_filename,
                model_path,
                index_path,
                transpose,
                f0_method,
                index_rate,
                crepe_hop_length,
                f0_autotune,
            )
            return vc_output_filename, bark_out_filename

        except Exception as e:
            print(f"{e}")
            return None, None