# Spaces: Sleeping
# Bootstrap: make the parent directory importable before project imports.
import sys

sys.path.append("..")

import asyncio
import json
import os
import shutil
import ssl
from typing import Any, Dict, List, Optional

import aiohttp
import certifi
import edge_tts
import nltk
import numpy as np
import scipy.io.wavfile as wavfile
import soundfile as sf
import torch
from bark import SAMPLE_RATE
from dotenv import load_dotenv
from gtts import gTTS

from assets.configs.config import Config
from lib.infer.modules.vc.modules import VC

now_dir = os.getcwd()

# Fetch the sentence-tokenizer model before sent_tokenize is ever called.
nltk.download("punkt", quiet=True)
from nltk.tokenize import sent_tokenize  # noqa: E402

# Environment variables must be loaded before Config() reads them.
load_dotenv()
config = Config()
vc = VC(config)
# Microsoft Edge "read aloud" voice-catalog endpoint.  The token is the
# public trusted-client token shipped with the Edge browser.
_VOICE_LIST_HOST = "https://speech.platform.bing.com"
_VOICE_LIST_PATH = "/consumer/speech/synthesize/readaloud/voices/list"
_TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4"
VOICE_LIST = (
    f"{_VOICE_LIST_HOST}{_VOICE_LIST_PATH}"
    f"?trustedclienttoken={_TRUSTED_CLIENT_TOKEN}"
)
def get_bark_voice():
    """Return the built-in Bark speaker presets as "<preset>-<gender>" strings.

    The catalog below is a fixed table of ``preset  language  gender`` rows.

    The original implementation split each row on a literal tab and required
    exactly three parts, which yields an empty list when the table is
    space-separated (as here).  Parsing now splits on any run of whitespace
    and keeps the first token (preset id) and last token (gender), so it
    works for tab- or space-separated rows alike.

    Returns:
        list[str]: e.g. ``["v2/en_speaker_0-Male", ...]`` (130 entries).
    """
    mensaje = """
v2/en_speaker_0 English Male
v2/en_speaker_1 English Male
v2/en_speaker_2 English Male
v2/en_speaker_3 English Male
v2/en_speaker_4 English Male
v2/en_speaker_5 English Male
v2/en_speaker_6 English Male
v2/en_speaker_7 English Male
v2/en_speaker_8 English Male
v2/en_speaker_9 English Female
v2/zh_speaker_0 Chinese (Simplified) Male
v2/zh_speaker_1 Chinese (Simplified) Male
v2/zh_speaker_2 Chinese (Simplified) Male
v2/zh_speaker_3 Chinese (Simplified) Male
v2/zh_speaker_4 Chinese (Simplified) Female
v2/zh_speaker_5 Chinese (Simplified) Male
v2/zh_speaker_6 Chinese (Simplified) Female
v2/zh_speaker_7 Chinese (Simplified) Female
v2/zh_speaker_8 Chinese (Simplified) Male
v2/zh_speaker_9 Chinese (Simplified) Female
v2/fr_speaker_0 French Male
v2/fr_speaker_1 French Female
v2/fr_speaker_2 French Female
v2/fr_speaker_3 French Male
v2/fr_speaker_4 French Male
v2/fr_speaker_5 French Female
v2/fr_speaker_6 French Male
v2/fr_speaker_7 French Male
v2/fr_speaker_8 French Male
v2/fr_speaker_9 French Male
v2/de_speaker_0 German Male
v2/de_speaker_1 German Male
v2/de_speaker_2 German Male
v2/de_speaker_3 German Female
v2/de_speaker_4 German Male
v2/de_speaker_5 German Male
v2/de_speaker_6 German Male
v2/de_speaker_7 German Male
v2/de_speaker_8 German Female
v2/de_speaker_9 German Male
v2/hi_speaker_0 Hindi Female
v2/hi_speaker_1 Hindi Female
v2/hi_speaker_2 Hindi Male
v2/hi_speaker_3 Hindi Female
v2/hi_speaker_4 Hindi Female
v2/hi_speaker_5 Hindi Male
v2/hi_speaker_6 Hindi Male
v2/hi_speaker_7 Hindi Male
v2/hi_speaker_8 Hindi Male
v2/hi_speaker_9 Hindi Female
v2/it_speaker_0 Italian Male
v2/it_speaker_1 Italian Male
v2/it_speaker_2 Italian Female
v2/it_speaker_3 Italian Male
v2/it_speaker_4 Italian Male
v2/it_speaker_5 Italian Male
v2/it_speaker_6 Italian Male
v2/it_speaker_7 Italian Female
v2/it_speaker_8 Italian Male
v2/it_speaker_9 Italian Female
v2/ja_speaker_0 Japanese Female
v2/ja_speaker_1 Japanese Female
v2/ja_speaker_2 Japanese Male
v2/ja_speaker_3 Japanese Female
v2/ja_speaker_4 Japanese Female
v2/ja_speaker_5 Japanese Female
v2/ja_speaker_6 Japanese Male
v2/ja_speaker_7 Japanese Female
v2/ja_speaker_8 Japanese Female
v2/ja_speaker_9 Japanese Female
v2/ko_speaker_0 Korean Female
v2/ko_speaker_1 Korean Male
v2/ko_speaker_2 Korean Male
v2/ko_speaker_3 Korean Male
v2/ko_speaker_4 Korean Male
v2/ko_speaker_5 Korean Male
v2/ko_speaker_6 Korean Male
v2/ko_speaker_7 Korean Male
v2/ko_speaker_8 Korean Male
v2/ko_speaker_9 Korean Male
v2/pl_speaker_0 Polish Male
v2/pl_speaker_1 Polish Male
v2/pl_speaker_2 Polish Male
v2/pl_speaker_3 Polish Male
v2/pl_speaker_4 Polish Female
v2/pl_speaker_5 Polish Male
v2/pl_speaker_6 Polish Female
v2/pl_speaker_7 Polish Male
v2/pl_speaker_8 Polish Male
v2/pl_speaker_9 Polish Female
v2/pt_speaker_0 Portuguese Male
v2/pt_speaker_1 Portuguese Male
v2/pt_speaker_2 Portuguese Male
v2/pt_speaker_3 Portuguese Male
v2/pt_speaker_4 Portuguese Male
v2/pt_speaker_5 Portuguese Male
v2/pt_speaker_6 Portuguese Male
v2/pt_speaker_7 Portuguese Male
v2/pt_speaker_8 Portuguese Male
v2/pt_speaker_9 Portuguese Male
v2/ru_speaker_0 Russian Male
v2/ru_speaker_1 Russian Male
v2/ru_speaker_2 Russian Male
v2/ru_speaker_3 Russian Male
v2/ru_speaker_4 Russian Male
v2/ru_speaker_5 Russian Female
v2/ru_speaker_6 Russian Female
v2/ru_speaker_7 Russian Male
v2/ru_speaker_8 Russian Male
v2/ru_speaker_9 Russian Female
v2/es_speaker_0 Spanish Male
v2/es_speaker_1 Spanish Male
v2/es_speaker_2 Spanish Male
v2/es_speaker_3 Spanish Male
v2/es_speaker_4 Spanish Male
v2/es_speaker_5 Spanish Male
v2/es_speaker_6 Spanish Male
v2/es_speaker_7 Spanish Male
v2/es_speaker_8 Spanish Female
v2/es_speaker_9 Spanish Female
v2/tr_speaker_0 Turkish Male
v2/tr_speaker_1 Turkish Male
v2/tr_speaker_2 Turkish Male
v2/tr_speaker_3 Turkish Male
v2/tr_speaker_4 Turkish Female
v2/tr_speaker_5 Turkish Female
v2/tr_speaker_6 Turkish Male
v2/tr_speaker_7 Turkish Male
v2/tr_speaker_8 Turkish Male
v2/tr_speaker_9 Turkish Male
"""
    datos_deseados = []
    for linea in mensaje.split("\n"):
        tokens = linea.split()
        # A valid row is "<preset> <language...> <gender>": at least 3 tokens
        # (the language may itself span several tokens, e.g. "Chinese (Simplified)").
        if len(tokens) >= 3:
            datos_deseados.append(f"{tokens[0]}-{tokens[-1]}")
    return datos_deseados
# ||-----------------------------------------------------------------------------------|| | |
# || Obtained from dependency edge_tts || | |
# ||-----------------------------------------------------------------------------------|| | |
async def list_voices(*, proxy: Optional[str] = None) -> Any:
    """
    List all available voices and their attributes.

    This pulls data from the URL used by Microsoft Edge to return a list of
    all available voices.

    Returns:
        dict: A dictionary of voice attributes.
    """
    # Headers mimic the Edge browser so the endpoint accepts the request.
    headers = {
        "Authority": "speech.platform.bing.com",
        "Sec-CH-UA": '" Not;A Brand";v="99", "Microsoft Edge";v="91", "Chromium";v="91"',
        "Sec-CH-UA-Mobile": "?0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
        "Accept": "*/*",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
    }
    # Use certifi's CA bundle so TLS verification works on all platforms.
    ssl_ctx = ssl.create_default_context(cafile=certifi.where())
    async with aiohttp.ClientSession(trust_env=True) as session:
        async with session.get(
            VOICE_LIST, headers=headers, proxy=proxy, ssl=ssl_ctx
        ) as response:
            raw = await response.text()
    return json.loads(raw)
async def create(custom_voices: Optional[List[Dict[str, Any]]] = None) -> List[Dict[str, Any]]:
    """
    Creates a list of voices with all available voices and their attributes.

    Args:
        custom_voices: Pre-fetched voice records; when None the list is
            downloaded via ``list_voices()``.

    Returns:
        One ``{"ShortName": ..., "Gender": ...}`` dict per voice.
    """
    voices = await list_voices() if custom_voices is None else custom_voices
    # Keep only the two attributes the UI needs.  (The original also derived
    # a "Language" field from "Locale" here, but that list was built and then
    # immediately discarded — dead work removed.)
    return [
        {"ShortName": voice["ShortName"], "Gender": voice["Gender"]}
        for voice in voices
    ]
async def loop_main():
    """Fetch the simplified voice list and serialize it to a JSON string."""
    simplified = await create()
    return json.dumps(simplified)
def get_edge_voice():
    """Return Edge TTS voices formatted as "<ShortName>-<Gender>" strings.

    Blocks on the async ``loop_main`` fetch.  Uses ``asyncio.run`` instead of
    the deprecated ``asyncio.get_event_loop().run_until_complete`` pattern
    (``get_event_loop`` is deprecated outside a running loop since 3.10).
    """
    voices = json.loads(asyncio.run(loop_main()))
    return [f"{voice['ShortName']}-{voice['Gender']}" for voice in voices]
# Cache both voice catalogs once at import time so UI callbacks are instant.
# NOTE(review): get_edge_voice() performs a network request here, so importing
# this module requires connectivity — confirm this is acceptable.
set_bark_voice = get_bark_voice()
set_edge_voice = get_edge_voice()
def update_tts_methods_voice(select_value):
    """Return a gradio dropdown-update dict for the chosen TTS backend.

    Known backends are "Edge-tts" and "Bark-tts"; any other value (e.g.
    "RVG-tts") yields None, matching the original fall-through behavior.
    """
    choices_by_method = {
        "Edge-tts": set_edge_voice,
        "Bark-tts": set_bark_voice,
    }
    if select_value not in choices_by_method:
        return None
    return {
        "choices": choices_by_method[select_value],
        "value": "",
        "__type__": "update",
    }
def custom_voice(
    _values,  # filter indices (or the sentinel string "converted_tts")
    audio_files,  # all audio files
    model_voice_path="",
    transpose=0,
    f0method="pm",
    index_rate_=float(0.66),
    crepe_hop_length_=float(64),
    f0_autotune=False,
    file_index="",
    file_index2="",
):
    """Run RVC voice conversion in place over the selected audio files.

    Args:
        _values: Indices into ``audio_files``; the sentinel "converted_tts"
            selects ``audio_files[0]`` directly.
        audio_files: File names of the candidate audio clips.
        model_voice_path: Path of the RVC model to load into ``vc``.
        transpose: Semitone shift (e.g. +12 / -12 for gender swap).
        f0method: Pitch-extraction algorithm name.
        index_rate_: Feature-index blend rate.
        crepe_hop_length_: Hop length for crepe-based f0 methods.
        f0_autotune: Whether to snap f0 to the nearest note.
        file_index / file_index2: Optional .index file paths.

    Each converted clip is written back over its input file.
    """
    vc.get_vc(model_voice_path)
    for _value_item in _values:
        # FIX: the original concatenated "assets/audios/audio_outputs" with
        # the file name without a path separator (and used a folder name
        # inconsistent with the "audio-outputs" folder the rest of this
        # module writes to).  TODO(review): confirm audio_files holds bare
        # file names relative to that folder.
        filename = (
            os.path.join("assets", "audios", "audio-outputs", audio_files[_value_item])
            if _value_item != "converted_tts"
            else audio_files[0]
        )
        try:
            print(audio_files[_value_item], model_voice_path)
        except (IndexError, KeyError, TypeError) as e:
            # Debug print only — never abort the conversion over it.
            print(e)
        info_, (sample_, audio_output_) = vc.vc_single_dont_save(
            sid=0,
            input_audio_path1=filename,
            f0_up_key=transpose,  # transpose for m-to-f and reverse (0 / 12)
            f0_file=None,
            f0_method=f0method,
            file_index=file_index,
            file_index2=file_index2,
            index_rate=index_rate_,
            filter_radius=3,
            resample_sr=0,
            rms_mix_rate=0.25,
            protect=0.33,
            crepe_hop_length=crepe_hop_length_,
            f0_autotune=f0_autotune,
            f0_min=50,
            note_min=50,
            f0_max=1100,
            note_max=1100,
        )
        # Overwrite the input clip with its converted audio.
        sf.write(
            file=filename,
            samplerate=sample_,
            data=audio_output_,
        )
def cast_to_device(tensor, device):
    """Move *tensor* to *device*, returning the input unchanged on failure.

    Any exception (object without a usable ``.to``, unavailable device, ...)
    is printed and swallowed so callers always get a value back.
    """
    try:
        moved = tensor.to(device)
    except Exception as error:
        print(error)
        return tensor
    return moved
def __bark__(text, voice_preset):
    """Synthesize *text* with suno/bark and return (waveform, sample_rate).

    Args:
        text: Sentence to synthesize.
        voice_preset: Bark speaker preset id (e.g. "v2/en_speaker_0").

    Returns:
        tuple: (numpy waveform from ``generate``, int sampling rate).

    NOTE(review): the processor and model are re-loaded from the cache on
    every call — confirm whether caching them at module level is feasible.
    """
    os.makedirs(os.path.join(now_dir, "tts"), exist_ok=True)
    # Imported lazily so transformers is only required when Bark is used.
    from transformers import AutoProcessor, BarkModel
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    # Half precision only on GPU; CPU inference stays in float32.
    dtype = torch.float32 if "cpu" in device else torch.float16
    bark_processor = AutoProcessor.from_pretrained(
        "suno/bark",
        cache_dir=os.path.join(now_dir, "tts", "suno/bark"),
        torch_dtype=dtype,
    )
    bark_model = BarkModel.from_pretrained(
        "suno/bark",
        cache_dir=os.path.join(now_dir, "tts", "suno/bark"),
        torch_dtype=dtype,
    ).to(device)
    # bark_model.enable_cpu_offload()
    inputs = bark_processor(text=[text], return_tensors="pt", voice_preset=voice_preset)
    # Move every tensor in the processor output to the target device;
    # non-tensor entries are passed through untouched.
    tensor_dict = {
        k: cast_to_device(v, device) if hasattr(v, "to") else v
        for k, v in inputs.items()
    }
    speech_values = bark_model.generate(**tensor_dict, do_sample=True)
    sampling_rate = bark_model.generation_config.sample_rate
    speech = speech_values.cpu().numpy().squeeze()
    return speech, sampling_rate
def use_tts(
    tts_text,
    tts_voice,
    model_path,
    index_path,
    transpose,
    f0_method,
    index_rate,
    crepe_hop_length,
    f0_autotune,
    tts_method,
):
    """Synthesize *tts_text* with Edge-TTS or Bark, then convert it with RVC.

    Args:
        tts_text: Text to speak.
        tts_voice: Voice id from the catalogs (with "-<Gender>" suffix).
        model_path: RVC model to load for conversion.
        index_path: Optional feature-index path.
        transpose: Semitone shift for the conversion.
        f0_method: Pitch-extraction algorithm name.
        index_rate: Feature-index blend rate.
        crepe_hop_length: Hop length for crepe-based f0 methods.
        f0_autotune: Whether to snap f0 to the nearest note.
        tts_method: "Edge-tts" or "Bark-tts".

    Returns:
        (converted_path, raw_tts_path) on success, (None, None) on failure,
        or None when *tts_voice* is unset or *tts_method* is unknown.
    """
    if tts_voice is None:
        return
    output_folder = "assets/audios/audio-outputs"
    os.makedirs(output_folder, exist_ok=True)

    # Pick the first unused counter so existing outputs are never clobbered.
    output_count = 1
    while True:
        converted_tts_filename = os.path.join(output_folder, f"tts_out_{output_count}.wav")
        bark_out_filename = os.path.join(output_folder, f"bark_out_{output_count}.wav")
        if not os.path.exists(converted_tts_filename) and not os.path.exists(bark_out_filename):
            break
        output_count += 1

    if "SET_LIMIT" == os.getenv("DEMO"):
        if len(tts_text) > 60:
            tts_text = tts_text[:60]
            print("DEMO; limit to 60 characters")

    language = tts_voice[:2]
    if tts_method == "Edge-tts":
        try:
            # Catalog entries look like "en-US-AriaNeural-Female": strip the
            # trailing "-<Gender>" suffix added by get_edge_voice().
            asyncio.run(
                edge_tts.Communicate(
                    tts_text, "-".join(tts_voice.split("-")[:-1])
                ).save(converted_tts_filename)
            )
        except Exception as edge_error:
            print(edge_error)
            try:
                # Fall back to gTTS when edge-tts fails or returns no audio.
                tts = gTTS(tts_text, lang=language)
                tts.save(converted_tts_filename)
                print(
                    f"No audio was received. Please change the tts voice for {tts_voice}. USING gTTS."
                )
            except Exception as gtts_error:
                print(gtts_error)
                # Last resort: write a placeholder clip so the pipeline can
                # still run end-to-end.
                tts = gTTS("a", lang=language)
                tts.save(converted_tts_filename)
                print("Error: Audio will be replaced.")
        try:
            vc.get_vc(model_path)
            info_, (sample_, audio_output_) = vc.vc_single_dont_save(
                sid=0,
                input_audio_path1=converted_tts_filename,
                f0_up_key=transpose,
                f0_file=None,
                f0_method=f0_method,
                file_index="",
                file_index2=index_path,
                index_rate=index_rate,
                filter_radius=3,
                resample_sr=0,
                rms_mix_rate=0.25,
                protect=0.33,
                crepe_hop_length=crepe_hop_length,
                f0_autotune=f0_autotune,
                f0_min=50,
                note_min=50,
                f0_max=1100,
                note_max=1100,
            )
            # Unique name for the RVC-converted output.
            vc_output_filename = os.path.join(output_folder, f"converted_tts_{output_count}.wav")
            wavfile.write(
                vc_output_filename,
                rate=sample_,
                data=audio_output_,
            )
            return vc_output_filename, converted_tts_filename
        except Exception as e:
            print(f"{e}")
            return None, None
    elif tts_method == "Bark-tts":
        try:
            script = tts_text.replace("\n", " ").strip()
            sentences = sent_tokenize(script)
            print(sentences)
            # Quarter-second silence inserted between sentences.
            silence = np.zeros(int(0.25 * SAMPLE_RATE))
            pieces = []
            for sentence in sentences:
                # "v2/en_speaker_0-Male" -> preset "v2/en_speaker_0".
                audio_array, _ = __bark__(sentence, tts_voice.split("-")[0])
                pieces += [audio_array, silence.copy()]
            sf.write(
                file=bark_out_filename, samplerate=SAMPLE_RATE, data=np.concatenate(pieces)
            )
            vc.get_vc(model_path)
            # FIX: convert the file we just wrote (bark_out_filename).  The
            # original pointed at a hard-coded "bark_out.wav", which this
            # function never produces, so conversion always failed.
            info_, (sample_, audio_output_) = vc.vc_single_dont_save(
                sid=0,
                input_audio_path1=bark_out_filename,
                f0_up_key=transpose,
                f0_file=None,
                f0_method=f0_method,
                file_index="",
                file_index2=index_path,
                index_rate=index_rate,
                filter_radius=3,
                resample_sr=0,
                rms_mix_rate=0.25,
                protect=0.33,
                crepe_hop_length=crepe_hop_length,
                f0_autotune=f0_autotune,
                f0_min=50,
                note_min=50,
                f0_max=1100,
                note_max=1100,
            )
            vc_output_filename = os.path.join(output_folder, f"converted_bark_{output_count}.wav")
            wavfile.write(
                vc_output_filename,
                rate=sample_,
                data=audio_output_,
            )
            return vc_output_filename, bark_out_filename
        except Exception as e:
            print(f"{e}")
            return None, None