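"""Gradio Space: Vietnamese voice cloning.

Normalizes input text with vinorm, extracts a speaker embedding from a
reference recording with Resemblyzer, and synthesizes speech with a VITS
model trained on VIVOS.
"""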
import logging
import sys

import gradio as gr
import numpy as np
import torch
from resemblyzer import preprocess_wav, VoiceEncoder
from scipy.io.wavfile import write
from vinorm import TTSnorm

from utils_audio import convert_to_wav

# Make the bundled VITS source tree importable before importing its modules.
sys.path.append("vits")
import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence
from text.symbols import symbols

# Silence numba's chatty INFO/DEBUG logging.
numba_logger = logging.getLogger("numba")
numba_logger.setLevel(logging.WARNING)

device = "cpu"


def get_text(texts, hps):
    """Normalize text into a list of token-id tensors, one per chunk."""
    text_norm_list = []
    for text in texts.split(","):
        # Break each comma-separated phrase into chunks of at most 30 words
        # so long inputs do not overwhelm the synthesizer.
        chunk_len = 30
        words = text.split()
        chunk_strings = [
            " ".join(words[i : i + chunk_len])
            for i in range(0, len(words), chunk_len)
        ]
        for chunk_string in chunk_strings:
            text_norm = text_to_sequence(chunk_string, hps.data.text_cleaners)
            if hps.data.add_blank:
                # Interleave blank tokens between symbols, as VITS expects.
                text_norm = commons.intersperse(text_norm, 0)
            text_norm_list.append(torch.LongTensor(text_norm))
    return text_norm_list


def get_speaker_embedding(path):
    """Extract a fixed-size speaker embedding (d-vector) with Resemblyzer."""
    encoder = VoiceEncoder(device="cpu")
    path = convert_to_wav(path)
    wav = preprocess_wav(path)
    embed = encoder.embed_utterance(wav)
    return embed
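
# NOTE: VoiceEncoder reloads its weights on every get_speaker_embedding()
# call; a single module-level encoder would avoid that per-request cost.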


class VoiceClone:
    def __init__(self, checkpoint_path):
        # Build the VITS synthesizer from the VIVOS hyperparameters and load
        # the trained generator weights from the checkpoint.
        hps = utils.get_hparams_from_file("vivos.json")
        self.net_g = SynthesizerTrn(
            len(symbols),
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            n_speakers=hps.data.n_speakers,
            **hps.model,
        ).to(device)
        _ = self.net_g.eval()
        _ = utils.load_checkpoint(checkpoint_path, self.net_g, None)
        self.hps = hps

    def infer(self, text, ref_audio):
        # Normalize the Vietnamese text (numbers, abbreviations, ...) before
        # converting it to token ids.
        text_norm = TTSnorm(text)
        stn_tst_list = get_text(text_norm, self.hps)

        # The speaker embedding depends only on the reference audio, so it is
        # computed once and reused for every text chunk.
        speaker_embedding = get_speaker_embedding(ref_audio)
        speaker_embedding = (
            torch.from_numpy(speaker_embedding).float().unsqueeze(0).to(device)
        )

        with torch.no_grad():
            audios = []
            for stn_tst in stn_tst_list:
                x_tst = stn_tst.to(device).unsqueeze(0)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
                audio = self.net_g.infer(
                    x_tst,
                    x_tst_lengths,
                    speaker_embedding=speaker_embedding,
                    noise_scale=0.667,
                    noise_scale_w=0.8,
                    length_scale=1,
                )
                audio = audio[0][0, 0].data.cpu().float().numpy()
                audios.append(audio)

        # Stitch the per-chunk waveforms together and write a 22.05 kHz WAV
        # next to the reference file (assumes the reference path ends in .wav).
        audios = np.concatenate(audios, axis=0)
        out_path = ref_audio.replace(".wav", "_clone.wav")
        write(out_path, 22050, audios)
        return out_path, text_norm
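
# Expected runtime assets: vivos.json (VITS hyperparameters), G_150000.pth
# (generator checkpoint), and vsontung.wav (fallback reference voice).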
# Load the model once at startup so requests do not pay the load cost.
voice_cloner = VoiceClone("G_150000.pth")


def clonevoice(text: str, speaker_wav, file_upload, language: str):
    # Prefer the microphone recording, then an uploaded file, then fall back
    # to the bundled sample voice.
    if speaker_wav is not None:
        speaker_source = speaker_wav
    elif file_upload is not None:
        speaker_source = file_upload
    else:
        speaker_source = "vsontung.wav"
    outfile, text_norm = voice_cloner.infer(text, speaker_source)
    return [outfile, text_norm]

inputs = [
    gr.Textbox(
        label="Input",
        value="muốn ngồi ở một vị trí không ai ngồi được thì phải chịu cảm giác không ai chịu được",
        max_lines=3,
    ),
    gr.Audio(label="Speaker Wav (microphone)", source="microphone", type="filepath"),
    gr.Audio(label="Speaker Wav (upload)", source="upload", type="filepath"),
    gr.Radio(label="Language", choices=["Vietnamese"], value="Vietnamese"),
]
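# `source=` on gr.Audio is the Gradio 3.x API; Gradio 4 renamed it to
# `sources=[...]`, so this script assumes a gradio<4 runtime.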
outputs = [gr.Audio(label="Output"), gr.TextArea()]
demo = gr.Interface(fn=clonevoice, inputs=inputs, outputs=outputs)
demo.launch(debug=True)