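# TalktoAI: a Gradio app that pairs a ChatGLM-6B chatbot with voice cloning of its replies
# (Coqui TTS for Mandarin/English, SpeechBrain and denoiser for enhancement), plus a
# VITS-based Shanghainese synthesis easter egg.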
import torch
import librosa
import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence
import numpy as np
from mel_processing import spectrogram_torch
import gradio as gr
from text.cleaners import shanghainese_cleaners
from transformers import AutoModel, AutoTokenizer
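# Coqui TTS engines: a Mandarin Tacotron2-DDC-GST model (used with voice conversion)
# and the multilingual YourTTS model for English zero-shot voice cloning.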
from TTS.api import TTS
tts = TTS("tts_models/zh-CN/baker/tacotron2-DDC-GST")
tts1 = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
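# SpeechBrain MetricGAN+ (trained on VoiceBank) for enhancing the synthesized speech.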
import torchaudio
from speechbrain.pretrained import SpectralMaskEnhancement
enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
    run_opts={"device": "cuda"},
)
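# Facebook Research denoiser (pretrained DNS64 model) for an extra denoising pass.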
from denoiser import pretrained
from denoiser.dsp import convert_audio
model1 = pretrained.dns64().cuda()
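# ChatGLM-6B chat model, loaded in half precision on the GPU.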
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
model = model.eval()
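# Chat callback: query ChatGLM-6B with the input and running history; return the updated
# history twice (for the Chatbot display and the State) plus the latest reply text.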
def predict(input, history=None):
    if history is None:
        history = []
    response, history = model.chat(tokenizer, input, history)
    return history, history, response
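# Mandarin "voice cloning": synthesize the reply text, convert it towards the reference
# recording (upload or microphone) via voice conversion, then enhance with MetricGAN+.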
def chinese(text_cn, upload1, VoiceMicrophone1):
    if upload1 is not None:
        tts.tts_with_vc_to_file(
            " ".join(text_cn.split()) + "。",
            speaker_wav=upload1,
            file_path="output0.wav"
        )
    else:
        tts.tts_with_vc_to_file(
            " ".join(text_cn.split()) + "。",
            speaker_wav=VoiceMicrophone1,
            file_path="output0.wav"
        )
    noisy = enhance_model.load_audio(
        "output0.wav"
    ).unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)
    return "enhanced.wav"
def english(text_en, upload, VoiceMicrophone):
    if upload is not None:
        tts1.tts_to_file(text_en.strip(), speaker_wav=upload, language="en", file_path="output.wav")
    else:
        tts1.tts_to_file(text_en.strip(), speaker_wav=VoiceMicrophone, language="en", file_path="output.wav")
    wav, sr = torchaudio.load("output.wav")
    wav = convert_audio(wav.cuda(), sr, model1.sample_rate, model1.chin)
    with torch.no_grad():
        denoised = model1(wav[None])[0]
    torchaudio.save("denoise.wav", denoised.data.cpu(), model1.sample_rate)
    noisy = enhance_model.load_audio(
        "denoise.wav"
    ).unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)
    return "enhanced.wav"
def clean_text(text, ipa_input):
    if ipa_input:
        return shanghainese_cleaners(text)
    return text
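# VITS text front end: map text to symbol ids (skipping the cleaners when the text is
# already cleaned), optionally intersperse blank tokens, and return a LongTensor.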
def get_text(text, hps, cleaned=False):
    if cleaned:
        text_norm = text_to_sequence(text, hps.symbols, [])
    else:
        text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm
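# Shanghainese synthesis: run multi-speaker VITS inference with speaker id 0 and return
# (sampling_rate, waveform) for the Gradio Audio output.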
def speech_synthesize(text, cleaned, length_scale):
    text = text.replace('\n', '')
    print(text)
    stn_tst = get_text(text, hps_ms, cleaned)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid = torch.LongTensor([0])
        audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
    return (hps_ms.data.sampling_rate, audio)
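# Load the Shanghainese VITS hyper-parameters and checkpoint from the local model/ directory.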
hps_ms = utils.get_hparams_from_file('model/config.json')
n_speakers = hps_ms.data.n_speakers
n_symbols = len(hps_ms.symbols)
speakers = hps_ms.speakers
net_g_ms = SynthesizerTrn(
    n_symbols,
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    n_speakers=n_speakers,
    **hps_ms.model)
_ = net_g_ms.eval()
utils.load_checkpoint('model/model.pth', net_g_ms)
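# Gradio UI: chat panel on top, voice-cloning controls (reference upload or microphone),
# cloned-audio outputs, and hidden controls for the Shanghainese easter egg.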
with gr.Blocks() as demo:
    gr.Markdown(
        """ # <center>🥳💬💕 - TalktoAI,随时随地,谈天说地!</center>
        ### <center>🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!</center>
        """
    )
    state = gr.State([])
    chatbot = gr.Chatbot([], elem_id="chatbot").style(height=300)
    res = gr.Textbox(lines=1, placeholder="最新的回答在这里(此内容可编辑,用作声音克隆的文本)", show_label=False).style(container=False)
    with gr.Row():
        txt = gr.Textbox(label="说点什么吧(中英皆可)", lines=1)
        button = gr.Button("开始对话吧")
    txt.submit(predict, [txt, state], [chatbot, state, res])
    button.click(predict, [txt, state], [chatbot, state, res])
    with gr.Row().style(mobile_collapse=False, equal_height=True):
        inp3 = res
        inp4 = gr.Audio(source="upload", label="请上传您喜欢的声音(wav/mp3文件);长语音(~90s)、女声效果更好", type="filepath")
        inp5 = gr.Audio(source="microphone", type="filepath", label='请用麦克风上传您喜欢的声音,与文件上传二选一即可')
        btn1 = gr.Button("用喜欢的声音听一听吧(中文)")
        btn2 = gr.Button("用喜欢的声音听一听吧(英文)")
    with gr.Row():
        out1 = gr.Audio(label="为您合成的专属声音(中文)")
        out2 = gr.Audio(label="为您合成的专属声音(英文)")
    btn1.click(chinese, [inp3, inp4, inp5], [out1])
    btn2.click(english, [inp3, inp4, inp5], [out2])
    text_input = res
    cleaned_text = gr.Checkbox(label='IPA Input', default=True, visible=False)
    length_scale = gr.Slider(0.5, 2, 1, step=0.1, label='Speaking Speed', interactive=True, visible=False)
    with gr.Row().style(mobile_collapse=False, equal_height=True):
        tts_button = gr.Button('彩蛋:上海话合成')
        audio_output = gr.Audio(label='听一听上海话吧')
    cleaned_text.change(clean_text, [text_input, cleaned_text], [text_input])
    tts_button.click(speech_synthesize, [text_input, cleaned_text, length_scale], [audio_output])
    gr.Markdown(
        """ ### <center>注意❗:请不要输入或生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及娱乐使用。用户输入或生成的内容与程序开发者无关,请自觉合法合规使用,违反者一切后果自负。</center>
        ### <center>Model by [ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b). Thanks to [THUDM](https://github.com/THUDM) and [CjangCjengh](https://github.com/CjangCjengh). Please follow me on [Bilibili](https://space.bilibili.com/501495851?spm_id_from=333.1007.0.0).</center>
        """
    )
    gr.HTML('''
    <div class="footer">
        <p>🎶🖼️🎡 - It’s the intersection of technology and liberal arts that makes our hearts sing. - Steve Jobs
        </p>
        <p>注:中文声音克隆实际上是通过声音转换(Voice Conversion)实现,所以输出结果可能更像是一种新的声音,效果不一定很理想,希望大家多多包涵,之后我们也会不断迭代该程序的!为了实现更好的效果,使用中文声音克隆时请尽量上传女声。
        </p>
    </div>
    ''')
demo.queue().launch(show_error=True)