File size: 7,594 Bytes
9e1a4da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import torch
import librosa
import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence
import numpy as np
from mel_processing import spectrogram_torch
import gradio as gr
from text.cleaners import shanghainese_cleaners

from transformers import AutoModel, AutoTokenizer
from TTS.api import TTS

# Chinese TTS: Tacotron2 (Baker corpus) with GST; used with voice conversion
# in chinese() to mimic the reference speaker.
tts = TTS("tts_models/zh-CN/baker/tacotron2-DDC-GST")

# English zero-shot voice cloning model (YourTTS); runs on GPU.
tts1 = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)

import torchaudio
from speechbrain.pretrained import SpectralMaskEnhancement

# Speech-enhancement model (MetricGAN+, trained on VoiceBank) used as a
# post-processing step on both Chinese and English outputs.
# NOTE(review): hard-codes CUDA — will fail on CPU-only hosts.
enhance_model = SpectralMaskEnhancement.from_hparams(
source="speechbrain/metricgan-plus-voicebank",
savedir="pretrained_models/metricgan-plus-voicebank",
run_opts={"device":"cuda"},
)

from denoiser import pretrained
from denoiser.dsp import convert_audio

# Facebook denoiser (DNS64) applied to English output before enhancement.
model1 = pretrained.dns64().cuda()

# ChatGLM-6B chat model, loaded in half precision on GPU, inference mode.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
model = model.eval()

def predict(input, history=None):
    """Run one ChatGLM-6B chat turn.

    Parameters
    ----------
    input : str
        The user's message.
    history : list | None
        Prior conversation turns; a fresh list is used when None.

    Returns
    -------
    tuple
        ``(history, history, response)`` — the history twice so Gradio can
        update both the chatbot widget and its state, plus the latest reply
        for the editable textbox.
    """
    history = [] if history is None else history
    response, history = model.chat(tokenizer, input, history)
    return history, history, response

def chinese(text_cn, upload1, VoiceMicrophone1):
    """Synthesize Chinese speech cloned to the user's voice, then enhance it.

    Parameters
    ----------
    text_cn : str
        Text to speak.
    upload1 : str | None
        Path to an uploaded reference-voice wav/mp3 (takes priority).
    VoiceMicrophone1 : str | None
        Path to a microphone recording, used when no file was uploaded.

    Returns
    -------
    str
        Path to the enhanced output file ("enhanced.wav").
    """
    # The two original branches were identical except for the reference wav;
    # pick the reference once, then synthesize once.
    speaker_wav = upload1 if upload1 is not None else VoiceMicrophone1

    # Collapse whitespace and append a full stop so the sentence ends cleanly.
    tts.tts_with_vc_to_file(
        " ".join(text_cn.split()) + "。",
        speaker_wav=speaker_wav,
        file_path="output0.wav"
    )

    # Post-process with MetricGAN+; lengths=[1.] means "use the full signal".
    noisy = enhance_model.load_audio("output0.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)

    return "enhanced.wav"

def english(text_en, upload, VoiceMicrophone):
    """Synthesize English speech cloned to the user's voice, denoise and enhance.

    Parameters
    ----------
    text_en : str
        Text to speak.
    upload : str | None
        Path to an uploaded reference-voice wav/mp3 (takes priority).
    VoiceMicrophone : str | None
        Path to a microphone recording, used when no file was uploaded.

    Returns
    -------
    str
        Path to the enhanced output file ("enhanced.wav").
    """
    # The two original branches were identical except for the reference wav;
    # pick the reference once, then synthesize once with YourTTS.
    speaker_wav = upload if upload is not None else VoiceMicrophone
    tts1.tts_to_file(text_en.strip(), speaker_wav=speaker_wav, language="en", file_path="output.wav")

    # First pass: DNS64 denoiser (resample/remix to the model's expected format).
    wav, sr = torchaudio.load("output.wav")
    wav = convert_audio(wav.cuda(), sr, model1.sample_rate, model1.chin)
    with torch.no_grad():
        denoised = model1(wav[None])[0]
    torchaudio.save("denoise.wav", denoised.data.cpu(), model1.sample_rate)

    # Second pass: MetricGAN+ enhancement; lengths=[1.] means "full signal".
    noisy = enhance_model.load_audio("denoise.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)

    return "enhanced.wav"

def clean_text(text, ipa_input):
    """Return *text* run through the Shanghainese IPA cleaner when requested.

    When ``ipa_input`` is falsy the text passes through unchanged.
    """
    if not ipa_input:
        return text
    return shanghainese_cleaners(text)


def get_text(text, hps, cleaned=False):
    """Convert *text* to a LongTensor of symbol IDs for the synthesizer.

    Parameters
    ----------
    text : str
        Input text (already IPA-cleaned when ``cleaned`` is True).
    hps : object
        Hyperparameters holding ``symbols`` and ``data`` config.
    cleaned : bool
        Skip the configured text cleaners when True.
    """
    # Pre-cleaned text needs no further cleaner passes.
    cleaners = [] if cleaned else hps.data.text_cleaners
    seq = text_to_sequence(text, hps.symbols, cleaners)
    # Optionally intersperse blank tokens (ID 0) between symbols.
    if hps.data.add_blank:
        seq = commons.intersperse(seq, 0)
    return torch.LongTensor(seq)


def speech_synthesize(text, cleaned, length_scale):
    """Synthesize Shanghainese speech with the VITS model (fixed speaker 0).

    Returns a ``(sampling_rate, ndarray)`` tuple suitable for a Gradio
    Audio output.
    """
    # Newlines confuse the sequence model; strip them out first.
    text = text.replace('\n', '')
    print(text)
    symbol_ids = get_text(text, hps_ms, cleaned)
    with torch.no_grad():
        inputs = symbol_ids.unsqueeze(0)
        input_lengths = torch.LongTensor([symbol_ids.size(0)])
        speaker = torch.LongTensor([0])
        audio = net_g_ms.infer(
            inputs,
            input_lengths,
            sid=speaker,
            noise_scale=0.667,
            noise_scale_w=0.8,
            length_scale=length_scale,
        )[0][0, 0].data.cpu().float().numpy()
    return (hps_ms.data.sampling_rate, audio)



# Load VITS hyperparameters and build the multi-speaker synthesizer used by
# speech_synthesize() for the Shanghainese easter egg.
hps_ms = utils.get_hparams_from_file('model/config.json')
n_speakers = hps_ms.data.n_speakers
n_symbols = len(hps_ms.symbols)
speakers = hps_ms.speakers

net_g_ms = SynthesizerTrn(
    n_symbols,
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    n_speakers=n_speakers,
    **hps_ms.model)
# Inference mode only; weights come from the checkpoint below.
_ = net_g_ms.eval()
utils.load_checkpoint('model/model.pth', net_g_ms)

# Gradio UI: a ChatGLM chatbot whose latest answer feeds two voice-cloning
# TTS pipelines (Chinese / English) and a Shanghainese VITS easter egg.
with gr.Blocks() as demo:
    gr.Markdown(
            """ # <center>🥳💬💕 - TalktoAI,随时随地,谈天说地!</center>
            
            ### <center>🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!</center>
            
        """
    )
    # Conversation history shared between chatbot display and predict().
    state = gr.State([])
    chatbot = gr.Chatbot([], elem_id="chatbot").style(height=300)
    # Latest answer; editable so the user can tweak the text to be spoken.
    res = gr.Textbox(lines=1, placeholder="最新的回答在这里(此内容可编辑,用作声音克隆的文本)", show_label = False).style(container=False)
    with gr.Row():
        txt = gr.Textbox(label = "说点什么吧(中英皆可)", lines=1)
        button = gr.Button("开始对话吧")
    # Both Enter and the button trigger a chat turn.
    txt.submit(predict, [txt, state], [chatbot, state, res])
    button.click(predict, [txt, state], [chatbot, state, res])
    
    # Voice-cloning inputs: the answer text plus a reference voice from file
    # upload or microphone (upload wins when both are given).
    with gr.Row().style(mobile_collapse=False, equal_height=True):
        inp3 = res
        inp4 = gr.Audio(source="upload", label = "请上传您喜欢的声音(wav/mp3文件);长语音(~90s)、女声效果更好", type="filepath")
        inp5 = gr.Audio(source="microphone", type="filepath", label = '请用麦克风上传您喜欢的声音,与文件上传二选一即可')
        btn1 = gr.Button("用喜欢的声音听一听吧(中文)")

        btn2 = gr.Button("用喜欢的声音听一听吧(英文)")
    with gr.Row():
        out1 = gr.Audio(label="为您合成的专属声音(中文)")
        out2 = gr.Audio(label="为您合成的专属声音(英文)")
    btn1.click(chinese, [inp3, inp4, inp5], [out1])
    btn2.click(english, [inp3, inp4, inp5], [out2])

    # Shanghainese easter egg: hidden IPA/speed controls, visible button.
    text_input = res
    cleaned_text=gr.Checkbox(label='IPA Input',default=True, visible = False)
    length_scale=gr.Slider(0.5,2,1,step=0.1,label='Speaking Speed',interactive=True, visible = False)
    with gr.Row().style(mobile_collapse=False, equal_height=True):
        tts_button = gr.Button('彩蛋:上海话合成')
        audio_output = gr.Audio(label='听一听上海话吧')
    cleaned_text.change(clean_text,[text_input,cleaned_text],[text_input])
    tts_button.click(speech_synthesize,[text_input,cleaned_text,length_scale],[audio_output])

    gr.Markdown(
            """ ### <center>注意❗:请不要输入或生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及娱乐使用。用户输入或生成的内容与程序开发者无关,请自觉合法合规使用,违反者一切后果自负。</center>
            
            ### <center>Model by [ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b). Thanks to [THUDM](https://github.com/THUDM) and [CjangCjengh](https://github.com/CjangCjengh). Please follow me on [Bilibili](https://space.bilibili.com/501495851?spm_id_from=333.1007.0.0).</center>
            
        """
    )
        
    gr.HTML('''
        <div class="footer">
                    <p>🎶🖼️🎡 - It’s the intersection of technology and liberal arts that makes our hearts sing. - Steve Jobs
                    </p>
                    <p>注:中文声音克隆实际上是通过声音转换(Voice Conversion)实现,所以输出结果可能更像是一种新的声音,效果不一定很理想,希望大家多多包涵,之后我们也会不断迭代该程序的!为了实现更好的效果,使用中文声音克隆时请尽量上传女声。
                    </p>
        </div>
    ''')     

# Queueing enables long-running TTS/chat calls without request timeouts.
demo.queue().launch(show_error=True)