File size: 5,346 Bytes
572ba7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eafbbd9
572ba7e
 
 
 
 
eafbbd9
 
572ba7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2eada3
 
572ba7e
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import os
import gradio as gr
import random

os.system("pip install --upgrade Cython==0.29.35")
os.system("pip install pysptk --no-build-isolation")
os.system("pip install kantts -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html")
os.system("pip install librosa==0.9.2")
os.system("pip install numpy==1.22.0")

from modelscope.models.audio.tts import SambertHifigan
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

from voicefixer import VoiceFixer
voicefixer = VoiceFixer()

# model_0

model_dir = os.path.abspath("./pretrain_work_dir")

custom_infer_abs = {
    'voice_name':
    'F7',
    'am_ckpt':
    os.path.join(model_dir, 'tmp_am', 'ckpt'),
    'am_config':
    os.path.join(model_dir, 'tmp_am', 'config.yaml'),
    'voc_ckpt':
    os.path.join(model_dir, 'orig_model', 'basemodel_16k', 'hifigan', 'ckpt'),
    'voc_config':
    os.path.join(model_dir, 'orig_model', 'basemodel_16k', 'hifigan',
             'config.yaml'),
    'audio_config':
    os.path.join(model_dir, 'data', 'audio_config.yaml'),
    'se_file':
    os.path.join(model_dir, 'data', 'se', 'se.npy')
}
kwargs = {'custom_ckpt': custom_infer_abs}

model_id = SambertHifigan(os.path.join(model_dir, "orig_model"), **kwargs)

inference = pipeline(task=Tasks.text_to_speech, model=model_id)

# model_1

model_dir1 = os.path.abspath("./jay/pretrain_work_dir")

custom_infer_abs1 = {
    'voice_name':
    'F7',
    'am_ckpt':
    os.path.join(model_dir1, 'tmp_am', 'ckpt'),
    'am_config':
    os.path.join(model_dir1, 'tmp_am', 'config.yaml'),
    'voc_ckpt':
    os.path.join(model_dir1, 'orig_model', 'basemodel_16k', 'hifigan', 'ckpt'),
    'voc_config':
    os.path.join(model_dir1, 'orig_model', 'basemodel_16k', 'hifigan',
             'config.yaml'),
    'audio_config':
    os.path.join(model_dir1, 'data', 'audio_config.yaml'),
    'se_file':
    os.path.join(model_dir1, 'data', 'se', 'se.npy')
}
kwargs1 = {'custom_ckpt': custom_infer_abs1}

model_id1 = SambertHifigan(os.path.join(model_dir1, "orig_model"), **kwargs1)

inference1 = pipeline(task=Tasks.text_to_speech, model=model_id1)


# functions

def infer(text):
    output = inference(input=text)
    filename = str(random.randint(1, 1000000000000))
    
    with open(filename + "myfile.wav", mode='bx') as f:
        f.write(output["output_wav"])
    return filename + "myfile.wav"

def infer1(text):
    output = inference1(input=text)
    filename = str(random.randint(1, 1000000000000))
    
    with open(filename + "file.wav", mode='bx') as f:
        f.write(output["output_wav"])
    return filename + "file.wav"

# upsample

import numpy as np
import torch
from hifi_gan_bwe import BandwidthExtender
from scipy.io.wavfile import write

MAX_LENGTH = 600.0

model = BandwidthExtender.from_pretrained("hifi-gan-bwe-10-42890e3-vctk-48kHz")

def extend(audio):
    fs, x = audio
    x = x[:int(MAX_LENGTH * fs)]
    x = x.astype(np.float32) / 32767.0
    if len(x.shape) == 1:
        x = x[:, np.newaxis]

    with torch.no_grad():
        y = np.stack([model(torch.from_numpy(x), fs) for x in x.T]).T
        y = (y * 32767.0).astype(np.int16)
        fs = int(model.sample_rate)
        write("upsample.wav", fs, y)

    return "upsample.wav"

# denoise

def inference_denoise(audio):
    voicefixer.restore(input=audio, # input wav file path
                    output="output.wav", # output wav file path
                    cuda=False, # whether to use gpu acceleration
                    mode = int(0)) # You can try out mode 0, 1 to find out the best result
    return 'output.wav'


app = gr.Blocks()

with app:
    gr.Markdown("# <center>🥳🎶🎡 - KanTTS中文声音克隆</center>")
    gr.Markdown("## <center>🌊 - 更多精彩应用,敬请关注[xm火种堂](http://xmaigc.top);社恐患者杨老师💕</center>")

    with gr.Row():
        with gr.Column():
            inp = gr.Textbox(lines=5, label="请填写您想要转换的中文文本")
            with gr.Row():
                btn = gr.Button("使用定制AI钟老师的声音", variant="primary")
                btn1 = gr.Button("使用AI周杰伦的声音", variant="primary")
        with gr.Column():
            with gr.Row():
                out = gr.Audio(label="为您生成的专属音频")
                out1 = gr.Audio(label="更高采样率的专属音频", type="filepath")
                out2 = gr.Audio(label="降噪后的高采样率音频", type="filepath")
            with gr.Row():
                btn2 = gr.Button("一键提高采样率")
                btn3 = gr.Button("一键降噪")
    
        btn.click(fn=infer, inputs=[inp], outputs=[out])
        btn1.click(fn=infer1, inputs=[inp], outputs=[out])
        btn2.click(fn=extend, inputs=[out], outputs=[out1])
        btn3.click(fn=inference_denoise, inputs=[out1], outputs=[out2])

    gr.Markdown("### <center>注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。</center>")
    gr.HTML('''
        <div class="footer">
                    <p>🌊🏞️🎶 - 雪夜围炉煎暖茶,泊船瓜洲宿酒家。 
                               - 此生不关风与月,执手仗剑闯天涯。 社恐患者杨老师
                    </p>
        </div>
    ''')
app.launch(show_error=True)