Update app.py

app.py CHANGED
@@ -5,7 +5,6 @@ import re
 import tempfile
 import logging
 logging.getLogger('numba').setLevel(logging.WARNING)
-import ONNXVITS_infer
 import librosa
 import numpy as np
 import torch
@@ -15,14 +14,12 @@ import utils
 import gradio as gr
 import gradio.utils as gr_utils
 import gradio.processing_utils as gr_processing_utils
-from
 from text import text_to_sequence, _clean_text
 from text.symbols import symbols
 from mel_processing import spectrogram_torch
-import translators.server as tss
 import psutil
 from datetime import datetime
-from text.cleaners import japanese_cleaners
 
 def audio_postprocess(self, y):
     if y is None:
@@ -42,53 +39,71 @@ def audio_postprocess(self, y):
     return gr_processing_utils.encode_url_or_file_to_base64(file.name)
 
 
 gr.Audio.postprocess = audio_postprocess
 
 limitation = os.getenv("SYSTEM") == "spaces"  # limit text and audio length in huggingface spaces
-def show_memory_info(hint):
-    pid = os.getpid()
-    p = psutil.Process(pid)
-    info = p.memory_info()
-    memory = info.rss / 1024.0 / 1024
-    print("{} 内存占用: {} MB".format(hint, memory))
 
-def
 
 def get_text(text, hps, is_symbol):
     text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
@@ -97,120 +112,12 @@ def get_text(text, hps, is_symbol):
     text_norm = LongTensor(text_norm)
     return text_norm
 
-    hps.data.filter_length // 2 + 1,
-    hps.train.segment_size // hps.data.hop_length,
-    n_speakers=hps.data.n_speakers,
-    **hps.model)
-_ = net_g.eval()
-
-_ = utils.load_checkpoint("pretrained_models/G_1153000.pth", net_g)
 
-    return (_clean_text(input_text, hps.data.text_cleaners), input_text) if is_symbol_input \
-        else (temp_text, temp_text)
-
-
def infer(text_raw, character, language, duration, noise_scale, noise_scale_w, is_symbol):
|
117 |
-
# check character & duraction parameter
|
118 |
-
if language not in languages:
|
119 |
-
print("Error: No such language\n")
|
120 |
-
return "Error: No such language", None, None, None
|
121 |
-
if character not in characters:
|
122 |
-
print("Error: No such character\n")
|
123 |
-
return "Error: No such character", None, None, None
|
124 |
-
# check text length
|
125 |
-
if limitation:
|
126 |
-
text_len = len(text_raw) if is_symbol else len(re.sub("\[([A-Z]{2})\]", "", text_raw))
|
127 |
-
max_len = 150
|
128 |
-
if is_symbol:
|
129 |
-
max_len *= 3
|
130 |
-
if text_len > max_len:
|
131 |
-
print(f"Refused: Text too long ({text_len}).")
|
132 |
-
return "Error: Text is too long", None, None, None
|
133 |
-
if text_len == 0:
|
134 |
-
print("Refused: Text length is zero.")
|
135 |
-
return "Error: Please input text!", None, None, None
|
136 |
-
if is_symbol:
|
137 |
-
text = text_raw
|
138 |
-
elif language == '日本語':
|
139 |
-
text = text_raw
|
140 |
-
elif language == '简体中文':
|
141 |
-
text = tss.google(text_raw, from_language='zh', to_language='ja')
|
142 |
-
elif language == 'English':
|
143 |
-
text = tss.google(text_raw, from_language='en', to_language='ja')
|
144 |
-
char_id = int(character.split(':')[0])
|
145 |
-
stn_tst = get_text(text, hps, is_symbol)
|
146 |
-
with torch.no_grad():
|
147 |
-
x_tst = stn_tst.unsqueeze(0)
|
148 |
-
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
|
149 |
-
sid = torch.LongTensor([char_id])
|
150 |
-
try:
|
151 |
-
jp2phoneme = text_to_phoneme(text, hps.symbols, is_symbol)
|
152 |
-
durations = net_g.predict_duration(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
|
153 |
-
noise_scale_w=noise_scale_w, length_scale=duration)
|
154 |
-
char_dur_list = []
|
155 |
-
for i, char in enumerate(jp2phoneme):
|
156 |
-
char_pos = i * 2 + 1
|
157 |
-
char_dur = durations[char_pos]
|
158 |
-
char_dur_list.append(char_dur)
|
159 |
-
except IndexError:
|
160 |
-
print("Refused: Phoneme input contains non-phoneme character.")
|
161 |
-
return "Error: You can only input phoneme under phoneme input model", None, None, None
|
162 |
-
char_spacing_dur_list = []
|
163 |
-
char_spacings = []
|
164 |
-
for i in range(len(durations)):
|
165 |
-
if i % 2 == 0: # spacing
|
166 |
-
char_spacings.append("spacing")
|
167 |
-
elif i % 2 == 1: # char
|
168 |
-
char_spacings.append(jp2phoneme[int((i - 1) / 2)])
|
169 |
-
char_spacing_dur_list.append(int(durations[i]))
|
170 |
-
# convert duration information to string
|
171 |
-
duration_info_str = ""
|
172 |
-
for i in range(len(char_spacings)):
|
173 |
-
if i == len(char_spacings) - 1:
|
174 |
-
duration_info_str += "(" + str(char_spacing_dur_list[i]) + ")"
|
175 |
-
elif char_spacings[i] == "spacing":
|
176 |
-
duration_info_str += "(" + str(char_spacing_dur_list[i]) + ")" + ", "
|
177 |
-
else:
|
178 |
-
duration_info_str += char_spacings[i] + ":" + str(char_spacing_dur_list[i])
|
179 |
-
audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=duration)[0][0,0].data.float().numpy()
|
180 |
-
currentDateAndTime = datetime.now()
|
181 |
-
print(f"\nCharacter {character} inference successful: {text}")
|
182 |
-
if language != '日本語':
|
183 |
-
print(f"translate from {language}: {text_raw}")
|
184 |
-
show_memory_info(str(currentDateAndTime) + " infer调用后")
|
185 |
-
return (text,(22050, audio), jp2phoneme, duration_info_str)
|
186 |
-
|
187 |
-
def infer_from_phoneme_dur(duration_info_str, character, duration, noise_scale, noise_scale_w):
|
188 |
-
try:
|
189 |
-
phonemes = duration_info_str.split(", ")
|
190 |
-
recons_durs = []
|
191 |
-
recons_phonemes = ""
|
192 |
-
for i, item in enumerate(phonemes):
|
193 |
-
if i == 0:
|
194 |
-
recons_durs.append(int(item.strip("()")))
|
195 |
-
else:
|
196 |
-
phoneme_n_dur, spacing_dur = item.split("(")
|
197 |
-
recons_phonemes += phoneme_n_dur.split(":")[0]
|
198 |
-
recons_durs.append(int(phoneme_n_dur.split(":")[1]))
|
199 |
-
recons_durs.append(int(spacing_dur.strip(")")))
|
200 |
-
except ValueError:
|
201 |
-
return ("Error: Format must not be changed!", None)
|
202 |
-
except AssertionError:
|
203 |
-
return ("Error: Format must not be changed!", None)
|
204 |
-
char_id = int(character.split(':')[0])
|
205 |
-
stn_tst = get_text(recons_phonemes, hps, is_symbol=True)
|
206 |
-
with torch.no_grad():
|
207 |
-
x_tst = stn_tst.unsqueeze(0)
|
208 |
-
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
|
209 |
-
sid = torch.LongTensor([char_id])
|
210 |
-
audio = net_g.infer_with_duration(x_tst, x_tst_lengths, w_ceil=recons_durs, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
|
211 |
-
length_scale=duration)[0][0, 0].data.cpu().float().numpy()
|
212 |
-
print(f"\nCharacter {character} inference successful: {recons_phonemes}, from {duration_info_str}")
|
213 |
-
return (recons_phonemes, (22050, audio))
|
214 |
|
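Note: the removed `infer` / `infer_from_phoneme_dur` pair round-tripped pacing through a plain-text duration string shaped like `(spacing), phoneme:dur(spacing), ...`. A minimal sketch of that parse, using made-up phonemes and frame counts:

    # Hypothetical string: leading spacing 3, "k" held 10 frames, spacing 2,
    # "o" held 12 frames, trailing spacing 4.
    s = "(3), k:10(2), o:12(4)"
    durs, phonemes = [], ""
    for i, item in enumerate(s.split(", ")):
        if i == 0:
            durs.append(int(item.strip("()")))      # leading spacing duration
        else:
            ph, spacing = item.split("(")
            phonemes += ph.split(":")[0]            # phoneme symbol
            durs.append(int(ph.split(":")[1]))      # phoneme duration
            durs.append(int(spacing.strip(")")))    # following spacing duration
    assert phonemes == "ko" and durs == [3, 10, 2, 12, 4]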
 download_audio_js = """
 () =>{{
@@ -230,134 +137,132 @@ download_audio_js = """
 }}
 """
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
     args = parser.parse_args()
     app = gr.Blocks()
     with app:
-        gr.Markdown("#
                     "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=Plachta.VITS-Umamusume-voice-synthesizer)\n\n"
-                    "
-                    "这个合成器是基于VITS文本到语音模型,在从手游《賽馬娘:Pretty Derby》解包的语音数据上训练得到。[Dataset Link](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
-                    "[introduction video / 模型介绍视频](https://www.bilibili.com/video/BV1T84y1e7p5/?vd_source=6d5c00c796eff1cbbe25f1ae722c2f9f#reply607277701)\n\n"
-                    "You may duplicate this space or [open in Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing) to run it privately and without any queue.\n\n"
-                    "您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
                     "If you have any suggestions or bug reports, feel free to open discussion in [Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions).\n\n"
                     "若有bug反馈或建议,请在[Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions)下开启一个新的Discussion。 \n\n"
-                    "If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
-                    "如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
                     )
-        with gr.
-            with gr.
-                }}""")
-                # select character
-                char_dropdown = gr.Dropdown(choices=characters, value = "0:特别周", label='character')
-                language_dropdown = gr.Dropdown(choices=languages, value = "日本語", label='language')
-
-
-                duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1, label='时长 Duration')
-                noise_scale_slider = gr.Slider(minimum=0.1, maximum=5, value=0.667, step=0.001, label='噪声比例 noise_scale')
-                noise_scale_w_slider = gr.Slider(minimum=0.1, maximum=5, value=0.8, step=0.1, label='噪声偏差 noise_scale_w')
-
-
-
-            with gr.Column():
-                text_output = gr.Textbox(label="Output Text")
-                phoneme_output = gr.Textbox(label="Output Phonemes", interactive=False)
-                audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
-                btn = gr.Button("Generate!")
-                cus_dur_gn_btn = gr.Button("Regenerate with custom phoneme durations")
-
-                download = gr.Button("Download Audio")
-                download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"))
-            with gr.Accordion(label="Speaking Pace Control", open=True):
 
-            )
-        gr.Markdown("# Updates Logs 更新日志:\n\n"
-                    "2023/1/24:\n\n"
-                    "Improved the format of phoneme length control.\n\n"
-                    "改善了音素控制的格式。\n\n"
-                    "2023/1/24:\n\n"
-                    "Added more precise control on pace of speaking by modifying the duration of each phoneme.\n\n"
-                    "增加了对说话节奏的音素级控制。\n\n"
-                    "2023/1/13:\n\n"
-                    "Added one example of phoneme input.\n\n"
-                    "增加了音素输入的example(米浴喘气)\n\n"
-                    "2023/1/12:\n\n"
-                    "Added phoneme input, which enables more precise control on output audio.\n\n"
-                    "增加了音素输入的功能,可以对语气和语调做到一定程度的精细控制。\n\n"
-                    "Adjusted UI arrangements.\n\n"
-                    "调整了UI的布局。\n\n"
-                    "2023/1/10:\n\n"
-                    "Dataset used for training is now uploaded to [here](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
-                    "数据集已上传,您可以在[这里](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)下载。\n\n"
-                    "2023/1/9:\n\n"
-                    "Model inference has been fully converted to onnxruntime. There will be no more Runtime Error: Memory Limit Exceeded\n\n"
-                    "模型推理已全面转为onnxruntime,现在不会出现Runtime Error: Memory Limit Exceeded了。\n\n"
-                    "Now integrated to [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts) collection.\n\n"
-                    "现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
-                    )
     app.queue(concurrency_count=3).launch(show_api=False, share=args.share)
 
 import tempfile
 import logging
 logging.getLogger('numba').setLevel(logging.WARNING)
 import librosa
 import numpy as np
 import torch
 
 import gradio as gr
 import gradio.utils as gr_utils
 import gradio.processing_utils as gr_processing_utils
+from ONNXVITS_infer import SynthesizerTrn
 from text import text_to_sequence, _clean_text
 from text.symbols import symbols
 from mel_processing import spectrogram_torch
 import psutil
 from datetime import datetime
 
 def audio_postprocess(self, y):
     if y is None:
 
     return gr_processing_utils.encode_url_or_file_to_base64(file.name)
 
 
+language_marks = {
+    "日本語": "[JA]",
+    "简体中文": "[ZH]",
+    "English": "[EN]",
+    "Mix": "",
+}
+
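Note: each mark wraps the whole input (prefix and suffix) so the multilingual cleaner can route the span to the right phonemizer; `Mix` maps to the empty string because mixed input is expected to already carry its own `[JA]`/`[ZH]`/`[EN]` tags. A string-only sketch of the wrapping:

    marks = {"日本語": "[JA]", "简体中文": "[ZH]", "English": "[EN]", "Mix": ""}

    def mark(text, language):
        m = marks[language]
        return m + text + m  # same tag before and after the span

    assert mark("你好", "简体中文") == "[ZH]你好[ZH]"
    assert mark("[JA]こんにちは[JA]", "Mix") == "[JA]こんにちは[JA]"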
 gr.Audio.postprocess = audio_postprocess
 
 limitation = os.getenv("SYSTEM") == "spaces"  # limit text and audio length in huggingface spaces
+def create_tts_fn(model, hps, speaker_ids):
+    def tts_fn(text, speaker, language, speed, is_symbol):
+        if limitation:
+            text_len = len(re.sub("\[([A-Z]{2})\]", "", text))
+            max_len = 150
+            if is_symbol:
+                max_len *= 3
+            if text_len > max_len:
+                return "Error: Text is too long", None
+        if language is not None:
+            text = language_marks[language] + text + language_marks[language]
+        speaker_id = speaker_ids[speaker]
+        stn_tst = get_text(text, hps, is_symbol)
+        with no_grad():
+            x_tst = stn_tst.unsqueeze(0).to(device)
+            x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
+            sid = LongTensor([speaker_id]).to(device)
+            audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
+                                length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
+        del stn_tst, x_tst, x_tst_lengths, sid
+        return "Success", (hps.data.sampling_rate, audio)
+
+    return tts_fn
 
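Note: the limiter in `tts_fn` measures length after stripping the two-letter tags, so language marks do not eat into the 150-character budget (tripled for symbol input). For example:

    import re
    marked = "[ZH]你好[ZH]"
    print(len(marked))                                  # 10
    print(len(re.sub(r"\[([A-Z]{2})\]", "", marked)))   # 2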
+def create_vc_fn(model, hps, speaker_ids):
+    def vc_fn(original_speaker, target_speaker, input_audio):
+        if input_audio is None:
+            return "You need to upload an audio", None
+        sampling_rate, audio = input_audio
+        duration = audio.shape[0] / sampling_rate
+        if limitation and duration > 30:
+            return "Error: Audio is too long", None
+        original_speaker_id = speaker_ids[original_speaker]
+        target_speaker_id = speaker_ids[target_speaker]
+
+        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+        if len(audio.shape) > 1:
+            audio = librosa.to_mono(audio.transpose(1, 0))
+        if sampling_rate != hps.data.sampling_rate:
+            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
+        with no_grad():
+            y = torch.FloatTensor(audio)
+            y = y.unsqueeze(0)
+            spec = spectrogram_torch(y, hps.data.filter_length,
+                                     hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
+                                     center=False).to(device)
+            spec_lengths = LongTensor([spec.size(-1)]).to(device)
+            sid_src = LongTensor([original_speaker_id]).to(device)
+            sid_tgt = LongTensor([target_speaker_id]).to(device)
+            audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
+                0, 0].data.cpu().float().numpy()
+        del y, spec, spec_lengths, sid_src, sid_tgt
+        return "Success", (hps.data.sampling_rate, audio)
+
+    return vc_fn
 
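Note: before spectrogram extraction, `vc_fn` normalizes whatever Gradio delivers: integer PCM scaled to float32 in [-1, 1], stereo downmixed to mono, then resampling to the model rate. The same steps as a standalone sketch (the 22050 Hz default is illustrative; the app uses `hps.data.sampling_rate`):

    import numpy as np
    import librosa

    def prepare(audio, sr, target_sr=22050):
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)  # int PCM -> [-1, 1]
        if audio.ndim > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))  # (samples, channels) -> mono
        if sr != target_sr:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        return audio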
 def get_text(text, hps, is_symbol):
     text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
 
     text_norm = LongTensor(text_norm)
     return text_norm
 
+def create_to_symbol_fn(hps):
+    def to_symbol_fn(is_symbol_input, input_text, temp_text):
+        return (_clean_text(input_text, hps.data.text_cleaners), input_text) if is_symbol_input \
+            else (temp_text, temp_text)
+
+    return to_symbol_fn
 
 download_audio_js = """
 () =>{{
 
 }}
 """
 
+models_tts = []
+models_vc = []
+models_info = [
+    {
+        "title": "Japanese",
+        "languages": ["日本語"],
+        "description": "",
+        "model_path": "./pretrained_models/G_1153000.pth",
+        "config_path": "./configs/uma87.json",
+        "examples": [['お疲れ様です,トレーナーさん。', 'Silence Suzuka', '日本語', 1, False],
+                     ['張り切っていこう!', 'Kitasan Black', '日本語', 1, False],
+                     ['何でこんなに慣れでんのよ,私のほが先に好きだっだのに。', 'Grass Wonder', '日本語', 1, False],
+                     ['授業中に出しだら,学校生活終わるですわ。', 'Mejiro Mcqueen', '日本語', 1, False],
+                     ['お帰りなさい,お兄様!', 'Rice Shower', '日本語', 1, False],
+                     ['私の処女をもらっでください!', 'Rice Shower', '日本語', 1, False]]
+    },
+    {
+        "title": "Trilingual",
+        "languages": ['日本語', '简体中文', 'English', 'Mix'],
+        "description": "",
+        "model_path": "./pretrained_models/G_1396000.pth",
+        "config_path": "./configs/uma_trilingual.json",
+        "examples": [['你好,训练员先生,很高兴见到你。', '草上飞 Grass Wonder (Umamusume Pretty Derby)', '简体中文', 1, False],
+                     ['To be honest, I have no idea what to say as examples.', '派蒙 Paimon (Genshin Impact)', 'English', 1, False],
+                     ['授業中に出しだら,学校生活終わるですわ。', '綾地 寧々 Ayachi Nene (Sanoba Witch)', '日本語', 1, False]]
+    }
+]
+
+
+
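Note: the startup loop below indexes these entries by key, so a misspelled key only surfaces when the app boots. A small hedged guard over `models_info` (the `REQUIRED` tuple is purely illustrative, not part of the app):

    REQUIRED = ("title", "languages", "model_path", "config_path", "examples")
    for entry in models_info:
        missing = [k for k in REQUIRED if k not in entry]
        assert not missing, f"{entry['title']}: missing keys {missing}"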
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
     args = parser.parse_args()
+    for info in models_info:
+        name = info['title']
+        lang = info['languages']
+        examples = info['examples']
+        config_path = info['config_path']
+        model_path = info['model_path']
+        hps = utils.get_hparams_from_file(config_path)
+        model = SynthesizerTrn(
+            len(hps.symbols),
+            hps.data.filter_length // 2 + 1,
+            hps.train.segment_size // hps.data.hop_length,
+            n_speakers=hps.data.n_speakers,
+            **hps.model)
+        utils.load_checkpoint(model_path, model, None)
+        model.eval()
+        speaker_ids = hps.speakers
+        speakers = list(hps.speakers.keys())
+        models_tts.append((name, speakers, lang, examples,
+                           hps.symbols, create_tts_fn(model, hps, speaker_ids),
+                           create_to_symbol_fn(hps)))
+        models_vc.append((name, speakers, create_vc_fn(model, hps, speaker_ids)))
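Note: routing each model through the `create_tts_fn` / `create_vc_fn` factories is what keeps the tabs independent: a closure defined inline in this loop would late-bind `model`, and every tab would drive the last checkpoint loaded. A minimal demonstration of the pitfall and the fix:

    fns_buggy = [lambda: m for m in ("a", "b")]
    fns_fixed = [(lambda m_: (lambda: m_))(m) for m in ("a", "b")]
    assert [f() for f in fns_buggy] == ["b", "b"]  # late binding: both see the last m
    assert [f() for f in fns_fixed] == ["a", "b"]  # factory captures each value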
     app = gr.Blocks()
     with app:
+        gr.Markdown("# English & Chinese & Japanese Anime TTS\n\n"
                     "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=Plachta.VITS-Umamusume-voice-synthesizer)\n\n"
+                    "Including Japanese TTS & Trilingual TTS, speakers are all anime characters. 包含一个纯日语TTS和一个中日英三语TTS模型,主要为二次元角色。"
                     "If you have any suggestions or bug reports, feel free to open discussion in [Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions).\n\n"
                     "若有bug反馈或建议,请在[Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions)下开启一个新的Discussion。 \n\n"
                     )
+        with gr.Tabs():
+            with gr.TabItem("TTS"):
+                with gr.Tabs():
+                    for i, (name, speakers, lang, example, symbols, tts_fn, to_symbol_fn) in enumerate(models_tts):
+                        with gr.TabItem(name):
+                            with gr.Row():
+                                with gr.Column():
+                                    textbox = gr.TextArea(label="Text", placeholder="Type your sentence here (Maximum 150 words)", value="こんにちわ。", elem_id=f"tts-input")
+                                    with gr.Accordion(label="Phoneme Input", open=False):
+                                        temp_text_var = gr.Variable()
+                                        symbol_input = gr.Checkbox(value=False, label="Symbol input")
+                                        symbol_list = gr.Dataset(label="Symbol list", components=[textbox],
+                                                                 samples=[[x] for x in symbols],
+                                                                 elem_id=f"symbol-list")
+                                        symbol_list_json = gr.Json(value=symbols, visible=False)
+                                        symbol_input.change(to_symbol_fn,
+                                                            [symbol_input, textbox, temp_text_var],
+                                                            [textbox, temp_text_var])
+                                        symbol_list.click(None, [symbol_list, symbol_list_json], textbox,
+                                                          _js=f"""
+                                            (i, symbols, text) => {{
+                                                let root = document.querySelector("body > gradio-app");
+                                                if (root.shadowRoot != null)
+                                                    root = root.shadowRoot;
+                                                let text_input = root.querySelector("#tts-input").querySelector("textarea");
+                                                let startPos = text_input.selectionStart;
+                                                let endPos = text_input.selectionEnd;
+                                                let oldTxt = text_input.value;
+                                                let result = oldTxt.substring(0, startPos) + symbols[i] + oldTxt.substring(endPos);
+                                                text_input.value = result;
+                                                let x = window.scrollX, y = window.scrollY;
+                                                text_input.focus();
+                                                text_input.selectionStart = startPos + symbols[i].length;
+                                                text_input.selectionEnd = startPos + symbols[i].length;
+                                                text_input.blur();
+                                                window.scrollTo(x, y);
+                                                text = text_input.value;
+
+                                                return text;
+                                            }}""")
+                                    # select character
+                                    char_dropdown = gr.Dropdown(choices=speakers, value=speakers[0], label='character')
+                                    language_dropdown = gr.Dropdown(choices=lang, value=lang[0], label='language')
+                                    duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1, label='时长 Duration')
+                                with gr.Column():
+                                    text_output = gr.Textbox(label="Message")
+                                    audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
+                                    btn = gr.Button("Generate!")
+
+                                    download = gr.Button("Download Audio")
+                                    download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"))
+                            if len(lang) == 1:
+                                btn.click(tts_fn, inputs=[textbox, char_dropdown, None, duration_slider, symbol_input],
+                                          outputs=[text_output, audio_output])
+                            else:
+                                btn.click(tts_fn, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, symbol_input],
+                                          outputs=[text_output, audio_output])
+                            gr.Examples(
+                                examples=example,
+                                inputs=[textbox, char_dropdown, language_dropdown,
+                                        duration_slider, symbol_input],
+                                outputs=[text_output, audio_output],
+                                fn=tts_fn
+                            )
     app.queue(concurrency_count=3).launch(show_api=False, share=args.share)