Mahiruoshi committed
Commit
d94bbcb
1 Parent(s): 6379bc4

Update app.py

Files changed (1)
  1. app.py +224 -336
app.py CHANGED
@@ -2,7 +2,7 @@ import logging
2
  logging.getLogger('numba').setLevel(logging.WARNING)
3
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
4
  logging.getLogger('urllib3').setLevel(logging.WARNING)
5
- import romajitable
6
  import re
7
  import numpy as np
8
  import IPython.display as ipd
@@ -15,251 +15,97 @@ import gradio as gr
15
  import time
16
  import datetime
17
  import os
18
- import librosa
19
- class VitsGradio:
20
- def __init__(self):
21
- self.dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
22
- self.lan = ["中文","日文","自动","手动"]
23
- self.idols = ["c1","c2","高咲侑","歩夢","かすみ","しずく","果林","愛","彼方","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","華恋","まひる","なな","クロディーヌ","ひかり",'純那',"香子","真矢","双葉","ミチル","メイファン","やちよ","晶","いちえ","ゆゆ子","塁","珠緒","あるる","ララフィン","美空","静羽","あるる"]
24
- self.modelPaths = []
25
- for root,dirs,files in os.walk("checkpoints"):
26
- for dir in dirs:
27
- self.modelPaths.append(dir)
28
- with gr.Blocks() as self.Vits:
29
- gr.Markdown(
30
- "## <center> Lovelive虹团中日双语VITS\n"
31
- "### <center> 请不要生成会对个人以及企划造成侵害的内容\n"
32
- "<div align='center'>目前有标贝普通话版,去标贝版,少歌模型还是大饼状态</div>"
33
- '<div align="center"><a>参数说明:由于爱抖露们过于有感情,合成日语时建议将噪声比例调节至0.2-0.3区间,噪声偏差对应着每个字之间的间隔,对普通话影响较大,duration代表整体语速</div>'
34
- '<div align="center"><a>合成前请先选择模型,否则第一次合成不一定成功。长段落/小说合成建议colab或本地运行</div>')
35
- with gr.Tab("TTS合成"):
36
- with gr.Row():
37
- with gr.Column():
38
- with gr.Row():
39
- with gr.Column():
40
- input1 = gr.TextArea(label="Text", value="为什么你会那么熟练啊?你和雪菜亲过多少次了")
41
- input2 = gr.Dropdown(label="Language", choices=self.lan, value="自动", interactive=True)
42
- input3 = gr.Dropdown(label="Speaker", choices=self.idols, value="歩夢", interactive=True)
43
- btnVC = gr.Button("Submit")
44
- with gr.Column():
45
- input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267)
46
- input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
47
- input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
48
- output1 = gr.Audio(label="采样率22050")
49
- btnVC.click(self.infer, inputs=[input1, input2, input3, input4, input5, input6], outputs=[output1])
50
- with gr.Tab("选择模型"):
51
- with gr.Column():
52
- modelstrs = gr.Dropdown(label = "模型", choices = self.modelPaths, value = self.modelPaths[0], type = "value")
53
- btnMod = gr.Button("载入模型")
54
- statusa = gr.TextArea()
55
- btnMod.click(self.loadCk, inputs=[modelstrs], outputs = [statusa])
56
- with gr.Tab("Voice Conversion"):
57
- gr.Markdown("""
58
- 录制或上传声音,并选择要转换的音色。
59
- """)
60
- with gr.Column():
61
- record_audio = gr.Audio(label="record your voice", source="microphone")
62
- upload_audio = gr.Audio(label="or upload audio here", source="upload")
63
- source_speaker = gr.Dropdown(choices=self.idols, value="歩夢", label="source speaker")
64
- target_speaker = gr.Dropdown(choices=self.idols, value="歩夢", label="target speaker")
65
- with gr.Column():
66
- message_box = gr.Textbox(label="Message")
67
- converted_audio = gr.Audio(label='converted audio')
68
- btn = gr.Button("Convert!")
69
- btn.click(self.vc_fn, inputs=[source_speaker, target_speaker, record_audio, upload_audio],
70
- outputs=[message_box, converted_audio])
71
- with gr.Tab("小说合成(带字幕)"):
72
- with gr.Row():
73
- with gr.Column():
74
- with gr.Row():
75
- with gr.Column():
76
- input1 = gr.TextArea(label="建议colab或本地克隆后运行本仓库", value="为什么你会那么熟练啊?你和雪菜亲过多少次了")
77
- input2 = gr.Dropdown(label="Language", choices=self.lan, value="自动", interactive=True)
78
- input3 = gr.Dropdown(label="Speaker", choices=self.idols, value="歩夢", interactive=True)
79
- btnVC = gr.Button("Submit")
80
- with gr.Column():
81
- input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267)
82
- input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
83
- input6 = gr.Slider(minimum=0.1, maximum=10, label="Duration", value=1)
84
- output1 = gr.Audio(label="采样率22050")
85
- subtitle = gr.outputs.File(label="字幕文件:subtitles.srt")
86
- btnVC.click(self.infer2, inputs=[input1, input2, input3, input4, input5, input6], outputs=[output1,subtitle])
87
-
88
- def loadCk(self,path):
89
- self.hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
90
- n_symbols = len(self.hps.symbols) if 'symbols' in self.hps.keys() else 0
91
- self.net_g = SynthesizerTrn(
92
- n_symbols,
93
- self.hps.data.filter_length // 2 + 1,
94
- self.hps.train.segment_size // self.hps.data.hop_length,
95
- n_speakers=self.hps.data.n_speakers,
96
- **self.hps.model).to(self.dev)
97
- _ = self.net_g.eval()
98
- _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", self.net_g)
99
- return "success"
100
-
101
- def get_text(self,text):
102
- text_norm = text_to_sequence(text,self.hps.symbols,self.hps.data.text_cleaners)
103
- if self.hps.data.add_blank:
104
- text_norm = commons.intersperse(text_norm, 0)
105
- text_norm = torch.LongTensor(text_norm)
106
- return text_norm
107
-
108
- def is_japanese(self,string):
109
  for ch in string:
110
  if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
111
  return True
112
  return False
113
-
114
- def is_english(self,string):
115
  import re
116
  pattern = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
117
  if pattern.fullmatch(string):
118
  return True
119
  else:
120
  return False
121
-
122
- def selection(self,speaker):
123
- if speaker == "高咲侑":
124
- spk = 0
125
- return spk
126
-
127
- elif speaker == "歩夢":
128
- spk = 1
129
- return spk
130
-
131
- elif speaker == "かすみ":
132
- spk = 2
133
- return spk
134
 
135
- elif speaker == "しずく":
136
- spk = 3
137
- return spk
138
 
139
- elif speaker == "果林":
140
- spk = 4
141
- return spk
142
-
143
- elif speaker == "愛":
144
- spk = 5
145
- return spk
146
-
147
- elif speaker == "彼方":
148
- spk = 6
149
- return spk
150
-
151
- elif speaker == "せつ菜":
152
- spk = 7
153
- return spk
154
- elif speaker == "エマ":
155
- spk = 8
156
- return spk
157
- elif speaker == "璃奈":
158
- spk = 9
159
- return spk
160
- elif speaker == "栞子":
161
- spk = 10
162
- return spk
163
- elif speaker == "ランジュ":
164
- spk = 11
165
- return spk
166
- elif speaker == "ミア":
167
- spk = 12
168
- return spk
169
-
170
- elif speaker == "派蒙":
171
- spk = 16
172
- return spk
173
-
174
- elif speaker == "c1":
175
- spk = 18
176
- return spk
177
 
178
- elif speaker == "c2":
179
- spk = 19
180
- return spk
181
 
182
- elif speaker == "華恋":
183
- spk = 21
184
- return spk
185
 
186
- elif speaker == "まひる":
187
- spk = 22
188
- return spk
189
-
190
- elif speaker == "なな":
191
- spk = 23
192
- return spk
193
-
194
- elif speaker == "クロディーヌ":
195
- spk = 24
196
- return spk
197
-
198
- elif speaker == "ひかり":
199
- spk = 25
200
- return spk
201
-
202
- elif speaker == "純那":
203
- spk = 26
204
- return spk
205
-
206
- elif speaker == "香子":
207
- spk = 27
208
- return spk
209
-
210
- elif speaker == "真矢":
211
- spk = 28
212
- return spk
213
- elif speaker == "双葉":
214
- spk = 29
215
- return spk
216
- elif speaker == "ミチル":
217
- spk = 30
218
- return spk
219
- elif speaker == "メイファン":
220
- spk = 31
221
- return spk
222
- elif speaker == "やちよ":
223
- spk = 32
224
- return spk
225
- elif speaker == "晶":
226
- spk = 33
227
- return spk
228
- elif speaker == "いちえ":
229
- spk = 34
230
- return spk
231
- elif speaker == "ゆゆ子":
232
- spk = 35
233
- return spk
234
- elif speaker == "塁":
235
- spk = 36
236
- return spk
237
- elif speaker == "珠緒":
238
- spk = 37
239
- return spk
240
- elif speaker == "あるる":
241
- spk = 38
242
- return spk
243
- elif speaker == "ララフィン":
244
- spk = 39
245
- return spk
246
- elif speaker == "美空":
247
- spk = 40
248
- return spk
249
- elif speaker == "静羽":
250
- spk = 41
251
- return spk
252
- else:
253
- return 0
254
-
255
-
256
- def sle(self,language,text):
257
- text = text.replace('\n','。').replace(' ',',')
258
  if language == "中文":
259
  tts_input1 = "[ZH]" + text + "[ZH]"
260
  return tts_input1
261
  elif language == "自动":
262
- tts_input1 = f"[JA]{text}[JA]" if self.is_japanese(text) else f"[ZH]{text}[ZH]"
263
  return tts_input1
264
  elif language == "日文":
265
  tts_input1 = "[JA]" + text + "[JA]"
@@ -269,119 +115,161 @@ class VitsGradio:
269
  return tts_input1
270
  elif language == "手动":
271
  return text
272
-
273
- def extrac(self,text):
274
- text = re.sub("<[^>]*>","",text)
275
- result_list = re.split(r'\n', text)
276
- final_list = []
277
- for i in result_list:
278
- if self.is_english(i):
279
- i = romajitable.to_kana(i).katakana
280
- i = i.replace('\n','').replace(' ','')
281
- #Current length of single sentence: 20
282
- if len(i)>1:
283
- if len(i) > 20:
284
- try:
285
- cur_list = re.split(r'。|!', i)
286
- for i in cur_list:
287
- if len(i)>1:
288
- final_list.append(i+'。')
289
- except:
290
- pass
291
- else:
292
- final_list.append(i)
293
- final_list = [x for x in final_list if x != '']
294
- print(final_list)
295
- return final_list
296
-
297
- def vc_fn(self,original_speaker, target_speaker, record_audio, upload_audio):
298
- input_audio = record_audio if record_audio is not None else upload_audio
299
- if input_audio is None:
300
- return "You need to record or upload an audio", None
301
- sampling_rate, audio = input_audio
302
- original_speaker_id = self.selection(original_speaker)
303
- target_speaker_id = self.selection(target_speaker)
304
 
305
- audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
306
- if len(audio.shape) > 1:
307
- audio = librosa.to_mono(audio.transpose(1, 0))
308
- if sampling_rate != self.hps.data.sampling_rate:
309
- audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=self.hps.data.sampling_rate)
310
- with torch.no_grad():
311
- y = torch.FloatTensor(audio)
312
- y = y / max(-y.min(), y.max()) / 0.99
313
- y = y.to(self.dev)
314
- y = y.unsqueeze(0)
315
- spec = spectrogram_torch(y, self.hps.data.filter_length,
316
- self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
317
- center=False).to(self.dev)
318
- spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.dev)
319
- sid_src = torch.LongTensor([original_speaker_id]).to(self.dev)
320
- sid_tgt = torch.LongTensor([target_speaker_id]).to(self.dev)
321
- audio = self.net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
322
- 0, 0].data.cpu().float().numpy()
323
- del y, spec, spec_lengths, sid_src, sid_tgt
324
- return "Success", (self.hps.data.sampling_rate, audio)
325
-
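For reference, the audio preparation that vc_fn performs before calling voice_conversion amounts to three steps: scale the integer PCM samples returned by the Gradio Audio component to float32, downmix stereo to mono, and resample to the model's sampling rate. A minimal sketch of the same steps, assuming 16-bit input and librosa >= 0.9 (illustrative helper, not part of the app):

import numpy as np
import librosa

def preprocess_for_vc(sampling_rate, audio, target_sr):
    # integer PCM -> float32 in [-1, 1]
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    # stereo (n, 2) -> mono (n,)
    if audio.ndim > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    # match the sampling rate the model was trained on
    if sampling_rate != target_sr:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=target_sr)
    return audio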
326
- def infer(self, text ,language, speaker_id,n_scale= 0.667,n_scale_w = 0.8, l_scale = 1):
327
- try:
328
- speaker_id = int(self.selection(speaker_id))
329
  t1 = time.time()
330
- stn_tst = self.get_text(self.sle(language,text))
331
  with torch.no_grad():
332
- x_tst = stn_tst.unsqueeze(0).to(self.dev)
333
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(self.dev)
334
- sid = torch.LongTensor([speaker_id]).to(self.dev)
335
- audio = self.net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
336
  t2 = time.time()
337
  spending_time = "推理时间为:"+str(t2-t1)+"s"
338
  print(spending_time)
339
- return (self.hps.data.sampling_rate, audio)
340
- except:
341
- self.hps = utils.get_hparams_from_file(f"checkpoints/biaobei/config.json")
342
- self.net_g = SynthesizerTrn(
343
- len(symbols),
344
- self.hps.data.filter_length // 2 + 1,
345
- self.hps.train.segment_size // self.hps.data.hop_length,
346
- n_speakers=self.hps.data.n_speakers,
347
- **self.hps.model).to(self.dev)
348
- _ = self.net_g.eval()
349
- _ = utils.load_checkpoint(f"checkpoints/biaobei/model.pth", self.net_g)
350
 
351
- def infer2(self, text ,language, speaker_id,n_scale= 0.667,n_scale_w = 0.8, l_scale = 1):
352
- speaker_id = int(self.selection(speaker_id))
353
- a = ['【','[','(','(']
354
- b = ['',']',')',')']
355
- for i in a:
356
- text = text.replace(i,'<')
357
- for i in b:
358
- text = text.replace(i,'>')
359
- final_list = self.extrac(text.replace('“','').replace('”',''))
360
- audio_fin = []
361
- c = 0
362
- t = datetime.timedelta(seconds=0)
363
- f1 = open("subtitles.srt",'w',encoding='utf-8')
364
- for sentence in final_list:
365
- c +=1
366
- stn_tst = self.get_text(self.sle(language,sentence))
367
- with torch.no_grad():
368
- x_tst = stn_tst.unsqueeze(0).to(self.dev)
369
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(self.dev)
370
- sid = torch.LongTensor([speaker_id]).to(self.dev)
371
- t1 = time.time()
372
- audio = self.net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
373
- t2 = time.time()
374
- spending_time = "第"+str(c)+"句的推理时间为:"+str(t2-t1)+"s"
375
- print(spending_time)
376
- time_start = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
377
- last_time = datetime.timedelta(seconds=len(audio)/float(22050))
378
- t+=last_time
379
- time_end = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
380
- print(time_end)
381
- f1.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
382
- audio_fin.append(audio)
383
- file_path = "subtitles.srt"
384
- return (self.hps.data.sampling_rate, np.concatenate(audio_fin)),file_path
385
- print("开始部署")
386
- grVits = VitsGradio()
387
- grVits.Vits.launch()
2
  logging.getLogger('numba').setLevel(logging.WARNING)
3
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
4
  logging.getLogger('urllib3').setLevel(logging.WARNING)
5
+ import json
6
  import re
7
  import numpy as np
8
  import IPython.display as ipd
 
15
  import time
16
  import datetime
17
  import os
18
+ import pickle
19
+ import openai
20
+ from scipy.io.wavfile import write
21
+ def is_japanese(string):
22
  for ch in string:
23
  if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
24
  return True
25
  return False
26
+
27
+ def is_english(string):
28
  import re
29
  pattern = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
30
  if pattern.fullmatch(string):
31
  return True
32
  else:
33
  return False
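A quick illustration of what these two checks accept (my own examples, not from the app): is_japanese returns True as soon as one character falls inside the hiragana/katakana range U+3041..U+30FE, while is_english only accepts strings made entirely of ASCII letters, digits and basic punctuation.

print(is_japanese("こんにちは"))    # True  (hiragana)
print(is_japanese("你好"))          # False (CJK ideographs are outside the range)
print(is_english("Hello, world!"))  # True
print(is_english("Hello 世界"))     # False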
34
 
35
+ def extrac(text):
36
+ text = re.sub("<[^>]*>","",text)
37
+ result_list = re.split(r'\n', text)
38
+ final_list = []
39
+ for i in result_list:
40
+ if is_english(i):
41
+ i = romajitable.to_kana(i).katakana
42
+ i = i.replace('\n','').replace(' ','')
43
+ #Current length of single sentence: 20
44
+ '''
45
+ if len(i)>1:
46
+ if len(i) > 20:
47
+ try:
48
+ cur_list = re.split(r'。|!', i)
49
+ for i in cur_list:
50
+ if len(i)>1:
51
+ final_list.append(i+'。')
52
+ except:
53
+ pass
54
+ else:
55
+ final_list.append(i)
56
+ '''
57
+ final_list.append(i)
58
+ final_list = [x for x in final_list if x != '']
59
+ print(final_list)
60
+ return final_list
61
 
62
+ def to_numpy(tensor: torch.Tensor):
63
+ return tensor.detach().cpu().numpy() if tensor.requires_grad \
64
+ else tensor.detach().numpy()
65
 
66
+ def chatgpt(text):
67
+ messages = []
68
+ try:
69
+ if text != 'exist':
70
+ with open('log.pickle', 'rb') as f:
71
+ messages = pickle.load(f)
72
+ messages.append({"role": "user", "content": text},)
73
+ chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
74
+ reply = chat.choices[0].message.content
75
+ messages.append({"role": "assistant", "content": reply})
76
+ print(messages[-1])
77
+ if len(messages) == 12:
78
+ messages[6:10] = messages[8:]
79
+ del messages[-2:]
80
+ with open('log.pickle', 'wb') as f:
81
+ pickle.dump(messages, f)
82
+ return reply
83
+ except:
84
+ messages.append({"role": "user", "content": text},)
85
+ chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
86
+ reply = chat.choices[0].message.content
87
+ messages.append({"role": "assistant", "content": reply})
88
+ print(messages[-1])
89
+ if len(messages) == 12:
90
+ messages[6:10] = messages[8:]
91
+ del messages[-2:]
92
+ with open('log.pickle', 'wb') as f:
93
+ pickle.dump(messages, f)
94
+ return reply
95
 
96
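The history handling above keeps a rolling window in log.pickle: once the list reaches 12 messages, the slice assignment plus del splices out the oldest unpinned user/assistant pair (indices 6 and 7), so at most 10 entries are persisted. A minimal sketch of an equivalent, more explicit trimming step:

MAX_MESSAGES = 12
PINNED = 6  # the first 6 messages are always kept

if len(messages) == MAX_MESSAGES:
    # drop the oldest exchange after the pinned prefix -> 10 messages remain
    messages = messages[:PINNED] + messages[PINNED + 2:]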
+ def get_symbols_from_json(path):
97
+ assert os.path.isfile(path)
98
+ with open(path, 'r') as f:
99
+ data = json.load(f)
100
+ return data['symbols']
101
 
102
+ def sle(language,text):
103
+ text = text.replace('\n', '').replace('\r', '').replace(" ", "")
104
  if language == "中文":
105
  tts_input1 = "[ZH]" + text + "[ZH]"
106
  return tts_input1
107
  elif language == "自动":
108
+ tts_input1 = f"[JA]{text}[JA]" if is_japanese(text) else f"[ZH]{text}[ZH]"
109
  return tts_input1
110
  elif language == "日文":
111
  tts_input1 = "[JA]" + text + "[JA]"
 
115
  return tts_input1
116
  elif language == "手动":
117
  return text
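sle strips whitespace and wraps the text in the language tags the text cleaners expect; in 自动 (auto) mode the tag is chosen by the is_japanese check. Illustrative calls (hypothetical inputs):

sle("中文", "你好")          # "[ZH]你好[ZH]"
sle("自动", "こんにちは")     # "[JA]こんにちは[JA]"  (kana detected)
sle("自动", "早上好")         # "[ZH]早上好[ZH]"
sle("手动", "[ZH]你好[ZH]")   # returned unchanged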
118
 
119
+ def get_text(text,hps_ms):
120
+ text_norm = text_to_sequence(text,hps_ms.symbols,hps_ms.data.text_cleaners)
121
+ if hps_ms.data.add_blank:
122
+ text_norm = commons.intersperse(text_norm, 0)
123
+ text_norm = torch.LongTensor(text_norm)
124
+ return text_norm
125
+
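When add_blank is set in the config, commons.intersperse pads a blank token (id 0) around every symbol id before inference, matching what the model saw during training. A toy example (ids are illustrative):

seq = [12, 7, 43]                     # ids from text_to_sequence (illustrative)
padded = commons.intersperse(seq, 0)  # -> [0, 12, 0, 7, 0, 43, 0]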
126
+ def create_tts_fn(net_g,hps,speaker_id):
127
+ speaker_id = int(speaker_id)
128
+ def tts_fn(history,is_gpt,api_key,is_audio,audiopath,repeat_time,text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
129
+ repeat_time = int(repeat_time)
130
+ if is_gpt:
131
+ openai.api_key = api_key
132
+ text = chatgpt(text)
133
+ history[-1][1] = text
134
+ if not extract:
135
+ print(text)
136
  t1 = time.time()
137
+ stn_tst = get_text(sle(language,text),hps)
138
  with torch.no_grad():
139
+ x_tst = stn_tst.unsqueeze(0).to(dev)
140
+ x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
141
+ sid = torch.LongTensor([speaker_id]).to(dev)
142
+ audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
143
  t2 = time.time()
144
  spending_time = "推理时间为:"+str(t2-t1)+"s"
145
  print(spending_time)
146
+ file_path = "subtitles.srt"
147
+ try:
148
+ write(audiopath + '.wav',22050,audio)
149
+ if is_audio:
150
+ for i in range(repeat_time):
151
+ cmd = 'ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 '+ audiopath.replace('temp','temp'+str(i))
152
+ os.system(cmd)
153
+ except:
154
+ pass
155
+ return history,file_path,(hps.data.sampling_rate,audio)
156
+ else:
157
+ a = ['【','[','(','(']
158
+ b = ['】',']',')',')']
159
+ for i in a:
160
+ text = text.replace(i,'<')
161
+ for i in b:
162
+ text = text.replace(i,'>')
163
+ final_list = extrac(text.replace('“','').replace('”',''))
164
+ audio_fin = []
165
+ c = 0
166
+ t = datetime.timedelta(seconds=0)
167
+ f1 = open("subtitles.srt",'w',encoding='utf-8')
168
+ for sentence in final_list:
169
+ c +=1
170
+ stn_tst = get_text(sle(language,sentence),hps)
171
+ with torch.no_grad():
172
+ x_tst = stn_tst.unsqueeze(0).to(dev)
173
+ x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
174
+ sid = torch.LongTensor([speaker_id]).to(dev)
175
+ t1 = time.time()
176
+ audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
177
+ t2 = time.time()
178
+ spending_time = "第"+str(c)+"句的推理时间为:"+str(t2-t1)+"s"
179
+ print(spending_time)
180
+ time_start = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
181
+ last_time = datetime.timedelta(seconds=len(audio)/float(22050))
182
+ t+=last_time
183
+ time_end = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
184
+ print(time_end)
185
+ f1.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
186
+ audio_fin.append(audio)
187
+ try:
188
+ write(audiopath + '.wav',22050,np.concatenate(audio_fin))
189
+ if is_audio:
190
+ for i in range(repeat_time):
191
+ cmd = 'ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 '+ audiopath.replace('temp','temp'+str(i))
192
+ os.system(cmd)
193
+
194
+ except:
195
+ pass
196
+
197
+ file_path = "subtitles.srt"
198
+ return history,file_path,(hps.data.sampling_rate, np.concatenate(audio_fin))
199
+ return tts_fn
200
 
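In the long-text branch each sentence's duration is derived from its sample count at 22050 Hz and accumulated into a timedelta, which is then rendered as an SRT timestamp (H:MM:SS,mmm). A minimal sketch of that formatting under the same 22050 Hz assumption; it zero-pads the milliseconds, whereas the str(t.microseconds)[:3] form above can mis-render values below 100 ms:

import datetime

def srt_timestamp(t: datetime.timedelta) -> str:
    # "0:00:03,174" style timestamp; zero-pads milliseconds
    return str(t).split(".")[0] + "," + f"{t.microseconds // 1000:03d}"

num_samples = 70_000                                    # illustrative sentence length
t = datetime.timedelta(seconds=num_samples / 22050.0)   # ~3.17 s of audio
print(srt_timestamp(t))                                 # 0:00:03,174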
201
+ def bot(history,user_message):
202
+ return history + [[user_message, None]]
203
+
204
+ if __name__ == '__main__':
205
+ hps = utils.get_hparams_from_file('checkpoints/tmp/config.json')
206
+ dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
207
+ models = []
208
+ schools = ["Seisho-Nijigasaki","Seisho-betterchinese","Nijigasaki","Nijigasaki-biaobei"]
209
+ lan = ["中文","日文","自动","手动"]
210
+ with open("checkpoints/info.json", "r", encoding="utf-8") as f:
211
+ models_info = json.load(f)
212
+ for i in models_info:
213
+ checkpoint = models_info[i]["checkpoint"]
214
+ phone_dict = {
215
+ symbol: i for i, symbol in enumerate(hps.symbols)
216
+ }
217
+ n_symbols = len(hps.symbols) if 'symbols' in hps.keys() else 0
218
+ net_g = SynthesizerTrn(
219
+ n_symbols,
220
+ hps.data.filter_length // 2 + 1,
221
+ hps.train.segment_size // hps.data.hop_length,
222
+ n_speakers=hps.data.n_speakers,
223
+ **hps.model).to(dev)
224
+ _ = net_g.eval()
225
+ _ = utils.load_checkpoint(checkpoint, net_g)
226
+ school = models_info[i]
227
+ speakers = school["speakers"]
228
+ content = []
229
+ for j in speakers:
230
+ sid = int(speakers[j]['sid'])
231
+ title = school
232
+ example = speakers[j]['speech']
233
+ name = speakers[j]["name"]
234
+ content.append((sid, name, title, example, create_tts_fn(net_g,hps,sid)))
235
+ models.append(content)
236
+
237
+ with gr.Blocks() as app:
238
+ with gr.Tabs():
239
+ for i in schools:
240
+ with gr.TabItem(i):
241
+ for (sid, name, title, example, tts_fn) in models[schools.index(i)]:
242
+ with gr.TabItem(name):
243
+ with gr.Column():
244
+ with gr.Row():
245
+ with gr.Row():
246
+ gr.Markdown(
247
+ '<div align="center">'
248
+ f'<img style="width:auto;height:400px;" src="file/image/{name}.png">'
249
+ '</div>'
250
+ )
251
+ chatbot = gr.Chatbot()
252
+ with gr.Row():
253
+ with gr.Column(scale=0.85):
254
+ input1 = gr.TextArea(label="Text", value=example,lines = 1)
255
+ with gr.Column(scale=0.15, min_width=0):
256
+ btnVC = gr.Button("Send")
257
+ output1 = gr.Audio(label="采样率22050")
258
+ with gr.Accordion(label="Setting", open=False):
259
+ input2 = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
260
+ input3 = gr.Checkbox(value=False, label="长句切割(小说合成)")
261
+ input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
262
+ input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
263
+ input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
264
+ with gr.Accordion(label="Advanced Setting", open=False):
265
+ audio_input3 = gr.Dropdown(label="重复次数", choices=list(range(101)), value='0', interactive=True)
266
+ api_input1 = gr.Checkbox(value=False, label="接入chatgpt")
267
+ api_input2 = gr.TextArea(label="api-key",lines=1,value = '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')
268
+ output2 = gr.outputs.File(label="字幕文件:subtitles.srt")
269
+ audio_input1 = gr.Checkbox(value=False, label="修改音频路径(live2d)")
270
+ audio_input2 = gr.TextArea(label="音频路径",lines=1,value = 'D:/app_develop/live2d_whole/2010002/sounds/temp.wav')
271
+ btnVC.click(bot, inputs = [chatbot,input1], outputs = [chatbot]).then(
272
+ tts_fn, inputs=[chatbot,api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[chatbot,output2,output1]
273
+ )
274
+
275
+ app.launch()
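The startup block reads checkpoints/info.json and, for each model entry, expects a checkpoint path plus a speakers map carrying sid, name and speech (the example sentence preloaded into the textbox). The file itself is not part of this commit; the shape below is only inferred from the fields the loop accesses, and note that entries must appear in the same order as the hard-coded schools list, since the UI indexes models with schools.index(i).

# Illustrative structure only -- inferred from models_info[i]["checkpoint"],
# school["speakers"], and speakers[j]["sid" | "name" | "speech"] above.
models_info_example = {
    "Nijigasaki": {
        "checkpoint": "checkpoints/Nijigasaki/model.pth",
        "speakers": {
            "ayumu":  {"sid": 1, "name": "歩夢",   "speech": "为什么你会那么熟练啊?"},
            "kasumi": {"sid": 2, "name": "かすみ", "speech": "..."},
        },
    },
}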