Mahiruoshi committed
Commit 6c297e8 · 1 Parent(s): c3497ff

Update main.py

main.py CHANGED
@@ -45,9 +45,8 @@ def extrac(text):
         i = romajitable.to_kana(i).katakana
         i = i.replace('\n','').replace(' ','')
         #Current length of single sentence: 20
-        '''
         if len(i)>1:
-            if len(i) > 
+            if len(i) > 50:
                 try:
                     cur_list = re.split(r'。|!', i)
                     for i in cur_list:
@@ -59,6 +58,7 @@ def extrac(text):
                         final_list.append(i)
         '''
         final_list.append(i)
+        '''
     final_list = [x for x in final_list if x != '']
     print(final_list)
     return final_list
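Note: the net effect of these two hunks is to flip which block of extrac is commented out. Deleting the opening ''' makes the length-gated re.split branch live (its condition is completed to len(i) > 50); the ''' that used to close the comment block now opens a new one, and the added ''' closes it, so the old unconditional final_list.append(i) is what ends up commented out. A minimal, self-contained sketch of the splitting rule that is now active (MAX_LEN and the function name are illustrative, not from main.py):

    import re

    MAX_LEN = 50  # mirrors the hard-coded threshold this commit introduces

    def split_long_sentence(s: str) -> list[str]:
        # Strip newlines/spaces as extrac does, then split overlong text
        # on Japanese sentence enders, mirroring re.split(r'。|!', i).
        s = s.replace('\n', '').replace(' ', '')
        if len(s) <= MAX_LEN:
            return [s] if s else []
        return [p for p in re.split(r'。|!', s) if p != '']

    print(split_long_sentence('こんにちは。元気ですか!' * 10))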
@@ -121,7 +121,7 @@ def sle(language,text):
     return text
 
 def get_text(text,hps_ms):
-    text_norm = text_to_sequence(text,hps_ms.data.text_cleaners)
+    text_norm = text_to_sequence(text,hps_ms.symbols,hps_ms.data.text_cleaners)
     if hps_ms.data.add_blank:
         text_norm = commons.intersperse(text_norm, 0)
     text_norm = torch.LongTensor(text_norm)
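Note: the only change here is threading hps_ms.symbols through to text_to_sequence, so the cleaner uses the model's own symbol table. The unchanged add_blank branch calls commons.intersperse, which in VITS-style code interleaves a blank token (0) around every id. A standalone illustration of that behavior (reimplemented here for clarity; the project's own commons.intersperse may differ in detail):

    import torch

    def intersperse(lst, item):
        # [a, b, c] -> [item, a, item, b, item, c, item]
        result = [item] * (len(lst) * 2 + 1)
        result[1::2] = lst
        return result

    seq = [5, 12, 7]                  # toy phoneme ids
    blanked = intersperse(seq, 0)     # what the add_blank branch produces
    print(torch.LongTensor(blanked))  # -> tensor([0, 5, 0, 12, 0, 7, 0])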
@@ -129,9 +129,11 @@ def get_text(text,hps_ms):
 
 def create_tts_fn(net_g,hps,speaker_id):
     speaker_id = int(speaker_id)
-    def tts_fn(history,is_gpt,api_key,is_audio,audiopath,repeat_time,text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
+    def tts_fn(is_transfer,original_speaker, target_speaker,history,is_gpt,api_key,is_audio,audiopath,repeat_time,text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
         text = check_text(text)
         repeat_time = int(repeat_time)
+        original_speaker_id = selection(original_speaker)
+        target_speaker_id = selection(target_speaker)
         if is_gpt:
             openai.api_key = api_key
             text = chatgpt(text)
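Note: selection is not shown in this diff; from its use it evidently maps a speaker name chosen in a dropdown to an integer speaker id. A hypothetical stand-in with the same shape (the names and ordering below are made up; the real mapping lives elsewhere in main.py):

    # Hypothetical stand-in for the selection() helper used above.
    idols = ['speaker_a', 'speaker_b', 'speaker_c']  # placeholder names

    def selection(speaker: str) -> int:
        return idols.index(speaker) if speaker in idols else 0

    print(selection('speaker_b'))  # -> 1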
@@ -166,40 +168,56 @@ def create_tts_fn(net_g,hps,speaker_id):
         for i in b:
             text = text.replace(i,'>')
         final_list = extrac(text.replace('“','').replace('”',''))
-        (34 removed lines: the previous single-pass synthesis loop; its content was not preserved in this diff view)
+        split_list = []
+        while len(final_list) > 0:
+            split_list.append(final_list[:500])
+            final_list = final_list[500:]
+        c0 = 0
+        for lists in split_list:
+            audio_fin = []
+            t = datetime.timedelta(seconds=0)
+            c = 0
+            f1 = open(audiopath.replace('.wav',str(c0)+".srt"),'w',encoding='utf-8')
+            for sentence in lists:
+                try:
+                    c +=1
+                    stn_tst = get_text(sle(language,sentence),hps)
+                    with torch.no_grad():
+                        x_tst = stn_tst.unsqueeze(0).to(dev)
+                        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
+                        sid = torch.LongTensor([original_speaker_id]).to(dev)
+                        t1 = time.time()
+                        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
+                        t2 = time.time()
+                        spending_time = "第"+str(c)+"句的推理时间为:"+str(t2-t1)+"s"
+                        print(spending_time)
+                        time_start = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
+                        last_time = datetime.timedelta(seconds=len(audio)/float(22050))
+                        t+=last_time
+                        time_end = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
+                        print(time_end)
+                        f1.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
+                    if is_transfer:
+                        with torch.no_grad():
+                            y = torch.FloatTensor(audio)
+                            y = y / max(-y.min(), y.max()) / 0.99
+                            y = y.to(dev)
+                            y = y.unsqueeze(0)
+                            spec = spectrogram_torch(y, hps.data.filter_length,
+                                hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
+                                center=False).to(dev)
+                            spec_lengths = torch.LongTensor([spec.size(-1)]).to(dev)
+                            sid_src = torch.LongTensor([original_speaker_id]).to(dev)
+                            sid_tgt = torch.LongTensor([target_speaker_id]).to(dev)
+                            audio = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
+                                0, 0].data.cpu().float().numpy()
+                            del y, spec, spec_lengths, sid_src, sid_tgt
+                    audio_fin.append(audio)
+                except:
+                    pass
+            write(audiopath.replace('.wav',str(c0)+'.wav'),22050,np.concatenate(audio_fin))
+            c0 += 1
+        file_path = audiopath.replace('.wav',str(c0)+".srt")
         return history,file_path,(hps.data.sampling_rate, np.concatenate(audio_fin))
     return tts_fn
 
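Note: the new loop batches final_list into chunks of 500 sentences, writes one numbered .wav/.srt pair per chunk (the chunk index is spliced into audiopath before the extension), and builds each subtitle cue by accumulating a datetime.timedelta from len(audio)/22050. One caveat worth flagging: str(t.microseconds)[:3] only yields correct milliseconds when the microseconds value happens to have six digits. A strict SRT timestamp formatter for comparison (the sample counts below are made up):

    import datetime

    SR = 22050  # sampling rate assumed throughout the diff

    def srt_timestamp(t: datetime.timedelta) -> str:
        # Exact "HH:MM:SS,mmm", unlike the str(t.microseconds)[:3] trick.
        total_ms = int(t.total_seconds() * 1000)
        h, rem = divmod(total_ms, 3600_000)
        m, rem = divmod(rem, 60_000)
        s, ms = divmod(rem, 1000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    t = datetime.timedelta(seconds=0)
    for n_samples in (44100, 11025, 66150):  # made-up clip lengths
        start = srt_timestamp(t)
        t += datetime.timedelta(seconds=n_samples / SR)
        print(start, '-->', srt_timestamp(t))

Also note that file_path is computed after c0 has already been incremented past the last chunk, so the returned .srt path appears to point one index beyond the files actually written; the per-chunk files themselves are named correctly.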
@@ -460,7 +478,6 @@ if __name__ == '__main__':
                 output1 = gr.Audio(label="采样率22050")
                 with gr.Accordion(label="Setting", open=False):
                     input2 = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
-                    input3 = gr.Checkbox(value=False, label="长句切割(小说合成)")
                     input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
                     input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
                     input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
@@ -468,17 +485,21 @@ if __name__ == '__main__':
                     audio_input3 = gr.Dropdown(label="重复次数", choices=list(range(101)), value='0', interactive=True)
                     api_input1 = gr.Checkbox(value=False, label="接入chatgpt")
                     api_input2 = gr.TextArea(label="api-key",lines=1,value = '懂得都懂')
+                with gr.Accordion(label="Advanced Setting", open=False):
                     output2 = gr.outputs.File(label="字幕文件:subtitles.srt")
-                    audio_input1 = gr.Checkbox(value=False, label="
+                    audio_input1 = gr.Checkbox(value=False, label="保存路径")
                     audio_input2 = gr.TextArea(label="音频路径",lines=1,value = 'D:/path/to/live2d/sounds/temp.wav')
                     input3 = gr.Checkbox(value=False, label="长句切割(小说合成)")
                     inputxt = gr.File(label="Text")
+                    is_transfer = gr.Checkbox(value=False, label="是否声线转化")
+                    source_speaker = gr.Dropdown(choices=idols, value=name, label="source speaker")
+                    target_speaker = gr.Dropdown(choices=idols, value=name, label="target speaker")
                 btnbook = gr.Button("小说合成")
                 btnVC.click(bot, inputs = [chatbot,input1], outputs = [chatbot]).then(
-                    tts_fn, inputs=[chatbot,api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[chatbot,output2,output1]
+                    tts_fn, inputs=[is_transfer,source_speaker,target_speaker,chatbot,api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[chatbot,output2,output1]
                 )
                 btnbook.click(bot, inputs = [chatbot,inputxt], outputs = [chatbot]).then(
-                    tts_fn, inputs=[chatbot,api_input1,api_input2,audio_input1,audio_input2,audio_input3,inputxt,input2,input3,input4,input5,input6], outputs=[chatbot,output2,output1]
+                    tts_fn, inputs=[is_transfer,source_speaker,target_speaker,chatbot,api_input1,api_input2,audio_input1,audio_input2,audio_input3,inputxt,input2,input3,input4,input5,input6], outputs=[chatbot,output2,output1]
                 )
             with gr.Tab("Voice Conversion(类似sovits)"):
                 gr.Markdown("""
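Note: both buttons share one wiring pattern: .click(bot, ...) updates the chatbot first, then .then(tts_fn, ...) runs synthesis on the result, now with is_transfer and the two speaker dropdowns prepended to the inputs to match the new tts_fn signature. A pared-down sketch of that chaining (the component names and callbacks here are illustrative, not the ones in main.py):

    import gradio as gr

    def bot(history, msg):
        # Placeholder for the real bot(): append the user turn.
        history = history or []
        return history + [[msg, None]]

    def tts_fn(history):
        # Placeholder for the real tts_fn(): synthesis would happen here.
        return history

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot()
        msg = gr.Textbox()
        btn = gr.Button("Send")
        # .then() fires after the first callback completes, the same
        # pattern used for btnVC and btnbook in this commit.
        btn.click(bot, inputs=[chatbot, msg], outputs=[chatbot]).then(
            tts_fn, inputs=[chatbot], outputs=[chatbot]
        )

    # demo.launch()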