Plachta committed on
Commit
39c9cf7
·
1 Parent(s): e7b849e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -8
app.py CHANGED
@@ -22,6 +22,8 @@ from mel_processing import spectrogram_torch
22
  import translators.server as tss
23
  import psutil
24
  from datetime import datetime
 
 
25
 
26
  def audio_postprocess(self, y):
27
  if y is None:
@@ -44,7 +46,7 @@ def audio_postprocess(self, y):
44
  gr.Audio.postprocess = audio_postprocess
45
 
46
  limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
47
- languages = ['日本語', '简体中文', 'English']
48
  characters = ['0:特别周', '1:无声铃鹿', '2:东海帝王', '3:丸善斯基',
49
  '4:富士奇迹', '5:小栗帽', '6:黄金船', '7:伏特加',
50
  '8:大和赤骥', '9:大树快车', '10:草上飞', '11:菱亚马逊',
@@ -126,19 +128,73 @@ def infer(text_raw, character, language, duration, noise_scale, noise_scale_w, i
126
  text = tss.google(text_raw, from_language='zh', to_language='ja')
127
  elif language == 'English':
128
  text = tss.google(text_raw, from_language='en', to_language='ja')
 
 
129
  char_id = int(character.split(':')[0])
130
  stn_tst = get_text(text, hps, is_symbol)
131
  with torch.no_grad():
132
  x_tst = stn_tst.unsqueeze(0)
133
  x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
134
  sid = torch.LongTensor([char_id])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=duration)[0][0,0].data.float().numpy()
136
  currentDateAndTime = datetime.now()
137
  print(f"Character {character} inference successful: {text}\n")
138
  if language != '日本語':
139
  print(f"translate from {language}: {text_raw}")
140
  show_memory_info(str(currentDateAndTime) + " infer调用后")
141
- return (text, (22050, audio))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
  download_audio_js = """
144
  () =>{{
@@ -173,7 +229,8 @@ if __name__ == "__main__":
173
  "您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
174
  "This model has been integrated to the model collections of [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts).\n\n"
175
  "现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
176
- "! ! ! 若有bug欢迎及时反馈 ! ! ! QQ:1925208426 \n\n"
 
177
  "If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
178
  "如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
179
  )
@@ -181,7 +238,7 @@ if __name__ == "__main__":
181
  with gr.Column():
182
  # We instantiate the Textbox class
183
  textbox = gr.TextArea(label="Text", placeholder="Type your sentence here (Maximum 150 words)", value="こんにちわ。", elem_id=f"tts-input")
184
- with gr.Accordion(label="Advanced Options", open=False):
185
  temp_text_var = gr.Variable()
186
  symbol_input = gr.Checkbox(value=False, label="Symbol input")
187
  symbol_list = gr.Dataset(label="Symbol list", components=[textbox],
@@ -226,9 +283,23 @@ if __name__ == "__main__":
226
  text_output = gr.Textbox(label="Output Text")
227
  audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
228
  btn = gr.Button("Generate!")
229
- btn.click(infer, inputs=[textbox, char_dropdown, language_dropdown,
230
- duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
231
- outputs=[text_output, audio_output])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  download = gr.Button("Download Audio")
233
  download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"))
234
  examples = [['haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......', '29:米浴', '日本語', 1, 0.667, 0.8, True],
@@ -246,16 +317,24 @@ if __name__ == "__main__":
246
  fn=infer
247
  )
248
  gr.Markdown("# Updates Logs 更新日志:\n\n"
 
 
 
249
  "2023/1/13:\n\n"
250
  "增加了音素输入的example(米浴喘气)\n\n"
 
251
  "2023/1/12:\n\n"
252
  "增加了音素输入的功能,可以对语气和语调做到一定程度的精细控制。\n\n"
 
253
  "调整了UI的布局。\n\n"
 
254
  "2023/1/10:\n\n"
255
  "数据集已上传,您可以在[这里](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)下载。\n\n"
 
256
  "2023/1/9:\n\n"
257
- "人物全是特别周的bug已修复,对此带来的不便感到十分抱歉。\n\n"
258
  "模型推理已全面转为onnxruntime,现在不会出现Runtime Error: Memory Limit Exceeded了。\n\n"
 
259
  "现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
 
260
  )
261
  app.queue(concurrency_count=3).launch(show_api=False, share=args.share)
 
22
  import translators.server as tss
23
  import psutil
24
  from datetime import datetime
25
+ import romajitable
26
+ from text.cleaners import japanese_cleaners
27
 
28
  def audio_postprocess(self, y):
29
  if y is None:
 
46
  gr.Audio.postprocess = audio_postprocess
47
 
48
  limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
49
+ languages = ['日本語', '简体中文', 'English', 'English2Katakana']
50
  characters = ['0:特别周', '1:无声铃鹿', '2:东海帝王', '3:丸善斯基',
51
  '4:富士奇迹', '5:小栗帽', '6:黄金船', '7:伏特加',
52
  '8:大和赤骥', '9:大树快车', '10:草上飞', '11:菱亚马逊',
 
128
  text = tss.google(text_raw, from_language='zh', to_language='ja')
129
  elif language == 'English':
130
  text = tss.google(text_raw, from_language='en', to_language='ja')
131
+ elif language == "English2Katakana":
132
+ text = romajitable.to_kana(text_raw).katakana
133
  char_id = int(character.split(':')[0])
134
  stn_tst = get_text(text, hps, is_symbol)
135
  with torch.no_grad():
136
  x_tst = stn_tst.unsqueeze(0)
137
  x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
138
  sid = torch.LongTensor([char_id])
139
+ jp2phoneme = japanese_cleaners(text)
140
+ durations = net_g.predict_duration(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
141
+ noise_scale_w=noise_scale_w, length_scale=duration)
142
+ char_dur_list = []
143
+ for i, char in enumerate(jp2phoneme):
144
+ char_pos = i * 2 + 1
145
+ char_dur = durations[char_pos]
146
+ char_dur_list.append(char_dur)
147
+ char_spacing_dur_list = []
148
+ char_spacings = []
149
+ for i in range(len(durations)):
150
+ if i % 2 == 0: # spacing
151
+ char_spacings.append("spacing")
152
+ elif i % 2 == 1: # char
153
+ char_spacings.append(jp2phoneme[int((i - 1) / 2)])
154
+ char_spacing_dur_list.append(int(durations[i]))
155
+ # convert duration information to string
156
+ duration_info_str = ""
157
+ for i in range(len(char_spacings)):
158
+ if char_spacings[i] == "spacing":
159
+ duration_info_str += str(char_spacing_dur_list[i])
160
+ else:
161
+ duration_info_str += "{" + char_spacings[i] + ":" + str(char_spacing_dur_list[i]) + "}"
162
+ if i != len(char_spacings)-1:
163
+ duration_info_str += ", "
164
  audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=duration)[0][0,0].data.float().numpy()
165
  currentDateAndTime = datetime.now()
166
  print(f"Character {character} inference successful: {text}\n")
167
  if language != '日本語':
168
  print(f"translate from {language}: {text_raw}")
169
  show_memory_info(str(currentDateAndTime) + " infer调用后")
170
+ return (text,(22050, audio), jp2phoneme, duration_info_str)
171
+
172
+ def infer_from_phoneme_dur(duration_info_str, character, duration, noise_scale, noise_scale_w):
173
+ try:
174
+ phonemes = duration_info_str.split(", ")
175
+ recons_durs = []
176
+ recons_phonemes = ""
177
+ for item in phonemes:
178
+ if "{" not in item: # spacing
179
+ recons_durs.append(int(item))
180
+ else:
181
+ recons_phonemes += item.strip("{}").split(":")[0]
182
+ recons_durs.append(int(item.strip("{}").split(":")[1]))
183
+ except ValueError:
184
+ return ("Error: Format must not be changed!", None)
185
+ except AssertionError:
186
+ return ("Error: Format must not be changed!", None)
187
+ char_id = int(character.split(':')[0])
188
+ stn_tst = get_text(recons_phonemes, hps, is_symbol=True)
189
+ with torch.no_grad():
190
+ x_tst = stn_tst.unsqueeze(0)
191
+ x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
192
+ sid = torch.LongTensor([char_id])
193
+ print(len(recons_durs))
194
+ print(x_tst.shape[1])
195
+ audio = net_g.infer_with_duration(x_tst, x_tst_lengths, w_ceil=recons_durs, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
196
+ length_scale=duration)[0][0, 0].data.cpu().float().numpy()
197
+ return (recons_phonemes, (22050, audio))
198
 
199
  download_audio_js = """
200
  () =>{{
 
229
  "您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
230
  "This model has been integrated to the model collections of [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts).\n\n"
231
  "现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
232
+ "If you have any suggestions or bug reports, feel free to open discussion in Community.\n\n"
233
+ "若有bug反馈或建议,请在Community下开启一个新的Discussion。 \n\n"
234
  "If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
235
  "如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
236
  )
 
238
  with gr.Column():
239
  # We instantiate the Textbox class
240
  textbox = gr.TextArea(label="Text", placeholder="Type your sentence here (Maximum 150 words)", value="こんにちわ。", elem_id=f"tts-input")
241
+ with gr.Accordion(label="Phoneme Input", open=False):
242
  temp_text_var = gr.Variable()
243
  symbol_input = gr.Checkbox(value=False, label="Symbol input")
244
  symbol_list = gr.Dataset(label="Symbol list", components=[textbox],
 
283
  text_output = gr.Textbox(label="Output Text")
284
  audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
285
  btn = gr.Button("Generate!")
286
+ with gr.Accordion(label="Speaking Pace Control", open=True):
287
+ phoneme_output = gr.Textbox(label="Output Phonemes", interactive=False)
288
+ duration_output = gr.Textbox(label="Duration of each phoneme", placeholder="After you generate a sentence, the detailed information of each phoneme's duration will be presented here. You can edit phoneme durations here and click regenerate for more precise control.",
289
+ interactive = True)
290
+ gr.Markdown(
291
+ "\{ \}内的数字代表每个音素在生成的音频中的长度,\{ \}外的数字代表音素之间间隔的长度。"
292
+ "您可以手动修改这些数字来控制每个音素以及间隔的长度,从而完全控制合成音频的说话节奏。"
293
+ "注意这些数字只能是整数。 \n\n(1 代表 0.01161 秒的长度)\n\n"
294
+ "The numbers inside \{ \} represent the length for each phoneme in the generated audio, while the numbers out of \{ \} represent the length of spacings between phonemes."
295
+ "You can manually change the numbers to adjust the length of each phoneme, so that speaking pace can be completely controlled."
296
+ "Note that these numbers should be integers only. \n\n(1 represents a length of 0.01161 seconds)\n\n"
297
+ )
298
+ cus_dur_gn_btn = gr.Button("Regenerate with custom phoneme durations")
299
+ btn.click(infer, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
300
+ outputs=[text_output, audio_output, phoneme_output, duration_output])
301
+ cus_dur_gn_btn.click(infer_from_phoneme_dur, inputs=[duration_output, char_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider],
302
+ outputs=[phoneme_output, audio_output])
303
  download = gr.Button("Download Audio")
304
  download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"))
305
  examples = [['haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......', '29:米浴', '日本語', 1, 0.667, 0.8, True],
 
317
  fn=infer
318
  )
319
  gr.Markdown("# Updates Logs 更新日志:\n\n"
320
+ "2023/1/24:\n\n"
321
+ "增加了对说话节奏的音素级控制。\n\n"
322
+ "Added more precise control on pace of speaking by modifying the duration of each phoneme.\n\n"
323
  "2023/1/13:\n\n"
324
  "增加了音素输入的example(米浴喘气)\n\n"
325
+ "Added one example of phoneme input.\n\n"
326
  "2023/1/12:\n\n"
327
  "增加了音素输入的功能,可以对语气和语调做到一定程度的精细控制。\n\n"
328
+ "Added phoneme input, which enables more precise control on output audio.\n\n"
329
  "调整了UI的布局。\n\n"
330
+ "Adjusted UI arrangements.\n\n"
331
  "2023/1/10:\n\n"
332
  "数据集已上传,您可以在[这里](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)下载。\n\n"
333
+ "Dataset used for training is now uploaded to [here](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
334
  "2023/1/9:\n\n"
 
335
  "模型推理已全面转为onnxruntime,现在不会出现Runtime Error: Memory Limit Exceeded了。\n\n"
336
+ "Model inference has been fully converted to onnxruntime. There will be no more Runtime Error: Memory Limit Exceeded\n\n"
337
  "现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
338
+ "Now integrated to [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts) collection.\n\n"
339
  )
340
  app.queue(concurrency_count=3).launch(show_api=False, share=args.share)