Files changed (1)
  1. app.py +328 -567
app.py CHANGED
@@ -1,24 +1,13 @@
- import gradio as gr
- import numpy as np
- import soundfile as sf
- from datetime import datetime
- from time import time as ttime
- from my_utils import load_audio
- from transformers import pipeline
- from text.cleaner import clean_text
- from polyglot.detect import Detector
- from feature_extractor import cnhubert
- from timeit import default_timer as timer
- from text import cleaned_text_to_sequence
- from module.models import SynthesizerTrn
- from module.mel_processing import spectrogram_torch
- from transformers.pipelines.audio_utils import ffmpeg_read
- import os,re,sys,LangSegment,librosa,pdb,torch,pytz,random
- from transformers import AutoModelForMaskedLM, AutoTokenizer
- from AR.models.t2s_lightning_module import Text2SemanticLightningModule
-
-
- import logging
  logging.getLogger("markdown_it").setLevel(logging.ERROR)
  logging.getLogger("urllib3").setLevel(logging.ERROR)
  logging.getLogger("httpcore").setLevel(logging.ERROR)
@@ -26,43 +15,67 @@ logging.getLogger("httpx").setLevel(logging.ERROR)
  logging.getLogger("asyncio").setLevel(logging.ERROR)
  logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
  logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
- logging.getLogger("multipart").setLevel(logging.WARNING)
- from download import *
- download()
  if "_CUDA_VISIBLE_DEVICES" in os.environ:
      os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
- tz = pytz.timezone('Asia/Singapore')
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- def abs_path(dir):
-     global_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
-     return(os.path.join(global_dir, dir))
- gpt_path = abs_path("MODELS/22/22.ckpt")
- sovits_path = abs_path("MODELS/22/22.pth")
- cnhubert_base_path = os.environ.get("cnhubert_base_path", "pretrained_models/chinese-hubert-base")
- bert_path = os.environ.get("bert_path", "pretrained_models/chinese-roberta-wwm-ext-large")
-
- if not os.path.exists(cnhubert_base_path):
-     cnhubert_base_path = "TencentGameMate/chinese-hubert-base"
- if not os.path.exists(bert_path):
-     bert_path = "hfl/chinese-roberta-wwm-ext-large"
  cnhubert.cnhubert_base_path = cnhubert_base_path
 
- whisper_path = os.environ.get("whisper_path", "pretrained_models/whisper-tiny")
- if not os.path.exists(whisper_path):
-     whisper_path = "openai/whisper-tiny"
 
- pipe = pipeline(
-     task="automatic-speech-recognition",
-     model=whisper_path,
-     chunk_length_s=30,
-     device=device,)
 
- is_half = eval(
-     os.environ.get("is_half", "True" if torch.cuda.is_available() else "False")
- )
 
  tokenizer = AutoTokenizer.from_pretrained(bert_path)
  bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
@@ -187,63 +200,17 @@ def get_spepc(hps, filename):
 
 
  dict_language = {
-     ("中文1"): "all_zh",  # treat all input as Chinese
-     ("English"): "en",  # treat all input as English (unchanged)
-     ("日文1"): "all_ja",  # treat all input as Japanese
-     ("中文"): "zh",  # mixed Chinese/English recognition (unchanged)
-     ("日本語"): "ja",  # mixed Japanese/English recognition (unchanged)
-     ("混合"): "auto",  # multilingual: split, then detect the language of each segment
  }
 
 
- def splite_en_inf(sentence, language):
-     pattern = re.compile(r'[a-zA-Z ]+')
-     textlist = []
-     langlist = []
-     pos = 0
-     for match in pattern.finditer(sentence):
-         start, end = match.span()
-         if start > pos:
-             textlist.append(sentence[pos:start])
-             langlist.append(language)
-         textlist.append(sentence[start:end])
-         langlist.append("en")
-         pos = end
-     if pos < len(sentence):
-         textlist.append(sentence[pos:])
-         langlist.append(language)
-     # Merge punctuation into previous word
-     for i in range(len(textlist)-1, 0, -1):
-         if re.match(r'^[\W_]+$', textlist[i]):
-             textlist[i-1] += textlist[i]
-             del textlist[i]
-             del langlist[i]
-     # Merge consecutive words with the same language tag
-     i = 0
-     while i < len(langlist) - 1:
-         if langlist[i] == langlist[i+1]:
-             textlist[i] += textlist[i+1]
-             del textlist[i+1]
-             del langlist[i+1]
-         else:
-             i += 1
-
-     return textlist, langlist
-
-
  def clean_text_inf(text, language):
-     formattext = ""
-     language = language.replace("all_","")
-     for tmp in LangSegment.getTexts(text):
-         if language == "ja":
-             if tmp["lang"] == language or tmp["lang"] == "zh":
-                 formattext += tmp["text"] + " "
-             continue
-         if tmp["lang"] == language:
-             formattext += tmp["text"] + " "
-     while "  " in formattext:
-         formattext = formattext.replace("  ", " ")
-     phones, word2ph, norm_text = clean_text(formattext, language)
      phones = cleaned_text_to_sequence(phones)
      return phones, word2ph, norm_text
 
@@ -261,57 +228,6 @@ def get_bert_inf(phones, word2ph, norm_text, language):
261
  return bert
262
 
263
 
264
- def nonen_clean_text_inf(text, language):
265
- if(language!="auto"):
266
- textlist, langlist = splite_en_inf(text, language)
267
- else:
268
- textlist=[]
269
- langlist=[]
270
- for tmp in LangSegment.getTexts(text):
271
- langlist.append(tmp["lang"])
272
- textlist.append(tmp["text"])
273
- print(textlist)
274
- print(langlist)
275
- phones_list = []
276
- word2ph_list = []
277
- norm_text_list = []
278
- for i in range(len(textlist)):
279
- lang = langlist[i]
280
- phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
281
- phones_list.append(phones)
282
- if lang == "zh":
283
- word2ph_list.append(word2ph)
284
- norm_text_list.append(norm_text)
285
- print(word2ph_list)
286
- phones = sum(phones_list, [])
287
- word2ph = sum(word2ph_list, [])
288
- norm_text = ' '.join(norm_text_list)
289
-
290
- return phones, word2ph, norm_text
291
-
292
-
293
- def nonen_get_bert_inf(text, language):
294
- if(language!="auto"):
295
- textlist, langlist = splite_en_inf(text, language)
296
- else:
297
- textlist=[]
298
- langlist=[]
299
- for tmp in LangSegment.getTexts(text):
300
- langlist.append(tmp["lang"])
301
- textlist.append(tmp["text"])
302
- print(textlist)
303
- print(langlist)
304
- bert_list = []
305
- for i in range(len(textlist)):
306
- lang = langlist[i]
307
- phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
308
- bert = get_bert_inf(phones, word2ph, norm_text, lang)
309
- bert_list.append(bert)
310
- bert = torch.cat(bert_list, dim=1)
311
-
312
- return bert
313
-
314
-
315
  splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", }
316
 
317
 
@@ -321,23 +237,63 @@ def get_first(text):
      return text
 
 
- def get_cleaned_text_final(text,language):
      if language in {"en","all_zh","all_ja"}:
-         phones, word2ph, norm_text = clean_text_inf(text, language)
      elif language in {"zh", "ja","auto"}:
-         phones, word2ph, norm_text = nonen_clean_text_inf(text, language)
-     return phones, word2ph, norm_text
 
- def get_bert_final(phones, word2ph, text,language,device):
-     if language == "en":
-         bert = get_bert_inf(phones, word2ph, text, language)
-     elif language in {"zh", "ja","auto"}:
-         bert = nonen_get_bert_inf(text, language)
-     elif language == "all_zh":
-         bert = get_bert_feature(text, word2ph).to(device)
-     else:
-         bert = torch.zeros((1024, len(phones))).to(device)
-     return bert
 
  def merge_short_text_in_array(texts, threshold):
      if (len(texts)) < 2:
@@ -356,108 +312,100 @@ def merge_short_text_in_array(texts, threshold):
          result[len(result) - 1] += text
      return result
 
-
- def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=("Do not split"), volume_scale=1.0):
-     if not duration(ref_wav_path):
-         return None
-     if text == '':
-         wprint("Please enter text to generate/请输入生成文字")
-         return None
      t0 = ttime()
-     startTime=timer()
-     text=trim_text(text,text_language)
-     change_sovits_weights(sovits_path)
-     tprint(f'🏕️LOADED SoVITS Model: {sovits_path}')
-     change_gpt_weights(gpt_path)
-     tprint(f'🏕️LOADED GPT Model: {gpt_path}')
-
      prompt_language = dict_language[prompt_language]
-     try:
-         text_language = dict_language[text_language]
-     except KeyError as e:
-         wprint(f"Unsupported language type: {e}")
-         return None
-
-     prompt_text = prompt_text.strip("\n")
-     if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "."
      text = text.strip("\n")
      if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text
-     # print(("实际输入的参考文本:"), prompt_text)
-     # print(("📝实际输入的目标文本:"), text)
      zero_wav = np.zeros(
          int(hps.data.sampling_rate * 0.3),
          dtype=np.float16 if is_half == True else np.float32,
      )
-     with torch.no_grad():
-         wav16k, sr = librosa.load(ref_wav_path, sr=16000)
-         if (wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000):
-             errinfo = '参考音频在3~10秒范围外,请更换!'
-             raise OSError((errinfo))
-         wav16k = torch.from_numpy(wav16k)
-         zero_wav_torch = torch.from_numpy(zero_wav)
-         if is_half == True:
-             wav16k = wav16k.half().to(device)
-             zero_wav_torch = zero_wav_torch.half().to(device)
-         else:
-             wav16k = wav16k.to(device)
-             zero_wav_torch = zero_wav_torch.to(device)
-         wav16k = torch.cat([wav16k, zero_wav_torch])
-         ssl_content = ssl_model.model(wav16k.unsqueeze(0))[
-             "last_hidden_state"
-         ].transpose(
-             1, 2
-         )  # .float()
-         codes = vq_model.extract_latent(ssl_content)
-         prompt_semantic = codes[0, 0]
-     t1 = ttime()
 
-     phones1, word2ph1, norm_text1 = get_cleaned_text_final(prompt_text, prompt_language)
 
-     if (how_to_cut == ("Split into groups of 4 sentences")):
          text = cut1(text)
-     elif (how_to_cut == ("Split every 50 characters")):
          text = cut2(text)
-     elif (how_to_cut == ("Split at CN/JP periods (。)")):
          text = cut3(text)
-     elif (how_to_cut == ("Split at English periods (.)")):
          text = cut4(text)
-     elif (how_to_cut == ("Split at punctuation marks")):
          text = cut5(text)
      while "\n\n" in text:
          text = text.replace("\n\n", "\n")
-     print(f"🧨实际输入的目标文本(切句后):{text}\n")
      texts = text.split("\n")
      texts = merge_short_text_in_array(texts, 5)
      audio_opt = []
-     bert1 = get_bert_final(phones1, word2ph1, norm_text1, prompt_language, device).to(dtype)
 
      for text in texts:
          if (len(text.strip()) == 0):
              continue
          if (text[-1] not in splits): text += "。" if text_language != "en" else "."
-         print(("\n🎈实际输入的目标文本(每句):"), text)
-         phones2, word2ph2, norm_text2 = get_cleaned_text_final(text, text_language)
-         try:
-             bert2 = get_bert_final(phones2, word2ph2, norm_text2, text_language, device).to(dtype)
-         except RuntimeError as e:
-             wprint(f"The input text does not match the language/输入文本与语言不匹配: {e}")
-             return None
-         bert = torch.cat([bert1, bert2], 1)
 
-         all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
          bert = bert.to(device).unsqueeze(0)
          all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
-         prompt = prompt_semantic.unsqueeze(0).to(device)
          t2 = ttime()
          with torch.no_grad():
              # pred_semantic = t2s_model.model.infer(
              pred_semantic, idx = t2s_model.model.infer_panel(
                  all_phoneme_ids,
                  all_phoneme_len,
-                 prompt,
                  bert,
                  # prompt_phone_len=ph_offset,
-                 top_k=config["inference"]["top_k"],
                  early_stop_num=hz * max_sec,
              )
          t3 = ttime()
@@ -471,34 +419,24 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
          else:
              refer = refer.to(device)
          # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
-         try:
-             audio = (
                  vq_model.decode(
                      pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer
                  )
                  .detach()
                  .cpu()
                  .numpy()[0, 0]
-             )
-         except RuntimeError as e:
-             wprint(f"The input text does not match the language/输入文本与语言不匹配: {e}")
-             return None
-
-         max_audio = np.abs(audio).max()
          if max_audio > 1: audio /= max_audio
          audio_opt.append(audio)
          audio_opt.append(zero_wav)
          t4 = ttime()
      print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
-     # yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
-     audio_data = (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
-
-     audio_data = (audio_data.astype(np.float32) * volume_scale).astype(np.int16)
-     output_wav = "output_audio.wav"
-     sf.write(output_wav, audio_data, hps.data.sampling_rate)
-     endTime = timer()
-     tprint(f'🆗TTS COMPLETE,{round(endTime-startTime,4)}s')
-     return output_wav
 
  def split(todo_text):
      todo_text = todo_text.replace("……", "。").replace("——", ",")
@@ -509,7 +447,7 @@ def split(todo_text):
      todo_texts = []
      while 1:
          if i_split_head >= len_text:
-             break
          if todo_text[i_split_head] in splits:
              i_split_head += 1
              todo_texts.append(todo_text[i_split_tail:i_split_head])
@@ -530,6 +468,7 @@ def cut1(inp):
              opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]]))
      else:
          opts = [inp]
      return "\n".join(opts)
 
@@ -551,35 +490,49 @@ def cut2(inp):
          if tmp_str != "":
              opts.append(tmp_str)
      # print(opts)
-     if len(opts) > 1 and len(opts[-1]) < 50:
          opts[-2] = opts[-2] + opts[-1]
          opts = opts[:-1]
      return "\n".join(opts)
 
 
  def cut3(inp):
      inp = inp.strip("\n")
-     return "\n".join(["%s" % item for item in inp.strip("。").split("。")])
-
 
  def cut4(inp):
      inp = inp.strip("\n")
-     return "\n".join(["%s" % item for item in inp.strip(".").split(".")])
 
 
  # contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
  def cut5(inp):
-     # if not re.search(r'[^\w\s]', inp[-1]):
-     #     inp += '。'
      inp = inp.strip("\n")
-     punds = r'[,.;?!、,。?!;:…]'
-     items = re.split(f'({punds})', inp)
-     mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
-     if len(items)%2 == 1:
-         mergeitems.append(items[-1])
-     opt = "\n".join(mergeitems)
-     return opt
 
 
  def custom_sort_key(s):
@@ -589,312 +542,120 @@ def custom_sort_key(s):
      parts = [int(part) if part.isdigit() else part for part in parts]
      return parts
 
- # ==========custom functions============
-
- def tprint(text):
-     now = datetime.now(tz).strftime('%H:%M:%S')
-     print(f'UTC+8 - {now} - {text}')
-
- def wprint(text):
-     tprint(text)
-     gr.Warning(text)
-
- def lang_detector(text):
-     min_chars = 5
-     if len(text) < min_chars:
-         return "Input text too short/输入文本太短"
-     try:
-         detector = Detector(text).language
-         lang_info = str(detector)
-         code = re.search(r"name: (\w+)", lang_info).group(1)
-         if code == 'Japanese':
-             return "日本語"
-         elif code == 'Chinese':
-             return "中文"
-         elif code == 'English':
-             return 'English'
          else:
-             return code
-     except Exception as e:
-         return f"ERROR:{str(e)}"
-
- def trim_text(text, language):
-     limit_cj = 120  # characters
-     limit_en = 60  # words
-     search_limit_cj = limit_cj + 30
-     search_limit_en = limit_en + 30
-     text = text.replace('\n', '').strip()
-
-     if language == 'English':
-         words = text.split()
-         if len(words) <= limit_en:
-             return text
-         # English
-         for i in range(limit_en, -1, -1):
-             if any(punct in words[i] for punct in splits):
-                 return ' '.join(words[:i+1])
-         for i in range(limit_en, min(len(words), search_limit_en)):
-             if any(punct in words[i] for punct in splits):
-                 return ' '.join(words[:i+1])
-         return ' '.join(words[:limit_en])
-
-     else:  # Chinese/Japanese
-         if len(text) <= limit_cj:
-             return text
-         for i in range(limit_cj, -1, -1):
-             if text[i] in splits:
-                 return text[:i+1]
-         for i in range(limit_cj, min(len(text), search_limit_cj)):
-             if text[i] in splits:
-                 return text[:i+1]
-         return text[:limit_cj]
-
- def duration(audio_file_path):
-     if not audio_file_path:
-         wprint("Failed to obtain uploaded audio/未找到音频文件")
-         return False
-     try:
-         audio_duration = librosa.get_duration(filename=audio_file_path)
-         if not 3 < audio_duration < 10:
-             wprint("The audio length must be between 3~10 seconds/音频时长须在3~10秒之间")
-             return False
-         return True
-     except FileNotFoundError:
-         return False
-
- def update_model(choice):
-     global gpt_path, sovits_path
-     model_info = models[choice]
-     gpt_path = abs_path(model_info["gpt_weight"])
-     sovits_path = abs_path(model_info["sovits_weight"])
-     model_name = choice
-     tone_info = model_info["tones"]["tone1"]
-     tone_sample_path = abs_path(tone_info["sample"])
-     tprint(f'✅SELECT MODEL:{choice}')
-     # return the default tone "tone1"
-     return (
-         tone_info["example_voice_wav"],
-         tone_info["example_voice_wav_words"],
-         model_info["default_language"],
-         model_info["default_language"],
-         model_name,
-         "tone1",
-         tone_sample_path
-     )
 
- def update_tone(model_choice, tone_choice):
-     model_info = models[model_choice]
-     tone_info = model_info["tones"][tone_choice]
-     example_voice_wav = abs_path(tone_info["example_voice_wav"])
-     example_voice_wav_words = tone_info["example_voice_wav_words"]
-     tone_sample_path = abs_path(tone_info["sample"])
-     return example_voice_wav, example_voice_wav_words, tone_sample_path
-
- def transcribe(voice):
-     time1 = timer()
-     tprint('⚡Start Clone - transcribe')
-     task = "transcribe"
-     if voice is None:
-         wprint("No audio file submitted! Please upload or record an audio file before submitting your request.")
-     R = pipe(voice, batch_size=8, generate_kwargs={"task": task}, return_timestamps=True, return_language=True)
-     text = R['text']
-     lang = R['chunks'][0]['language']
-     if lang == 'english':
-         language = 'English'
-     elif lang == 'chinese':
-         language = '中文'
-     elif lang == 'japanese':
-         language = '日本語'
-
-     time2 = timer()
-     tprint(f'transcribe COMPLETE,{round(time2-time1,4)}s')
-     tprint(f'\nTRANSCRIBE RESULT:\n 🔣Language:{language} \n 🔣Text:{text}')
-     return text, language
-
- def clone_voice(user_voice, user_text, user_lang):
-     if not duration(user_voice):
-         return None
-     if user_text == '':
-         wprint("Please enter text to generate/请输入生成文字")
-         return None
-     user_text = trim_text(user_text, user_lang)
-     time1 = timer()
-     global gpt_path, sovits_path
-     gpt_path = abs_path("pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt")
-     # tprint(f'Model loaded:{gpt_path}')
-     sovits_path = abs_path("pretrained_models/s2G488k.pth")
-     # tprint(f'Model loaded:{sovits_path}')
-     try:
-         prompt_text, prompt_language = transcribe(user_voice)
-     except UnboundLocalError as e:
-         wprint(f"The language in the audio cannot be recognized :{str(e)}")
-         return None
-
-     output_wav = get_tts_wav(
-         user_voice,
-         prompt_text,
-         prompt_language,
-         user_text,
-         user_lang,
-         how_to_cut="Do not split",
-         volume_scale=1.0)
-     time2 = timer()
-     tprint(f'🆗CLONE COMPLETE,{round(time2-time1,4)}s')
-     return output_wav
-
- with open('dummy') as f:
-     dummy_txt = f.read().strip().splitlines()
-
- def dice():
-     return random.choice(dummy_txt), '🎲'
-
- from info import models
- models_by_language = {
-     "English": [],
-     "中文": [],
-     "日本語": []
- }
- for model_name, model_info in models.items():
-     language = model_info["default_language"]
-     models_by_language[language].append((model_name, model_info))
-
- ##########GRADIO###########
-
- with gr.Blocks(theme='Kasien/ali_theme_custom') as app:
-     gr.HTML('''
-     <h1 style="font-size: 25px;">TEXT TO SPEECH</h1>
-     <h1 style="font-size: 20px;">Support English/Chinese/Japanese</h1>
-     <p style="margin-bottom: 10px; font-size: 100%">
-     If you like this space, please click the ❤️ at the top of the page..如喜欢,请点一下页面顶部的❤️<br>
-     </p>''')
-
-     gr.Markdown("""* This space is based on the text-to-speech generation solution [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS) .
-     You can visit the repo's github homepage to learn training and inference.<br>
-     本空间基于文字转语音生成方案 [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS). 你可以前往项目的github主页学习如何推理和训练。
-     * ⚠️Generating voice is very slow due to using HuggingFace's free CPU in this space.
-     For faster generation, click the Colab icon below to use this space in Colab,
-     which will significantly improve the speed.<br>
-     由于本空间使用huggingface的免费CPU进行推理,因此速度很慢,如想快速生成,请点击下方的Colab图标,
-     前往Colab使用已获得更快的生成速度。
-     <br>Colabの使用を強くお勧めします。より速い生成速度が得られます。
-     * each model can speak three languages.<br>每个模型都能说三种语言<br>各モデルは3つの言語を話すことができます。""")
-     gr.HTML('''<a href="https://colab.research.google.com/drive/1fTuPZ4tZsAjS-TrhQWMCb7KRdnU8aF6j" target="_blank"><img src="https://camo.githubusercontent.com/dd83d4a334eab7ada034c13747d9e2237182826d32e3fda6629740b6e02f18d8/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6c61622d4639414230303f7374796c653d666f722d7468652d6261646765266c6f676f3d676f6f676c65636f6c616226636f6c6f723d353235323532" alt="colab"></a>
-     ''')
-
-     default_voice_wav, default_voice_wav_words, default_language, _, default_model_name, _, default_tone_sample_path = update_model("Trump")
-     english_models = [name for name, _ in models_by_language["English"]]
-     chinese_models = [name for name, _ in models_by_language["中文"]]
-     japanese_models = [name for name, _ in models_by_language["日本語"]]
-     with gr.Row():
-         english_choice = gr.Radio(english_models, label="EN", value="Trump", scale=3)
-         chinese_choice = gr.Radio(chinese_models, label="ZH", scale=2)
-         japanese_choice = gr.Radio(japanese_models, label="JA", scale=4)
-
-     plsh = '''
-     Support【English/中文/日本語】,Input text here / 在这輸入文字 /ここにテキストを入力する。
-
-     If you don't know what to input, you can click the dice on the right, and random text will appear.
-     如果你不知道输入什么,可以点击右边的骰子,会出现随机文本。
-     入力するものがわからない場合は、右側のサイコロをクリックすると、ランダムなテキストが表示されます。
-
-     '''
-     limit = 'Max 70 words. Excess will be ignored./单次最多处理120字左右,多余的会被忽略'
-
-     gr.HTML('''
-     <b>Input Text/输入文字</b>''')
-     with gr.Row():
-         with gr.Column(scale=2):
-             model_name = gr.Textbox(label="Selected Model/已选模型", value=default_model_name, interactive=False, scale=1,)
-             text_language = gr.Textbox(
-                 label="Language for input text/生成语言",
-                 info='Automatic detection of input language type.', scale=1, interactive=False
-             )
-         text = gr.Textbox(label="INPUT TEXT", lines=5, placeholder=plsh, info=limit, scale=10, min_width=0)
-         ddice = gr.Button('🎲', variant='tool', min_width=0, scale=0)
-
-         ddice.click(dice, outputs=[text, ddice])
-         text.change(lang_detector, text, text_language)
-
-     with gr.Row():
-         with gr.Column(scale=2):
-             tone_select = gr.Radio(
-                 label="Select Tone/选择语气",
-                 choices=["tone1", "tone2", "tone3"],
-                 value="tone1",
-                 info='Tone influences the emotional expression', scale=1)
-         tone_sample = gr.Audio(label="🔊Preview tone/试听语气", scale=8)
-
-     with gr.Accordion(label="prompt voice", open=False, visible=False):
-         with gr.Row(visible=True):
-             inp_ref = gr.Audio(label="Reference audio", type="filepath", value=default_voice_wav, scale=3)
-             prompt_text = gr.Textbox(label="Reference text", value=default_voice_wav_words, scale=3)
-             prompt_language = gr.Dropdown(label="Language of the reference audio", choices=["中文", "English", "日本語"], value=default_language, scale=1, interactive=False)
-             dummy = gr.Radio(choices=["中文", "English", "日本語"], visible=False)
-
-     with gr.Accordion(label="Additional generation options/附加生成选项", open=False):
-         how_to_cut = gr.Dropdown(
-             label=("How to split?"),
-             choices=[("Do not split"), ("Split into groups of 4 sentences"), ("Split every 50 characters"),
-                      ("Split at CN/JP periods (。)"), ("Split at English periods (.)"), ("Split at punctuation marks"), ],
-             value=("Split into groups of 4 sentences"),
-             interactive=True,
-             info='A suitable splitting method can achieve better generation results'
-         )
-         volume = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.01, label='Volume/音量')
-
-     gr.HTML('''
-     <b>Generate Voice/生成</b>''')
-     with gr.Row():
-         main_button = gr.Button("✨Generate Voice", variant="primary", scale=2)
-         output = gr.Audio(label="💾Download it by clicking ⬇️", scale=6)
-         # info = gr.Textbox(label="INFO", visible=True, readonly=True, scale=1)
-
-     gr.HTML('''
-     Generation is slower, please be patient and wait/合成比较慢,请耐心等待<br>
-     If it generated silence, please try again./如果生成了空白声音,请重试
-     <br><br><br><br>
-     <h1 style="font-size: 25px;">Clone custom Voice/克隆自定义声音</h1>
-     <p style="margin-bottom: 10px; font-size: 100%">
-     需要3~10秒语音,克隆后的声音和原音相似度80%以上<br>
-     Requires 3-10 seconds of voice input. The cloned voice will have a similarity of 80% or above compared to the original.<br>
-     3~10秒の音声入力が必要です。クローンされた音声は、オリジナルと80%以上の類似性があります。
-
-     </p>''')
-
-     with gr.Row():
-         user_voice = gr.Audio(type="filepath", label="(3~10s)Upload or Record audio/上传或录制声音", scale=3)
-         with gr.Column(scale=7):
-             user_lang = gr.Textbox(label="Language/生成语言", info='Automatic detection of input language type.', interactive=False)
-             with gr.Row():
-                 user_text = gr.Textbox(label="Text for generation/输入想要生成语音的文字", lines=5, placeholder=plsh, info=limit)
-                 dddice = gr.Button('🎲', variant='tool', min_width=0, scale=0)
-
-     dddice.click(dice, outputs=[user_text, dddice])
-     user_text.change(lang_detector, user_text, user_lang)
-
-     user_button = gr.Button("✨Clone Voice", variant="primary")
-     user_output = gr.Audio(label="💾Download it by clicking ⬇️")
-
-     gr.HTML('''<div align=center><img id="visitor-badge" alt="visitor badge" src="https://visitor-badge.laobi.icu/badge?page_id=Ailyth/DLMP9" /></div>''')
-
-     english_choice.change(update_model, inputs=[english_choice], outputs=[inp_ref, prompt_text, prompt_language, dummy, model_name, tone_select, tone_sample])
-     chinese_choice.change(update_model, inputs=[chinese_choice], outputs=[inp_ref, prompt_text, prompt_language, dummy, model_name, tone_select, tone_sample])
-     japanese_choice.change(update_model, inputs=[japanese_choice], outputs=[inp_ref, prompt_text, prompt_language, dummy, model_name, tone_select, tone_sample])
-     tone_select.change(update_tone, inputs=[model_name, tone_select], outputs=[inp_ref, prompt_text, tone_sample])
-
-     main_button.click(
-         get_tts_wav,
-         inputs=[inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, volume],
-         outputs=[output])
-
-     user_button.click(
-         clone_voice,
-         inputs=[user_voice, user_text, user_lang],
-         outputs=[user_output])
-
- app.launch(share=True, show_api=False).queue(api_open=False)
+ '''
+ Mixed Chinese/English recognition
+ Mixed Japanese/English recognition
+ Multilingual: split, then detect the language of each segment
+ Treat all input as Chinese
+ Treat all input as English
+ Treat all input as Japanese
+ '''
+ import os, re, logging
+ import LangSegment
  logging.getLogger("markdown_it").setLevel(logging.ERROR)
  logging.getLogger("urllib3").setLevel(logging.ERROR)
  logging.getLogger("httpcore").setLevel(logging.ERROR)
  logging.getLogger("asyncio").setLevel(logging.ERROR)
  logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
  logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
+ import pdb
+ import torch
+
+ if os.path.exists("./gweight.txt"):
+     with open("./gweight.txt", 'r', encoding="utf-8") as file:
+         gweight_data = file.read()
+         gpt_path = os.environ.get(
+             "gpt_path", gweight_data)
+ else:
+     gpt_path = os.environ.get(
+         "gpt_path", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt")
 
+ if os.path.exists("./sweight.txt"):
+     with open("./sweight.txt", 'r', encoding="utf-8") as file:
+         sweight_data = file.read()
+         sovits_path = os.environ.get("sovits_path", sweight_data)
+ else:
+     sovits_path = os.environ.get("sovits_path", "GPT_SoVITS/pretrained_models/s2G488k.pth")
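+ # Note: gweight.txt / sweight.txt, when present, supply the last-used checkpoint paths;
+ # the gpt_path / sovits_path environment variables still take precedence over both.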
+ # gpt_path = os.environ.get(
+ #     "gpt_path", "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+ # )
+ # sovits_path = os.environ.get("sovits_path", "pretrained_models/s2G488k.pth")
+ cnhubert_base_path = os.environ.get(
+     "cnhubert_base_path", "GPT_SoVITS/pretrained_models/chinese-hubert-base"
+ )
+ bert_path = os.environ.get(
+     "bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
+ )
+ infer_ttswebui = os.environ.get("infer_ttswebui", 9872)
+ infer_ttswebui = int(infer_ttswebui)
+ is_share = os.environ.get("is_share", "False")
+ is_share = eval(is_share)
  if "_CUDA_VISIBLE_DEVICES" in os.environ:
      os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
+ is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
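+ # fp16 inference is used only when is_half requests it AND a CUDA device is actually available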
+ punctuation = set(['!', '?', '…', ',', '.', '-', " "])
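+ # used below both to drop split segments that are punctuation-only and to collapse repeated marks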
+ import gradio as gr
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
+ import numpy as np
+ import librosa
+ from feature_extractor import cnhubert
+
  cnhubert.cnhubert_base_path = cnhubert_base_path
 
+ from module.models import SynthesizerTrn
+ from AR.models.t2s_lightning_module import Text2SemanticLightningModule
+ from text import cleaned_text_to_sequence
+ from text.cleaner import clean_text
+ from time import time as ttime
+ from module.mel_processing import spectrogram_torch
+ from tools.my_utils import load_audio
+ from tools.i18n.i18n import I18nAuto
 
+ i18n = I18nAuto()
 
+ # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'  # ensure this is also set when the inference UI is launched directly
 
+ if torch.cuda.is_available():
+     device = "cuda"
+ else:
+     device = "cpu"
 
  tokenizer = AutoTokenizer.from_pretrained(bert_path)
  bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
  dict_language = {
+     i18n("中文"): "all_zh",  # treat all input as Chinese
+     i18n("英文"): "en",  # treat all input as English (unchanged)
+     i18n("日文"): "all_ja",  # treat all input as Japanese
+     i18n("中英混合"): "zh",  # mixed Chinese/English recognition (unchanged)
+     i18n("日英混合"): "ja",  # mixed Japanese/English recognition (unchanged)
+     i18n("多语种混合"): "auto",  # multilingual: split, then detect the language of each segment
  }
 
 
  def clean_text_inf(text, language):
+     phones, word2ph, norm_text = clean_text(text, language)
      phones = cleaned_text_to_sequence(phones)
      return phones, word2ph, norm_text
 
      return bert
 
 
  splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", }
 
 
      return text
 
 
+ def get_phones_and_bert(text,language):
      if language in {"en","all_zh","all_ja"}:
+         language = language.replace("all_","")
+         if language == "en":
+             LangSegment.setfilters(["en"])
+             formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text))
+         else:
+             # Chinese and Japanese kanji cannot be told apart, so trust the user's language choice
+             formattext = text
+         while "  " in formattext:
+             formattext = formattext.replace("  ", " ")
+         phones, word2ph, norm_text = clean_text_inf(formattext, language)
+         if language == "zh":
+             bert = get_bert_feature(norm_text, word2ph).to(device)
+         else:
+             bert = torch.zeros(
+                 (1024, len(phones)),
+                 dtype=torch.float16 if is_half == True else torch.float32,
+             ).to(device)
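+         # non-Chinese text gets an all-zero (1024, n_phones) BERT placeholder so later concatenation still lines up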
      elif language in {"zh", "ja","auto"}:
+         textlist = []
+         langlist = []
+         LangSegment.setfilters(["zh", "ja", "en", "ko"])
+         if language == "auto":
+             for tmp in LangSegment.getTexts(text):
+                 if tmp["lang"] == "ko":
+                     langlist.append("zh")
+                     textlist.append(tmp["text"])
+                 else:
+                     langlist.append(tmp["lang"])
+                     textlist.append(tmp["text"])
+         else:
+             for tmp in LangSegment.getTexts(text):
+                 if tmp["lang"] == "en":
+                     langlist.append(tmp["lang"])
+                 else:
+                     # Chinese and Japanese kanji cannot be told apart, so trust the user's language choice
+                     langlist.append(language)
+                 textlist.append(tmp["text"])
+         print(textlist)
+         print(langlist)
+         phones_list = []
+         bert_list = []
+         norm_text_list = []
+         for i in range(len(textlist)):
+             lang = langlist[i]
+             phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
+             bert = get_bert_inf(phones, word2ph, norm_text, lang)
+             phones_list.append(phones)
+             norm_text_list.append(norm_text)
+             bert_list.append(bert)
+         bert = torch.cat(bert_list, dim=1)
+         phones = sum(phones_list, [])
+         norm_text = ''.join(norm_text_list)
+
+     return phones, bert.to(dtype), norm_text
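+ # get_phones_and_bert thus flattens mixed-language input into one phone sequence plus one concatenated BERT feature matrix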
 
 
  def merge_short_text_in_array(texts, threshold):
      if (len(texts)) < 2:
          result[len(result) - 1] += text
      return result
 
+ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free=False):
+     if prompt_text is None or len(prompt_text) == 0:
+         ref_free = True
      t0 = ttime()
      prompt_language = dict_language[prompt_language]
+     text_language = dict_language[text_language]
+     if not ref_free:
+         prompt_text = prompt_text.strip("\n")
+         if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "."
+         print(i18n("实际输入的参考文本:"), prompt_text)
      text = text.strip("\n")
+     text = replace_consecutive_punctuation(text)
      if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text
+
+     print(i18n("实际输入的目标文本:"), text)
      zero_wav = np.zeros(
          int(hps.data.sampling_rate * 0.3),
          dtype=np.float16 if is_half == True else np.float32,
      )
+     if not ref_free:
+         with torch.no_grad():
+             wav16k, sr = librosa.load(ref_wav_path, sr=16000)
+             if (wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000):
+                 raise OSError(i18n("参考音频在3~10秒范围外,请更换!"))
+             wav16k = torch.from_numpy(wav16k)
+             zero_wav_torch = torch.from_numpy(zero_wav)
+             if is_half == True:
+                 wav16k = wav16k.half().to(device)
+                 zero_wav_torch = zero_wav_torch.half().to(device)
+             else:
+                 wav16k = wav16k.to(device)
+                 zero_wav_torch = zero_wav_torch.to(device)
+             wav16k = torch.cat([wav16k, zero_wav_torch])
+             ssl_content = ssl_model.model(wav16k.unsqueeze(0))[
+                 "last_hidden_state"
+             ].transpose(
+                 1, 2
+             )  # .float()
+             codes = vq_model.extract_latent(ssl_content)
+             prompt_semantic = codes[0, 0]
+             prompt = prompt_semantic.unsqueeze(0).to(device)
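+             # the reference audio's discrete semantic codes become the prompt that conditions the GPT stage below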
 
+     t1 = ttime()
 
+     if (how_to_cut == i18n("凑四句一切")):
          text = cut1(text)
+     elif (how_to_cut == i18n("凑50字一切")):
          text = cut2(text)
+     elif (how_to_cut == i18n("按中文句号。切")):
          text = cut3(text)
+     elif (how_to_cut == i18n("按英文句号.切")):
          text = cut4(text)
+     elif (how_to_cut == i18n("按标点符号切")):
          text = cut5(text)
      while "\n\n" in text:
          text = text.replace("\n\n", "\n")
+     print(i18n("实际输入的目标文本(切句后):"), text)
      texts = text.split("\n")
+     texts = process_text(texts)
      texts = merge_short_text_in_array(texts, 5)
      audio_opt = []
+     if not ref_free:
+         phones1, bert1, norm_text1 = get_phones_and_bert(prompt_text, prompt_language)
 
      for text in texts:
+         # skip blank lines in the target text so they don't raise errors
          if (len(text.strip()) == 0):
              continue
          if (text[-1] not in splits): text += "。" if text_language != "en" else "."
+         print(i18n("实际输入的目标文本(每句):"), text)
+         phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language)
+         print(i18n("前端处理后的文本(每句):"), norm_text2)
+         if not ref_free:
+             bert = torch.cat([bert1, bert2], 1)
+             all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
+         else:
+             bert = bert2
+             all_phoneme_ids = torch.LongTensor(phones2).to(device).unsqueeze(0)
 
          bert = bert.to(device).unsqueeze(0)
          all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
+
          t2 = ttime()
          with torch.no_grad():
              # pred_semantic = t2s_model.model.infer(
              pred_semantic, idx = t2s_model.model.infer_panel(
                  all_phoneme_ids,
                  all_phoneme_len,
+                 None if ref_free else prompt,
                  bert,
                  # prompt_phone_len=ph_offset,
+                 top_k=top_k,
+                 top_p=top_p,
+                 temperature=temperature,
                  early_stop_num=hz * max_sec,
              )
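+             # top_k / top_p / temperature are forwarded straight from the UI sliders to GPT sampling;
+             # in ref-free mode the semantic prompt is omitted entirely (None)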
          t3 = ttime()
          else:
              refer = refer.to(device)
          # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
+         audio = (
              vq_model.decode(
                  pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer
              )
              .detach()
              .cpu()
              .numpy()[0, 0]
+         )  # try reconstructing without including the prompt part
+         max_audio = np.abs(audio).max()  # simple guard against 16-bit clipping
          if max_audio > 1: audio /= max_audio
          audio_opt.append(audio)
          audio_opt.append(zero_wav)
          t4 = ttime()
      print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
+     yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(
+         np.int16
+     )
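+     # yielding a (sample_rate, int16 ndarray) tuple is the format Gradio's Audio output component accepts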
+
 
  def split(todo_text):
      todo_text = todo_text.replace("……", "。").replace("——", ",")
      todo_texts = []
      while 1:
          if i_split_head >= len_text:
+             break  # the text always ends with punctuation, so just break; the final segment was appended in the previous pass
          if todo_text[i_split_head] in splits:
              i_split_head += 1
              todo_texts.append(todo_text[i_split_tail:i_split_head])
 
              opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]]))
      else:
          opts = [inp]
+     opts = [item for item in opts if not set(item).issubset(punctuation)]
      return "\n".join(opts)
 
 
          if tmp_str != "":
              opts.append(tmp_str)
      # print(opts)
+     if len(opts) > 1 and len(opts[-1]) < 50:  # if the last piece is too short, merge it with the previous one
          opts[-2] = opts[-2] + opts[-1]
          opts = opts[:-1]
+     opts = [item for item in opts if not set(item).issubset(punctuation)]
      return "\n".join(opts)
 
  def cut3(inp):
      inp = inp.strip("\n")
+     opts = ["%s" % item for item in inp.strip("。").split("。")]
+     opts = [item for item in opts if not set(item).issubset(punctuation)]
+     return "\n".join(opts)
 
  def cut4(inp):
      inp = inp.strip("\n")
+     opts = ["%s" % item for item in inp.strip(".").split(".")]
+     opts = [item for item in opts if not set(item).issubset(punctuation)]
+     return "\n".join(opts)
 
 
  # contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
  def cut5(inp):
      inp = inp.strip("\n")
+     punds = {',', '.', ';', '?', '!', '、', ',', '。', '?', '!', ';', ':', '…'}
+     mergeitems = []
+     items = []
+
+     for i, char in enumerate(inp):
+         if char in punds:
+             if char == '.' and i > 0 and i < len(inp) - 1 and inp[i - 1].isdigit() and inp[i + 1].isdigit():
+                 items.append(char)
+             else:
+                 items.append(char)
+                 mergeitems.append("".join(items))
+                 items = []
+         else:
+             items.append(char)
 
+     if items:
+         mergeitems.append("".join(items))
+
+     opt = [item for item in mergeitems if not set(item).issubset(punds)]
+     return "\n".join(opt)
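+ # note: the isdigit() guard in cut5 keeps decimal numbers such as "3.14" intact instead of splitting at the dot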
 
 
  def custom_sort_key(s):
      parts = [int(part) if part.isdigit() else part for part in parts]
      return parts
 
+ def process_text(texts):
+     _text = []
+     if all(text in [None, " ", "\n", ""] for text in texts):
+         raise ValueError(i18n("请输入有效文本"))
+     for text in texts:
+         if text in [None, " ", ""]:
+             pass
          else:
+             _text.append(text)
+     return _text
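+ # process_text drops empty or whitespace-only lines and raises only when nothing usable remains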
 
 
+ def replace_consecutive_punctuation(text):
+     punctuations = ''.join(re.escape(p) for p in punctuation)
+     pattern = f'([{punctuations}])([{punctuations}])+'
+     result = re.sub(pattern, r'\1', text)
+     return result
 
 
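+ # collapses runs such as "!!" or "?!" down to their first mark before the text is split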
+ def change_choices():
+     SoVITS_names, GPT_names = get_weights_names()
+     return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names, key=custom_sort_key), "__type__": "update"}
 
 
+ pretrained_sovits_name = "GPT_SoVITS/pretrained_models/s2G488k.pth"
+ pretrained_gpt_name = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+ SoVITS_weight_root = "SoVITS_weights"
+ GPT_weight_root = "GPT_weights"
+ os.makedirs(SoVITS_weight_root, exist_ok=True)
+ os.makedirs(GPT_weight_root, exist_ok=True)
+
+
+ def get_weights_names():
+     SoVITS_names = [pretrained_sovits_name]
+     for name in os.listdir(SoVITS_weight_root):
+         if name.endswith(".pth"): SoVITS_names.append("%s/%s" % (SoVITS_weight_root, name))
+     GPT_names = [pretrained_gpt_name]
+     for name in os.listdir(GPT_weight_root):
+         if name.endswith(".ckpt"): GPT_names.append("%s/%s" % (GPT_weight_root, name))
+     return SoVITS_names, GPT_names
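+ # the model dropdowns always list the pretrained defaults plus any user-trained weights found in the two folders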
+
+ SoVITS_names, GPT_names = get_weights_names()
+
+ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
+     gr.Markdown(
+         value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.")
+     )
+     with gr.Group():
+         gr.Markdown(value=i18n("模型切换"))
+         with gr.Row():
+             GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True)
+             SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True)
+             refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
+             refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
+             SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown], [])
+             GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
+         gr.Markdown(value=i18n("*请上传并填写参考信息"))
+         with gr.Row():
+             inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath")
+             with gr.Column():
+                 ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True)
+                 gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。"))
+             prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="")
+             prompt_language = gr.Dropdown(
+                 label=i18n("参考音频的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文")
+             )
+         gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式"))
+         with gr.Row():
+             text = gr.Textbox(label=i18n("需要合成的文本"), value="")
+             text_language = gr.Dropdown(
+                 label=i18n("需要合成的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文")
+             )
+             how_to_cut = gr.Radio(
+                 label=i18n("怎么切"),
+                 choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ],
+                 value=i18n("凑四句一切"),
+                 interactive=True,
+             )
+             with gr.Row():
+                 gr.Markdown(value=i18n("gpt采样参数(无参考文本时不要太低):"))
+                 top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=5, interactive=True)
+                 top_p = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True)
+                 temperature = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True)
+             inference_button = gr.Button(i18n("合成语音"), variant="primary")
+             output = gr.Audio(label=i18n("输出的语音"))
+
+         inference_button.click(
+             get_tts_wav,
+             [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free],
+             [output],
+         )
+
+         gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
+         with gr.Row():
+             text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="")
+             button1 = gr.Button(i18n("凑四句一切"), variant="primary")
+             button2 = gr.Button(i18n("凑50字一切"), variant="primary")
+             button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
+             button4 = gr.Button(i18n("按英文句号.切"), variant="primary")
+             button5 = gr.Button(i18n("按标点符号切"), variant="primary")
+             text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
+             button1.click(cut1, [text_inp], [text_opt])
+             button2.click(cut2, [text_inp], [text_opt])
+             button3.click(cut3, [text_inp], [text_opt])
+             button4.click(cut4, [text_inp], [text_opt])
+             button5.click(cut5, [text_inp], [text_opt])
+         gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。"))
+
+ if __name__ == '__main__':
+     app.queue(concurrency_count=511, max_size=1022).launch(
+         server_name="0.0.0.0",
+         inbrowser=True,
+         share=is_share,
+         server_port=infer_ttswebui,
+         quiet=True,
+     )