0220-152614-some_fix
Browse files- app.py +38 -31
- text/chinese.py +2 -2
- text/english.py +4 -4
app.py
CHANGED
@@ -1,13 +1,3 @@
|
|
1 |
-
import logging
|
2 |
-
logging.getLogger("markdown_it").setLevel(logging.ERROR)
|
3 |
-
logging.getLogger("urllib3").setLevel(logging.ERROR)
|
4 |
-
logging.getLogger("httpcore").setLevel(logging.ERROR)
|
5 |
-
logging.getLogger("httpx").setLevel(logging.ERROR)
|
6 |
-
logging.getLogger("asyncio").setLevel(logging.ERROR)
|
7 |
-
logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
|
8 |
-
logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
|
9 |
-
logging.getLogger("multipart").setLevel(logging.WARNING)
|
10 |
-
|
11 |
import gradio as gr
|
12 |
import numpy as np
|
13 |
import soundfile as sf
|
@@ -26,6 +16,18 @@ from transformers.pipelines.audio_utils import ffmpeg_read
|
|
26 |
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
27 |
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
if "_CUDA_VISIBLE_DEVICES" in os.environ:
|
30 |
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
|
31 |
tz = pytz.timezone('Asia/Singapore')
|
@@ -365,9 +367,9 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
|
|
365 |
startTime=timer()
|
366 |
text=trim_text(text,text_language)
|
367 |
change_sovits_weights(sovits_path)
|
368 |
-
tprint(f'
|
369 |
change_gpt_weights(gpt_path)
|
370 |
-
tprint(f'
|
371 |
|
372 |
prompt_language = dict_language[prompt_language]
|
373 |
text_language = dict_language[text_language]
|
@@ -375,8 +377,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
|
|
375 |
if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "."
|
376 |
text = text.strip("\n")
|
377 |
if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text
|
378 |
-
print(("实际输入的参考文本:"), prompt_text)
|
379 |
-
print(("📝实际输入的目标文本:"), text)
|
380 |
zero_wav = np.zeros(
|
381 |
int(hps.data.sampling_rate * 0.3),
|
382 |
dtype=np.float16 if is_half == True else np.float32,
|
@@ -418,7 +420,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
|
|
418 |
text = cut5(text)
|
419 |
while "\n\n" in text:
|
420 |
text = text.replace("\n\n", "\n")
|
421 |
-
print(
|
422 |
texts = text.split("\n")
|
423 |
texts = merge_short_text_in_array(texts, 5)
|
424 |
audio_opt = []
|
@@ -428,7 +430,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
|
|
428 |
if (len(text.strip()) == 0):
|
429 |
continue
|
430 |
if (text[-1] not in splits): text += "。" if text_language != "en" else "."
|
431 |
-
print(("
|
432 |
phones2, word2ph2, norm_text2 = get_cleaned_text_final(text, text_language)
|
433 |
bert2 = get_bert_final(phones2, word2ph2, norm_text2, text_language, device).to(dtype)
|
434 |
bert = torch.cat([bert1, bert2], 1)
|
@@ -561,13 +563,16 @@ def cut5(inp):
|
|
561 |
# if not re.search(r'[^\w\s]', inp[-1]):
|
562 |
# inp += '。'
|
563 |
inp = inp.strip("\n")
|
564 |
-
punds = r'[
|
565 |
items = re.split(f'({punds})', inp)
|
566 |
-
|
567 |
-
|
|
|
|
|
568 |
return opt
|
569 |
|
570 |
|
|
|
571 |
def custom_sort_key(s):
|
572 |
# 使用正则表达式提取字符串中的数字部分和非数字部分
|
573 |
parts = re.split('(\d+)', s)
|
@@ -580,7 +585,7 @@ def tprint(text):
|
|
580 |
print(f'UTC+8 - {now} - {text}')
|
581 |
|
582 |
def wprint(text):
|
583 |
-
|
584 |
gr.Warning(text)
|
585 |
|
586 |
#裁切文本
|
@@ -589,11 +594,13 @@ def trim_text(text,language):
|
|
589 |
limit_en = 60 #words
|
590 |
search_limit_cj = limit_cj+30
|
591 |
search_limit_en = limit_en +30
|
|
|
|
|
592 |
if language =='English':
|
593 |
words = text.split()
|
594 |
if len(words) <= limit_en:
|
595 |
return text
|
596 |
-
#
|
597 |
for i in range(limit_en, -1, -1):
|
598 |
if any(punct in words[i] for punct in splits):
|
599 |
return ' '.join(words[:i+1])
|
@@ -605,13 +612,13 @@ def trim_text(text,language):
|
|
605 |
else:#中文日文
|
606 |
if len(text) <= limit_cj:
|
607 |
return text
|
608 |
-
for i in range(limit_cj, -1, -1):
|
609 |
if text[i] in splits:
|
610 |
return text[:i+1]
|
611 |
-
for i in range(limit_cj, min(len(text), search_limit_cj)):
|
612 |
if text[i] in splits:
|
613 |
return text[:i+1]
|
614 |
-
return text[:limit_cj]
|
615 |
|
616 |
def duration(audio_file_path):
|
617 |
try:
|
@@ -670,7 +677,7 @@ def transcribe(voice):
|
|
670 |
|
671 |
time2=timer()
|
672 |
tprint(f'transcribe COMPLETE,{round(time2-time1,4)}s')
|
673 |
-
tprint(f'\n
|
674 |
return text,language
|
675 |
|
676 |
def clone_voice(user_voice,user_text,user_lang):
|
@@ -679,7 +686,7 @@ def clone_voice(user_voice,user_text,user_lang):
|
|
679 |
if user_text == '':
|
680 |
wprint("Please enter text to generate/请输入生成文字")
|
681 |
return None
|
682 |
-
tprint('⚡Start clone')
|
683 |
user_text=trim_text(user_text,user_lang)
|
684 |
time1=timer()
|
685 |
global gpt_path, sovits_path
|
@@ -736,9 +743,9 @@ with gr.Blocks(theme='Kasien/ali_theme_custom') as app:
|
|
736 |
chinese_models = [name for name, _ in models_by_language["中文"]]
|
737 |
japanese_models = [name for name, _ in models_by_language["日本語"]]
|
738 |
with gr.Row():
|
739 |
-
english_choice = gr.Radio(english_models, label="EN|English Model",value="Trump")
|
740 |
-
chinese_choice = gr.Radio(chinese_models, label="CN|中文模型")
|
741 |
-
japanese_choice = gr.Radio(japanese_models, label="JP|日本語モデル")
|
742 |
|
743 |
plsh='Text must match the selected language option to prevent errors, for example, if English is input but Chinese is selected for generation.\n文字一定要和语言选项匹配,不然要报错,比如输入的是英文,生成语言选中文'
|
744 |
limit='Max 70 words. Excess will be ignored./单次最多处理120字左右,多余的会被忽略'
|
@@ -784,7 +791,7 @@ with gr.Blocks(theme='Kasien/ali_theme_custom') as app:
|
|
784 |
interactive=True,
|
785 |
info='A suitable splitting method can achieve better generation results'
|
786 |
)
|
787 |
-
volume = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.01, label='Volume')
|
788 |
|
789 |
|
790 |
|
@@ -809,7 +816,7 @@ with gr.Blocks(theme='Kasien/ali_theme_custom') as app:
|
|
809 |
placeholder=plsh,info=limit)
|
810 |
|
811 |
user_button = gr.Button("✨Clone Voice", variant="primary")
|
812 |
-
user_output = gr.Audio(label="💾
|
813 |
|
814 |
gr.HTML('''<div align=center><img id="visitor-badge" alt="visitor badge" src="https://visitor-badge.laobi.icu/badge?page_id=Ailyth/DLMP9" /></div>''')
|
815 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import numpy as np
|
3 |
import soundfile as sf
|
|
|
16 |
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
17 |
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
|
18 |
|
19 |
+
|
20 |
+
import logging
|
21 |
+
logging.getLogger("markdown_it").setLevel(logging.ERROR)
|
22 |
+
logging.getLogger("urllib3").setLevel(logging.ERROR)
|
23 |
+
logging.getLogger("httpcore").setLevel(logging.ERROR)
|
24 |
+
logging.getLogger("httpx").setLevel(logging.ERROR)
|
25 |
+
logging.getLogger("asyncio").setLevel(logging.ERROR)
|
26 |
+
logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
|
27 |
+
logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
|
28 |
+
logging.getLogger("multipart").setLevel(logging.WARNING)
|
29 |
+
|
30 |
+
|
31 |
if "_CUDA_VISIBLE_DEVICES" in os.environ:
|
32 |
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
|
33 |
tz = pytz.timezone('Asia/Singapore')
|
|
|
367 |
startTime=timer()
|
368 |
text=trim_text(text,text_language)
|
369 |
change_sovits_weights(sovits_path)
|
370 |
+
tprint(f'🏕️LOADED SoVITS Model: {sovits_path}')
|
371 |
change_gpt_weights(gpt_path)
|
372 |
+
tprint(f'🏕️LOADED GPT Model: {gpt_path}')
|
373 |
|
374 |
prompt_language = dict_language[prompt_language]
|
375 |
text_language = dict_language[text_language]
|
|
|
377 |
if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "."
|
378 |
text = text.strip("\n")
|
379 |
if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text
|
380 |
+
#print(("实际输入的参考文本:"), prompt_text)
|
381 |
+
#print(("📝实际输入的目标文本:"), text)
|
382 |
zero_wav = np.zeros(
|
383 |
int(hps.data.sampling_rate * 0.3),
|
384 |
dtype=np.float16 if is_half == True else np.float32,
|
|
|
420 |
text = cut5(text)
|
421 |
while "\n\n" in text:
|
422 |
text = text.replace("\n\n", "\n")
|
423 |
+
print(f"🧨实际输入的目标文本(切句后):{text}\n")
|
424 |
texts = text.split("\n")
|
425 |
texts = merge_short_text_in_array(texts, 5)
|
426 |
audio_opt = []
|
|
|
430 |
if (len(text.strip()) == 0):
|
431 |
continue
|
432 |
if (text[-1] not in splits): text += "。" if text_language != "en" else "."
|
433 |
+
print(("\n🎈实际输入的目标文本(每句):"), text)
|
434 |
phones2, word2ph2, norm_text2 = get_cleaned_text_final(text, text_language)
|
435 |
bert2 = get_bert_final(phones2, word2ph2, norm_text2, text_language, device).to(dtype)
|
436 |
bert = torch.cat([bert1, bert2], 1)
|
|
|
563 |
# if not re.search(r'[^\w\s]', inp[-1]):
|
564 |
# inp += '。'
|
565 |
inp = inp.strip("\n")
|
566 |
+
punds = r'[,.;?!、,。?!;:…]'
|
567 |
items = re.split(f'({punds})', inp)
|
568 |
+
mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
|
569 |
+
if len(items)%2 == 1:
|
570 |
+
mergeitems.append(items[-1])
|
571 |
+
opt = "\n".join(mergeitems)
|
572 |
return opt
|
573 |
|
574 |
|
575 |
+
|
576 |
def custom_sort_key(s):
|
577 |
# 使用正则表达式提取字符串中的数字部分和非数字部分
|
578 |
parts = re.split('(\d+)', s)
|
|
|
585 |
print(f'UTC+8 - {now} - {text}')
|
586 |
|
587 |
def wprint(text):
|
588 |
+
tprint(text)
|
589 |
gr.Warning(text)
|
590 |
|
591 |
#裁切文本
|
|
|
594 |
limit_en = 60 #words
|
595 |
search_limit_cj = limit_cj+30
|
596 |
search_limit_en = limit_en +30
|
597 |
+
text = text.replace('\n', '').strip()
|
598 |
+
|
599 |
if language =='English':
|
600 |
words = text.split()
|
601 |
if len(words) <= limit_en:
|
602 |
return text
|
603 |
+
# English
|
604 |
for i in range(limit_en, -1, -1):
|
605 |
if any(punct in words[i] for punct in splits):
|
606 |
return ' '.join(words[:i+1])
|
|
|
612 |
else:#中文日文
|
613 |
if len(text) <= limit_cj:
|
614 |
return text
|
615 |
+
for i in range(limit_cj, -1, -1):
|
616 |
if text[i] in splits:
|
617 |
return text[:i+1]
|
618 |
+
for i in range(limit_cj, min(len(text), search_limit_cj)):
|
619 |
if text[i] in splits:
|
620 |
return text[:i+1]
|
621 |
+
return text[:limit_cj]
|
622 |
|
623 |
def duration(audio_file_path):
|
624 |
try:
|
|
|
677 |
|
678 |
time2=timer()
|
679 |
tprint(f'transcribe COMPLETE,{round(time2-time1,4)}s')
|
680 |
+
tprint(f'\n🔣转录结果:\n 🔣Language:{language} \n 🔣Text:{text}' )
|
681 |
return text,language
|
682 |
|
683 |
def clone_voice(user_voice,user_text,user_lang):
|
|
|
686 |
if user_text == '':
|
687 |
wprint("Please enter text to generate/请输入生成文字")
|
688 |
return None
|
689 |
+
#tprint('⚡Start clone')
|
690 |
user_text=trim_text(user_text,user_lang)
|
691 |
time1=timer()
|
692 |
global gpt_path, sovits_path
|
|
|
743 |
chinese_models = [name for name, _ in models_by_language["中文"]]
|
744 |
japanese_models = [name for name, _ in models_by_language["日本語"]]
|
745 |
with gr.Row():
|
746 |
+
english_choice = gr.Radio(english_models, label="EN|English Model",value="Trump",scale=3)
|
747 |
+
chinese_choice = gr.Radio(chinese_models, label="CN|中文模型",scale=2)
|
748 |
+
japanese_choice = gr.Radio(japanese_models, label="JP|日本語モデル",scale=4)
|
749 |
|
750 |
plsh='Text must match the selected language option to prevent errors, for example, if English is input but Chinese is selected for generation.\n文字一定要和语言选项匹配,不然要报错,比如输入的是英文,生成语言选中文'
|
751 |
limit='Max 70 words. Excess will be ignored./单次最多处理120字左右,多余的会被忽略'
|
|
|
791 |
interactive=True,
|
792 |
info='A suitable splitting method can achieve better generation results'
|
793 |
)
|
794 |
+
volume = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.01, label='Volume/音量')
|
795 |
|
796 |
|
797 |
|
|
|
816 |
placeholder=plsh,info=limit)
|
817 |
|
818 |
user_button = gr.Button("✨Clone Voice", variant="primary")
|
819 |
+
user_output = gr.Audio(label="💾Download it by clicking ⬇️")
|
820 |
|
821 |
gr.HTML('''<div align=center><img id="visitor-badge" alt="visitor badge" src="https://visitor-badge.laobi.icu/badge?page_id=Ailyth/DLMP9" /></div>''')
|
822 |
|
text/chinese.py
CHANGED
@@ -30,7 +30,7 @@ rep_map = {
|
|
30 |
"\n": ".",
|
31 |
"·": ",",
|
32 |
"、": ",",
|
33 |
-
"...": "…",
|
34 |
"$": ".",
|
35 |
"/": ",",
|
36 |
"—": "-",
|
@@ -169,4 +169,4 @@ if __name__ == "__main__":
|
|
169 |
|
170 |
# # 示例用法
|
171 |
# text = "这是一个示例文本:,你好!这是一个测试..."
|
172 |
-
# print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试
|
|
|
30 |
"\n": ".",
|
31 |
"·": ",",
|
32 |
"、": ",",
|
33 |
+
# "...": "…",
|
34 |
"$": ".",
|
35 |
"/": ",",
|
36 |
"—": "-",
|
|
|
169 |
|
170 |
# # 示例用法
|
171 |
# text = "这是一个示例文本:,你好!这是一个测试..."
|
172 |
+
# print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试
|
text/english.py
CHANGED
@@ -169,9 +169,9 @@ def read_dict_new():
|
|
169 |
line = line.strip()
|
170 |
word_split = line.split(" ")
|
171 |
word = word_split[0]
|
172 |
-
if word not in g2p_dict:
|
173 |
-
|
174 |
-
|
175 |
|
176 |
line_index = line_index + 1
|
177 |
line = f.readline()
|
@@ -231,4 +231,4 @@ if __name__ == "__main__":
|
|
231 |
# for group in syllables:
|
232 |
# for ph in group:
|
233 |
# all_phones.add(ph)
|
234 |
-
# print(all_phones)
|
|
|
169 |
line = line.strip()
|
170 |
word_split = line.split(" ")
|
171 |
word = word_split[0]
|
172 |
+
#if word not in g2p_dict:
|
173 |
+
g2p_dict[word] = []
|
174 |
+
g2p_dict[word].append(word_split[1:])
|
175 |
|
176 |
line_index = line_index + 1
|
177 |
line = f.readline()
|
|
|
231 |
# for group in syllables:
|
232 |
# for ph in group:
|
233 |
# all_phones.add(ph)
|
234 |
+
# print(all_phones)
|