Files changed (1)
  1. app.py +328 -567
app.py CHANGED
@@ -1,24 +1,13 @@
- import gradio as gr
- import numpy as np
- import soundfile as sf
- from datetime import datetime
- from time import time as ttime
- from my_utils import load_audio
- from transformers import pipeline
- from text.cleaner import clean_text
- from polyglot.detect import Detector
- from feature_extractor import cnhubert
- from timeit import default_timer as timer
- from text import cleaned_text_to_sequence
- from module.models import SynthesizerTrn
- from module.mel_processing import spectrogram_torch
- from transformers.pipelines.audio_utils import ffmpeg_read
- import os,re,sys,LangSegment,librosa,pdb,torch,pytz,random
- from transformers import AutoModelForMaskedLM, AutoTokenizer
- from AR.models.t2s_lightning_module import Text2SemanticLightningModule
-
-
- import logging
  logging.getLogger("markdown_it").setLevel(logging.ERROR)
  logging.getLogger("urllib3").setLevel(logging.ERROR)
  logging.getLogger("httpcore").setLevel(logging.ERROR)
@@ -26,43 +15,67 @@ logging.getLogger("httpx").setLevel(logging.ERROR)
  logging.getLogger("asyncio").setLevel(logging.ERROR)
  logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
  logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
- logging.getLogger("multipart").setLevel(logging.WARNING)
- from download import *
- download()
  if "_CUDA_VISIBLE_DEVICES" in os.environ:
      os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
- tz = pytz.timezone('Asia/Singapore')
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- def abs_path(dir):
-     global_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
-     return(os.path.join(global_dir, dir))
- gpt_path = abs_path("MODELS/22/22.ckpt")
- sovits_path = abs_path("MODELS/22/22.pth")
- cnhubert_base_path = os.environ.get("cnhubert_base_path", "pretrained_models/chinese-hubert-base")
- bert_path = os.environ.get("bert_path", "pretrained_models/chinese-roberta-wwm-ext-large")
-
- if not os.path.exists(cnhubert_base_path):
-     cnhubert_base_path = "TencentGameMate/chinese-hubert-base"
- if not os.path.exists(bert_path):
-     bert_path = "hfl/chinese-roberta-wwm-ext-large"
  cnhubert.cnhubert_base_path = cnhubert_base_path
 
- whisper_path = os.environ.get("whisper_path", "pretrained_models/whisper-tiny")
- if not os.path.exists(whisper_path):
-     whisper_path = "openai/whisper-tiny"
 
- pipe = pipeline(
-     task="automatic-speech-recognition",
-     model=whisper_path,
-     chunk_length_s=30,
-     device=device,)
 
- is_half = eval(
-     os.environ.get("is_half", "True" if torch.cuda.is_available() else "False")
- )
 
  tokenizer = AutoTokenizer.from_pretrained(bert_path)
  bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
@@ -187,63 +200,17 @@ def get_spepc(hps, filename):
 
 
  dict_language = {
-     ("中文1"): "all_zh",  # treat all input as Chinese
-     ("English"): "en",  # treat all input as English (unchanged)
-     ("日文1"): "all_ja",  # treat all input as Japanese
-     ("中文"): "zh",  # mixed Chinese/English recognition (unchanged)
-     ("日本語"): "ja",  # mixed Japanese/English recognition (unchanged)
-     ("混合"): "auto",  # multilingual: split, then detect the language of each segment
  }
 
 
- def splite_en_inf(sentence, language):
-     pattern = re.compile(r'[a-zA-Z ]+')
-     textlist = []
-     langlist = []
-     pos = 0
-     for match in pattern.finditer(sentence):
-         start, end = match.span()
-         if start > pos:
-             textlist.append(sentence[pos:start])
-             langlist.append(language)
-         textlist.append(sentence[start:end])
-         langlist.append("en")
-         pos = end
-     if pos < len(sentence):
-         textlist.append(sentence[pos:])
-         langlist.append(language)
-     # Merge punctuation into previous word
-     for i in range(len(textlist)-1, 0, -1):
-         if re.match(r'^[\W_]+$', textlist[i]):
-             textlist[i-1] += textlist[i]
-             del textlist[i]
-             del langlist[i]
-     # Merge consecutive words with the same language tag
-     i = 0
-     while i < len(langlist) - 1:
-         if langlist[i] == langlist[i+1]:
-             textlist[i] += textlist[i+1]
-             del textlist[i+1]
-             del langlist[i+1]
-         else:
-             i += 1
-
-     return textlist, langlist
-
-
  def clean_text_inf(text, language):
-     formattext = ""
-     language = language.replace("all_","")
-     for tmp in LangSegment.getTexts(text):
-         if language == "ja":
-             if tmp["lang"] == language or tmp["lang"] == "zh":
-                 formattext += tmp["text"] + " "
-             continue
-         if tmp["lang"] == language:
-             formattext += tmp["text"] + " "
-     while "  " in formattext:
-         formattext = formattext.replace("  ", " ")
-     phones, word2ph, norm_text = clean_text(formattext, language)
      phones = cleaned_text_to_sequence(phones)
      return phones, word2ph, norm_text
 
@@ -261,57 +228,6 @@ def get_bert_inf(phones, word2ph, norm_text, language):
261
  return bert
262
 
263
 
264
- def nonen_clean_text_inf(text, language):
265
- if(language!="auto"):
266
- textlist, langlist = splite_en_inf(text, language)
267
- else:
268
- textlist=[]
269
- langlist=[]
270
- for tmp in LangSegment.getTexts(text):
271
- langlist.append(tmp["lang"])
272
- textlist.append(tmp["text"])
273
- print(textlist)
274
- print(langlist)
275
- phones_list = []
276
- word2ph_list = []
277
- norm_text_list = []
278
- for i in range(len(textlist)):
279
- lang = langlist[i]
280
- phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
281
- phones_list.append(phones)
282
- if lang == "zh":
283
- word2ph_list.append(word2ph)
284
- norm_text_list.append(norm_text)
285
- print(word2ph_list)
286
- phones = sum(phones_list, [])
287
- word2ph = sum(word2ph_list, [])
288
- norm_text = ' '.join(norm_text_list)
289
-
290
- return phones, word2ph, norm_text
291
-
292
-
293
- def nonen_get_bert_inf(text, language):
294
- if(language!="auto"):
295
- textlist, langlist = splite_en_inf(text, language)
296
- else:
297
- textlist=[]
298
- langlist=[]
299
- for tmp in LangSegment.getTexts(text):
300
- langlist.append(tmp["lang"])
301
- textlist.append(tmp["text"])
302
- print(textlist)
303
- print(langlist)
304
- bert_list = []
305
- for i in range(len(textlist)):
306
- lang = langlist[i]
307
- phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
308
- bert = get_bert_inf(phones, word2ph, norm_text, lang)
309
- bert_list.append(bert)
310
- bert = torch.cat(bert_list, dim=1)
311
-
312
- return bert
313
-
314
-
315
  splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", }
316
 
317
 
@@ -321,23 +237,63 @@ def get_first(text):
      return text
 
 
- def get_cleaned_text_final(text,language):
      if language in {"en","all_zh","all_ja"}:
-         phones, word2ph, norm_text = clean_text_inf(text, language)
      elif language in {"zh", "ja","auto"}:
-         phones, word2ph, norm_text = nonen_clean_text_inf(text, language)
-     return phones, word2ph, norm_text
 
- def get_bert_final(phones, word2ph, text,language,device):
-     if language == "en":
-         bert = get_bert_inf(phones, word2ph, text, language)
-     elif language in {"zh", "ja","auto"}:
-         bert = nonen_get_bert_inf(text, language)
-     elif language == "all_zh":
-         bert = get_bert_feature(text, word2ph).to(device)
-     else:
-         bert = torch.zeros((1024, len(phones))).to(device)
-     return bert
 
  def merge_short_text_in_array(texts, threshold):
      if (len(texts)) < 2:
@@ -356,108 +312,100 @@ def merge_short_text_in_array(texts, threshold):
          result[len(result) - 1] += text
      return result
 
-
- def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=("Do not split"), volume_scale=1.0):
-     if not duration(ref_wav_path):
-         return None
-     if text == '':
-         wprint("Please enter text to generate/请输入生成文字")
-         return None
      t0 = ttime()
-     startTime=timer()
-     text=trim_text(text,text_language)
-     change_sovits_weights(sovits_path)
-     tprint(f'🏕️LOADED SoVITS Model: {sovits_path}')
-     change_gpt_weights(gpt_path)
-     tprint(f'🏕️LOADED GPT Model: {gpt_path}')
-
      prompt_language = dict_language[prompt_language]
-     try:
-         text_language = dict_language[text_language]
-     except KeyError as e:
-         wprint(f"Unsupported language type: {e}")
-         return None
-
-     prompt_text = prompt_text.strip("\n")
-     if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "."
      text = text.strip("\n")
      if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text
-     # print(("实际输入的参考文本:"), prompt_text)
-     # print(("📝实际输入的目标文本:"), text)
      zero_wav = np.zeros(
          int(hps.data.sampling_rate * 0.3),
          dtype=np.float16 if is_half == True else np.float32,
      )
-     with torch.no_grad():
-         wav16k, sr = librosa.load(ref_wav_path, sr=16000)
-         if (wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000):
-             errinfo = '参考音频在3~10秒范围外,请更换!'
-             raise OSError((errinfo))
-         wav16k = torch.from_numpy(wav16k)
-         zero_wav_torch = torch.from_numpy(zero_wav)
-         if is_half == True:
-             wav16k = wav16k.half().to(device)
-             zero_wav_torch = zero_wav_torch.half().to(device)
-         else:
-             wav16k = wav16k.to(device)
-             zero_wav_torch = zero_wav_torch.to(device)
-         wav16k = torch.cat([wav16k, zero_wav_torch])
-         ssl_content = ssl_model.model(wav16k.unsqueeze(0))[
-             "last_hidden_state"
-         ].transpose(
-             1, 2
-         )  # .float()
-         codes = vq_model.extract_latent(ssl_content)
-         prompt_semantic = codes[0, 0]
-     t1 = ttime()
 
-     phones1, word2ph1, norm_text1 = get_cleaned_text_final(prompt_text, prompt_language)
 
-     if (how_to_cut == ("Split into groups of 4 sentences")):
          text = cut1(text)
-     elif (how_to_cut == ("Split every 50 characters")):
          text = cut2(text)
-     elif (how_to_cut == ("Split at CN/JP periods (。)")):
          text = cut3(text)
-     elif (how_to_cut == ("Split at English periods (.)")):
          text = cut4(text)
-     elif (how_to_cut == ("Split at punctuation marks")):
          text = cut5(text)
      while "\n\n" in text:
          text = text.replace("\n\n", "\n")
-     print(f"🧨实际输入的目标文本(切句后):{text}\n")
      texts = text.split("\n")
      texts = merge_short_text_in_array(texts, 5)
      audio_opt = []
-     bert1 = get_bert_final(phones1, word2ph1, norm_text1, prompt_language, device).to(dtype)
 
      for text in texts:
          if (len(text.strip()) == 0):
              continue
          if (text[-1] not in splits): text += "。" if text_language != "en" else "."
-         print(("\n🎈实际输入的目标文本(每句):"), text)
-         phones2, word2ph2, norm_text2 = get_cleaned_text_final(text, text_language)
-         try:
-             bert2 = get_bert_final(phones2, word2ph2, norm_text2, text_language, device).to(dtype)
-         except RuntimeError as e:
-             wprint(f"The input text does not match the language/输入文本与语言不匹配: {e}")
-             return None
-         bert = torch.cat([bert1, bert2], 1)
 
-         all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
          bert = bert.to(device).unsqueeze(0)
          all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
-         prompt = prompt_semantic.unsqueeze(0).to(device)
          t2 = ttime()
          with torch.no_grad():
              # pred_semantic = t2s_model.model.infer(
              pred_semantic, idx = t2s_model.model.infer_panel(
                  all_phoneme_ids,
                  all_phoneme_len,
-                 prompt,
                  bert,
                  # prompt_phone_len=ph_offset,
-                 top_k=config["inference"]["top_k"],
                  early_stop_num=hz * max_sec,
              )
          t3 = ttime()
@@ -471,34 +419,24 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
          else:
              refer = refer.to(device)
          # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
-         try:
-             audio = (
                  vq_model.decode(
                      pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer
                  )
                  .detach()
                  .cpu()
                  .numpy()[0, 0]
-             )
-         except RuntimeError as e:
-             wprint(f"The input text does not match the language/输入文本与语言不匹配: {e}")
-             return None
-
-         max_audio = np.abs(audio).max()
          if max_audio > 1: audio /= max_audio
          audio_opt.append(audio)
          audio_opt.append(zero_wav)
          t4 = ttime()
      print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
-     # yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
-     audio_data = (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
-
-     audio_data = (audio_data.astype(np.float32) * volume_scale).astype(np.int16)
-     output_wav = "output_audio.wav"
-     sf.write(output_wav, audio_data, hps.data.sampling_rate)
-     endTime = timer()
-     tprint(f'🆗TTS COMPLETE,{round(endTime-startTime,4)}s')
-     return output_wav
 
  def split(todo_text):
      todo_text = todo_text.replace("……", "。").replace("——", ",")
@@ -509,7 +447,7 @@ def split(todo_text):
      todo_texts = []
      while 1:
          if i_split_head >= len_text:
-             break
          if todo_text[i_split_head] in splits:
              i_split_head += 1
              todo_texts.append(todo_text[i_split_tail:i_split_head])
@@ -530,6 +468,7 @@ def cut1(inp):
              opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]]))
      else:
          opts = [inp]
      return "\n".join(opts)
 
@@ -551,35 +490,49 @@ def cut2(inp):
          if tmp_str != "":
              opts.append(tmp_str)
      # print(opts)
-     if len(opts) > 1 and len(opts[-1]) < 50:
          opts[-2] = opts[-2] + opts[-1]
          opts = opts[:-1]
      return "\n".join(opts)
 
 
  def cut3(inp):
      inp = inp.strip("\n")
-     return "\n".join(["%s" % item for item in inp.strip("。").split("。")])
-
 
  def cut4(inp):
      inp = inp.strip("\n")
-     return "\n".join(["%s" % item for item in inp.strip(".").split(".")])
 
 
  # contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
  def cut5(inp):
-     # if not re.search(r'[^\w\s]', inp[-1]):
-     #     inp += '。'
      inp = inp.strip("\n")
-     punds = r'[,.;?!、,。?!;:…]'
-     items = re.split(f'({punds})', inp)
-     mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
-     if len(items)%2 == 1:
-         mergeitems.append(items[-1])
-     opt = "\n".join(mergeitems)
-     return opt
 
 
  def custom_sort_key(s):
@@ -589,312 +542,120 @@ def custom_sort_key(s):
      parts = [int(part) if part.isdigit() else part for part in parts]
      return parts
 
- # ==========custom functions============
-
- def tprint(text):
-     now = datetime.now(tz).strftime('%H:%M:%S')
-     print(f'UTC+8 - {now} - {text}')
-
- def wprint(text):
-     tprint(text)
-     gr.Warning(text)
-
- def lang_detector(text):
-     min_chars = 5
-     if len(text) < min_chars:
-         return "Input text too short/输入文本太短"
-     try:
-         detector = Detector(text).language
-         lang_info = str(detector)
-         code = re.search(r"name: (\w+)", lang_info).group(1)
-         if code == 'Japanese':
-             return "日本語"
-         elif code == 'Chinese':
-             return "中文"
-         elif code == 'English':
-             return 'English'
          else:
-             return code
-     except Exception as e:
-         return f"ERROR:{str(e)}"
-
- def trim_text(text, language):
-     limit_cj = 120  # characters
-     limit_en = 60  # words
-     search_limit_cj = limit_cj + 30
-     search_limit_en = limit_en + 30
-     text = text.replace('\n', '').strip()
-
-     if language == 'English':
-         words = text.split()
-         if len(words) <= limit_en:
-             return text
-         # English
-         for i in range(limit_en, -1, -1):
-             if any(punct in words[i] for punct in splits):
-                 return ' '.join(words[:i+1])
-         for i in range(limit_en, min(len(words), search_limit_en)):
-             if any(punct in words[i] for punct in splits):
-                 return ' '.join(words[:i+1])
-         return ' '.join(words[:limit_en])
-
-     else:  # Chinese/Japanese
-         if len(text) <= limit_cj:
-             return text
-         for i in range(limit_cj, -1, -1):
-             if text[i] in splits:
-                 return text[:i+1]
-         for i in range(limit_cj, min(len(text), search_limit_cj)):
-             if text[i] in splits:
-                 return text[:i+1]
-         return text[:limit_cj]
-
- def duration(audio_file_path):
-     if not audio_file_path:
-         wprint("Failed to obtain uploaded audio/未找到音频文件")
-         return False
-     try:
-         audio_duration = librosa.get_duration(filename=audio_file_path)
-         if not 3 < audio_duration < 10:
-             wprint("The audio length must be between 3~10 seconds/音频时长须在3~10秒之间")
-             return False
-         return True
-     except FileNotFoundError:
-         return False
-
- def update_model(choice):
-     global gpt_path, sovits_path
-     model_info = models[choice]
-     gpt_path = abs_path(model_info["gpt_weight"])
-     sovits_path = abs_path(model_info["sovits_weight"])
-     model_name = choice
-     tone_info = model_info["tones"]["tone1"]
-     tone_sample_path = abs_path(tone_info["sample"])
-     tprint(f'✅SELECT MODEL:{choice}')
-     # return the default tone "tone1"
-     return (
-         tone_info["example_voice_wav"],
-         tone_info["example_voice_wav_words"],
-         model_info["default_language"],
-         model_info["default_language"],
-         model_name,
-         "tone1",
-         tone_sample_path
-     )
 
- def update_tone(model_choice, tone_choice):
-     model_info = models[model_choice]
-     tone_info = model_info["tones"][tone_choice]
-     example_voice_wav = abs_path(tone_info["example_voice_wav"])
-     example_voice_wav_words = tone_info["example_voice_wav_words"]
-     tone_sample_path = abs_path(tone_info["sample"])
-     return example_voice_wav, example_voice_wav_words, tone_sample_path
-
- def transcribe(voice):
-     time1 = timer()
-     tprint('⚡Start Clone - transcribe')
-     task = "transcribe"
-     if voice is None:
-         wprint("No audio file submitted! Please upload or record an audio file before submitting your request.")
-     R = pipe(voice, batch_size=8, generate_kwargs={"task": task}, return_timestamps=True, return_language=True)
-     text = R['text']
-     lang = R['chunks'][0]['language']
-     if lang == 'english':
-         language = 'English'
-     elif lang == 'chinese':
-         language = '中文'
-     elif lang == 'japanese':
-         language = '日本語'
-
-     time2 = timer()
-     tprint(f'transcribe COMPLETE,{round(time2-time1,4)}s')
-     tprint(f'\nTRANSCRIBE RESULT:\n 🔣Language:{language} \n 🔣Text:{text}')
-     return text, language
-
- def clone_voice(user_voice, user_text, user_lang):
-     if not duration(user_voice):
-         return None
-     if user_text == '':
-         wprint("Please enter text to generate/请输入生成文字")
-         return None
-     user_text = trim_text(user_text, user_lang)
-     time1 = timer()
-     global gpt_path, sovits_path
-     gpt_path = abs_path("pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt")
-     # tprint(f'Model loaded:{gpt_path}')
-     sovits_path = abs_path("pretrained_models/s2G488k.pth")
-     # tprint(f'Model loaded:{sovits_path}')
-     try:
-         prompt_text, prompt_language = transcribe(user_voice)
-     except UnboundLocalError as e:
-         wprint(f"The language in the audio cannot be recognized :{str(e)}")
-         return None
-
-     output_wav = get_tts_wav(
-         user_voice,
-         prompt_text,
-         prompt_language,
-         user_text,
-         user_lang,
-         how_to_cut="Do not split",
-         volume_scale=1.0)
-     time2 = timer()
-     tprint(f'🆗CLONE COMPLETE,{round(time2-time1,4)}s')
-     return output_wav
-
- with open('dummy') as f:
-     dummy_txt = f.read().strip().splitlines()
-
- def dice():
-     return random.choice(dummy_txt), '🎲'
-
- from info import models
- models_by_language = {
-     "English": [],
-     "中文": [],
-     "日本語": []
- }
- for model_name, model_info in models.items():
-     language = model_info["default_language"]
-     models_by_language[language].append((model_name, model_info))
-
- ##########GRADIO###########
-
- with gr.Blocks(theme='Kasien/ali_theme_custom') as app:
-     gr.HTML('''
-     <h1 style="font-size: 25px;">TEXT TO SPEECH</h1>
-     <h1 style="font-size: 20px;">Support English/Chinese/Japanese</h1>
-     <p style="margin-bottom: 10px; font-size: 100%">
-     If you like this space, please click the ❤️ at the top of the page..如喜欢,请点一下页面顶部的❤️<br>
-     </p>''')
-
-     gr.Markdown("""* This space is based on the text-to-speech generation solution [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS) .
-     You can visit the repo's github homepage to learn training and inference.<br>
-     本空间基于文字转语音生成方案 [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS). 你可以前往项目的github主页学习如何推理和训练。
-     * ⚠️Generating voice is very slow due to using HuggingFace's free CPU in this space.
-     For faster generation, click the Colab icon below to use this space in Colab,
-     which will significantly improve the speed.<br>
-     由于本空间使用huggingface的免费CPU进行推理,因此速度很慢,如想快速生成,请点击下方的Colab图标,
-     前往Colab使用已获得更快的生成速度。
-     <br>Colabの使用を強くお勧めします。より速い生成速度が得られます。
-     * each model can speak three languages.<br>每个模型都能说三种语言<br>各モデルは3つの言語を話すことができます。""")
-     gr.HTML('''<a href="https://colab.research.google.com/drive/1fTuPZ4tZsAjS-TrhQWMCb7KRdnU8aF6j" target="_blank"><img src="https://camo.githubusercontent.com/dd83d4a334eab7ada034c13747d9e2237182826d32e3fda6629740b6e02f18d8/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6c61622d4639414230303f7374796c653d666f722d7468652d6261646765266c6f676f3d676f6f676c65636f6c616226636f6c6f723d353235323532" alt="colab"></a>
-     ''')
-
-     default_voice_wav, default_voice_wav_words, default_language, _, default_model_name, _, default_tone_sample_path = update_model("Trump")
-     english_models = [name for name, _ in models_by_language["English"]]
-     chinese_models = [name for name, _ in models_by_language["中文"]]
-     japanese_models = [name for name, _ in models_by_language["日本語"]]
-     with gr.Row():
-         english_choice = gr.Radio(english_models, label="EN", value="Trump", scale=3)
-         chinese_choice = gr.Radio(chinese_models, label="ZH", scale=2)
-         japanese_choice = gr.Radio(japanese_models, label="JA", scale=4)
-
-     plsh = '''
-     Support【English/中文/日本語】,Input text here / 在这輸入文字 /ここにテキストを入力する。
-
-     If you don't know what to input, you can click the dice on the right, and random text will appear.
-     如果你不知道输入什么,可以点击右边的骰子,会出现随机文本。
-     入力するものがわからない場合は、右側のサイコロをクリックすると、ランダムなテキストが表示されます。
-
-     '''
-     limit = 'Max 70 words. Excess will be ignored./单次最多处理120字左右,多余的会被忽略'
-
-     gr.HTML('''
-     <b>Input Text/输入文字</b>''')
-     with gr.Row():
-         with gr.Column(scale=2):
-             model_name = gr.Textbox(label="Selected Model/已选模型", value=default_model_name, interactive=False, scale=1,)
-             text_language = gr.Textbox(
-                 label="Language for input text/生成语言",
-                 info='Automatic detection of input language type.', scale=1, interactive=False
-             )
-         text = gr.Textbox(label="INPUT TEXT", lines=5, placeholder=plsh, info=limit, scale=10, min_width=0)
-         ddice = gr.Button('🎲', variant='tool', min_width=0, scale=0)
-
-         ddice.click(dice, outputs=[text, ddice])
-         text.change(lang_detector, text, text_language)
-
-     with gr.Row():
-         with gr.Column(scale=2):
-             tone_select = gr.Radio(
-                 label="Select Tone/选择语气",
-                 choices=["tone1", "tone2", "tone3"],
-                 value="tone1",
-                 info='Tone influences the emotional expression', scale=1)
-         tone_sample = gr.Audio(label="🔊Preview tone/试听语气", scale=8)
-
-     with gr.Accordion(label="prompt voice", open=False, visible=False):
-         with gr.Row(visible=True):
-             inp_ref = gr.Audio(label="Reference audio", type="filepath", value=default_voice_wav, scale=3)
-             prompt_text = gr.Textbox(label="Reference text", value=default_voice_wav_words, scale=3)
-             prompt_language = gr.Dropdown(label="Language of the reference audio", choices=["中文", "English", "日本語"], value=default_language, scale=1, interactive=False)
-             dummy = gr.Radio(choices=["中文", "English", "日本語"], visible=False)
-
-     with gr.Accordion(label="Additional generation options/附加生成选项", open=False):
-         how_to_cut = gr.Dropdown(
-             label=("How to split?"),
-             choices=[("Do not split"), ("Split into groups of 4 sentences"), ("Split every 50 characters"),
-                      ("Split at CN/JP periods (。)"), ("Split at English periods (.)"), ("Split at punctuation marks"), ],
-             value=("Split into groups of 4 sentences"),
-             interactive=True,
-             info='A suitable splitting method can achieve better generation results'
-         )
-         volume = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.01, label='Volume/音量')
-
-     gr.HTML('''
-     <b>Generate Voice/生成</b>''')
-     with gr.Row():
-         main_button = gr.Button("✨Generate Voice", variant="primary", scale=2)
-         output = gr.Audio(label="💾Download it by clicking ⬇️", scale=6)
-         # info = gr.Textbox(label="INFO", visible=True, readonly=True, scale=1)
-
-     gr.HTML('''
-     Generation is slower, please be patient and wait/合成比较慢,请耐心等待<br>
-     If it generated silence, please try again./如果生成了空白声音,请重试
-     <br><br><br><br>
-     <h1 style="font-size: 25px;">Clone custom Voice/克隆自定义声音</h1>
-     <p style="margin-bottom: 10px; font-size: 100%">
-     需要3~10秒语音,克隆后的声音和原音相似度80%以上<br>
-     Requires 3-10 seconds of voice input. The cloned voice will have a similarity of 80% or above compared to the original.<br>
-     3~10秒の音声入力が必要です。クローンされた音声は、オリジナルと80%以上の類似性があります。
-
-     </p>''')
-
-     with gr.Row():
-         user_voice = gr.Audio(type="filepath", label="(3~10s)Upload or Record audio/上传或录制声音", scale=3)
-         with gr.Column(scale=7):
-             user_lang = gr.Textbox(label="Language/生成语言", info='Automatic detection of input language type.', interactive=False)
-             with gr.Row():
-                 user_text = gr.Textbox(label="Text for generation/输入想要生成语音的文字", lines=5, placeholder=plsh, info=limit)
-                 dddice = gr.Button('🎲', variant='tool', min_width=0, scale=0)
-
-     dddice.click(dice, outputs=[user_text, dddice])
-     user_text.change(lang_detector, user_text, user_lang)
-
-     user_button = gr.Button("✨Clone Voice", variant="primary")
-     user_output = gr.Audio(label="💾Download it by clicking ⬇️")
-
-     gr.HTML('''<div align=center><img id="visitor-badge" alt="visitor badge" src="https://visitor-badge.laobi.icu/badge?page_id=Ailyth/DLMP9" /></div>''')
-
-     english_choice.change(update_model, inputs=[english_choice], outputs=[inp_ref, prompt_text, prompt_language, dummy, model_name, tone_select, tone_sample])
-     chinese_choice.change(update_model, inputs=[chinese_choice], outputs=[inp_ref, prompt_text, prompt_language, dummy, model_name, tone_select, tone_sample])
-     japanese_choice.change(update_model, inputs=[japanese_choice], outputs=[inp_ref, prompt_text, prompt_language, dummy, model_name, tone_select, tone_sample])
-     tone_select.change(update_tone, inputs=[model_name, tone_select], outputs=[inp_ref, prompt_text, tone_sample])
-
-     main_button.click(
-         get_tts_wav,
-         inputs=[inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, volume],
-         outputs=[output])
-
-     user_button.click(
-         clone_voice,
-         inputs=[user_voice, user_text, user_lang],
-         outputs=[user_output])
-
- app.launch(share=True, show_api=False).queue(api_open=False)
+ '''
+ Mixed Chinese/English recognition
+ Mixed Japanese/English recognition
+ Multilingual: split, then detect the language of each segment
+ Treat all input as Chinese
+ Treat all input as English
+ Treat all input as Japanese
+ '''
+ import os, re, logging
+ import LangSegment
  logging.getLogger("markdown_it").setLevel(logging.ERROR)
  logging.getLogger("urllib3").setLevel(logging.ERROR)
  logging.getLogger("httpcore").setLevel(logging.ERROR)
  logging.getLogger("asyncio").setLevel(logging.ERROR)
  logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
  logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
+ import pdb
+ import torch
+
+ if os.path.exists("./gweight.txt"):
+     with open("./gweight.txt", 'r', encoding="utf-8") as file:
+         gweight_data = file.read()
+         gpt_path = os.environ.get(
+             "gpt_path", gweight_data)
+ else:
+     gpt_path = os.environ.get(
+         "gpt_path", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt")
 
+ if os.path.exists("./sweight.txt"):
+     with open("./sweight.txt", 'r', encoding="utf-8") as file:
+         sweight_data = file.read()
+         sovits_path = os.environ.get("sovits_path", sweight_data)
+ else:
+     sovits_path = os.environ.get("sovits_path", "GPT_SoVITS/pretrained_models/s2G488k.pth")
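+ # Note: gweight.txt / sweight.txt, when present, supply the last-used checkpoint paths;
+ # the gpt_path / sovits_path environment variables still take precedence over both.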
+ # gpt_path = os.environ.get(
+ #     "gpt_path", "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+ # )
+ # sovits_path = os.environ.get("sovits_path", "pretrained_models/s2G488k.pth")
+ cnhubert_base_path = os.environ.get(
+     "cnhubert_base_path", "GPT_SoVITS/pretrained_models/chinese-hubert-base"
+ )
+ bert_path = os.environ.get(
+     "bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
+ )
+ infer_ttswebui = os.environ.get("infer_ttswebui", 9872)
+ infer_ttswebui = int(infer_ttswebui)
+ is_share = os.environ.get("is_share", "False")
+ is_share = eval(is_share)
  if "_CUDA_VISIBLE_DEVICES" in os.environ:
      os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
+ is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
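+ # fp16 inference is used only when is_half requests it AND a CUDA device is actually available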
+ punctuation = set(['!', '?', '…', ',', '.', '-', " "])
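+ # used below both to drop split segments that are punctuation-only and to collapse repeated marks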
+ import gradio as gr
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
+ import numpy as np
+ import librosa
+ from feature_extractor import cnhubert
+
  cnhubert.cnhubert_base_path = cnhubert_base_path
 
+ from module.models import SynthesizerTrn
+ from AR.models.t2s_lightning_module import Text2SemanticLightningModule
+ from text import cleaned_text_to_sequence
+ from text.cleaner import clean_text
+ from time import time as ttime
+ from module.mel_processing import spectrogram_torch
+ from tools.my_utils import load_audio
+ from tools.i18n.i18n import I18nAuto
 
+ i18n = I18nAuto()
 
+ # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'  # ensure this is also set when the inference UI is launched directly
 
+ if torch.cuda.is_available():
+     device = "cuda"
+ else:
+     device = "cpu"
 
  tokenizer = AutoTokenizer.from_pretrained(bert_path)
  bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
  dict_language = {
+     i18n("中文"): "all_zh",  # treat all input as Chinese
+     i18n("英文"): "en",  # treat all input as English (unchanged)
+     i18n("日文"): "all_ja",  # treat all input as Japanese
+     i18n("中英混合"): "zh",  # mixed Chinese/English recognition (unchanged)
+     i18n("日英混合"): "ja",  # mixed Japanese/English recognition (unchanged)
+     i18n("多语种混合"): "auto",  # multilingual: split, then detect the language of each segment
  }
 
 
  def clean_text_inf(text, language):
+     phones, word2ph, norm_text = clean_text(text, language)
      phones = cleaned_text_to_sequence(phones)
      return phones, word2ph, norm_text
 
      return bert
 
 
  splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", }
 
 
      return text
 
 
+ def get_phones_and_bert(text,language):
      if language in {"en","all_zh","all_ja"}:
+         language = language.replace("all_","")
+         if language == "en":
+             LangSegment.setfilters(["en"])
+             formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text))
+         else:
+             # Chinese and Japanese kanji cannot be told apart, so trust the user's language choice
+             formattext = text
+         while "  " in formattext:
+             formattext = formattext.replace("  ", " ")
+         phones, word2ph, norm_text = clean_text_inf(formattext, language)
+         if language == "zh":
+             bert = get_bert_feature(norm_text, word2ph).to(device)
+         else:
+             bert = torch.zeros(
+                 (1024, len(phones)),
+                 dtype=torch.float16 if is_half == True else torch.float32,
+             ).to(device)
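+         # non-Chinese text gets an all-zero (1024, n_phones) BERT placeholder so later concatenation still lines up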
      elif language in {"zh", "ja","auto"}:
+         textlist = []
+         langlist = []
+         LangSegment.setfilters(["zh", "ja", "en", "ko"])
+         if language == "auto":
+             for tmp in LangSegment.getTexts(text):
+                 if tmp["lang"] == "ko":
+                     langlist.append("zh")
+                     textlist.append(tmp["text"])
+                 else:
+                     langlist.append(tmp["lang"])
+                     textlist.append(tmp["text"])
+         else:
+             for tmp in LangSegment.getTexts(text):
+                 if tmp["lang"] == "en":
+                     langlist.append(tmp["lang"])
+                 else:
+                     # Chinese and Japanese kanji cannot be told apart, so trust the user's language choice
+                     langlist.append(language)
+                 textlist.append(tmp["text"])
+         print(textlist)
+         print(langlist)
+         phones_list = []
+         bert_list = []
+         norm_text_list = []
+         for i in range(len(textlist)):
+             lang = langlist[i]
+             phones, word2ph, norm_text = clean_text_inf(textlist[i], lang)
+             bert = get_bert_inf(phones, word2ph, norm_text, lang)
+             phones_list.append(phones)
+             norm_text_list.append(norm_text)
+             bert_list.append(bert)
+         bert = torch.cat(bert_list, dim=1)
+         phones = sum(phones_list, [])
+         norm_text = ''.join(norm_text_list)
+
+     return phones, bert.to(dtype), norm_text
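+ # get_phones_and_bert thus flattens mixed-language input into one phone sequence plus one concatenated BERT feature matrix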
 
 
  def merge_short_text_in_array(texts, threshold):
      if (len(texts)) < 2:
          result[len(result) - 1] += text
      return result
 
+ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free=False):
+     if prompt_text is None or len(prompt_text) == 0:
+         ref_free = True
      t0 = ttime()
      prompt_language = dict_language[prompt_language]
+     text_language = dict_language[text_language]
+     if not ref_free:
+         prompt_text = prompt_text.strip("\n")
+         if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_language != "en" else "."
+         print(i18n("实际输入的参考文本:"), prompt_text)
      text = text.strip("\n")
+     text = replace_consecutive_punctuation(text)
      if (text[0] not in splits and len(get_first(text)) < 4): text = "。" + text if text_language != "en" else "." + text
+
+     print(i18n("实际输入的目标文本:"), text)
      zero_wav = np.zeros(
          int(hps.data.sampling_rate * 0.3),
          dtype=np.float16 if is_half == True else np.float32,
      )
+     if not ref_free:
+         with torch.no_grad():
+             wav16k, sr = librosa.load(ref_wav_path, sr=16000)
+             if (wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000):
+                 raise OSError(i18n("参考音频在3~10秒范围外,请更换!"))
+             wav16k = torch.from_numpy(wav16k)
+             zero_wav_torch = torch.from_numpy(zero_wav)
+             if is_half == True:
+                 wav16k = wav16k.half().to(device)
+                 zero_wav_torch = zero_wav_torch.half().to(device)
+             else:
+                 wav16k = wav16k.to(device)
+                 zero_wav_torch = zero_wav_torch.to(device)
+             wav16k = torch.cat([wav16k, zero_wav_torch])
+             ssl_content = ssl_model.model(wav16k.unsqueeze(0))[
+                 "last_hidden_state"
+             ].transpose(
+                 1, 2
+             )  # .float()
+             codes = vq_model.extract_latent(ssl_content)
+             prompt_semantic = codes[0, 0]
+             prompt = prompt_semantic.unsqueeze(0).to(device)
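+             # the reference audio's discrete semantic codes become the prompt that conditions the GPT stage below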
 
+     t1 = ttime()
 
+     if (how_to_cut == i18n("凑四句一切")):
          text = cut1(text)
+     elif (how_to_cut == i18n("凑50字一切")):
          text = cut2(text)
+     elif (how_to_cut == i18n("按中文句号。切")):
          text = cut3(text)
+     elif (how_to_cut == i18n("按英文句号.切")):
          text = cut4(text)
+     elif (how_to_cut == i18n("按标点符号切")):
          text = cut5(text)
      while "\n\n" in text:
          text = text.replace("\n\n", "\n")
+     print(i18n("实际输入的目标文本(切句后):"), text)
      texts = text.split("\n")
+     texts = process_text(texts)
      texts = merge_short_text_in_array(texts, 5)
      audio_opt = []
+     if not ref_free:
+         phones1, bert1, norm_text1 = get_phones_and_bert(prompt_text, prompt_language)
 
      for text in texts:
+         # skip blank lines in the target text so they don't raise errors
          if (len(text.strip()) == 0):
              continue
          if (text[-1] not in splits): text += "。" if text_language != "en" else "."
+         print(i18n("实际输入的目标文本(每句):"), text)
+         phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language)
+         print(i18n("前端处理后的文本(每句):"), norm_text2)
+         if not ref_free:
+             bert = torch.cat([bert1, bert2], 1)
+             all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
+         else:
+             bert = bert2
+             all_phoneme_ids = torch.LongTensor(phones2).to(device).unsqueeze(0)
 
          bert = bert.to(device).unsqueeze(0)
          all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
+
          t2 = ttime()
          with torch.no_grad():
              # pred_semantic = t2s_model.model.infer(
              pred_semantic, idx = t2s_model.model.infer_panel(
                  all_phoneme_ids,
                  all_phoneme_len,
+                 None if ref_free else prompt,
                  bert,
                  # prompt_phone_len=ph_offset,
+                 top_k=top_k,
+                 top_p=top_p,
+                 temperature=temperature,
                  early_stop_num=hz * max_sec,
              )
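+             # top_k / top_p / temperature are forwarded straight from the UI sliders to GPT sampling;
+             # in ref-free mode the semantic prompt is omitted entirely (None)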
          t3 = ttime()
          else:
              refer = refer.to(device)
          # audio = vq_model.decode(pred_semantic, all_phoneme_ids, refer).detach().cpu().numpy()[0, 0]
+         audio = (
              vq_model.decode(
                  pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refer
              )
              .detach()
              .cpu()
              .numpy()[0, 0]
+         )  # try reconstructing without including the prompt part
+         max_audio = np.abs(audio).max()  # simple guard against 16-bit clipping
          if max_audio > 1: audio /= max_audio
          audio_opt.append(audio)
          audio_opt.append(zero_wav)
          t4 = ttime()
      print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
+     yield hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(
+         np.int16
+     )
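+     # yielding a (sample_rate, int16 ndarray) tuple is the format Gradio's Audio output component accepts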
+
 
  def split(todo_text):
      todo_text = todo_text.replace("……", "。").replace("——", ",")
      todo_texts = []
      while 1:
          if i_split_head >= len_text:
+             break  # the text always ends with punctuation, so just break; the final segment was appended in the previous pass
          if todo_text[i_split_head] in splits:
              i_split_head += 1
              todo_texts.append(todo_text[i_split_tail:i_split_head])
 
              opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]]))
      else:
          opts = [inp]
+     opts = [item for item in opts if not set(item).issubset(punctuation)]
      return "\n".join(opts)
 
 
          if tmp_str != "":
              opts.append(tmp_str)
      # print(opts)
+     if len(opts) > 1 and len(opts[-1]) < 50:  # if the last piece is too short, merge it with the previous one
          opts[-2] = opts[-2] + opts[-1]
          opts = opts[:-1]
+     opts = [item for item in opts if not set(item).issubset(punctuation)]
      return "\n".join(opts)
 
  def cut3(inp):
      inp = inp.strip("\n")
+     opts = ["%s" % item for item in inp.strip("。").split("。")]
+     opts = [item for item in opts if not set(item).issubset(punctuation)]
+     return "\n".join(opts)
 
  def cut4(inp):
      inp = inp.strip("\n")
+     opts = ["%s" % item for item in inp.strip(".").split(".")]
+     opts = [item for item in opts if not set(item).issubset(punctuation)]
+     return "\n".join(opts)
 
 
  # contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
  def cut5(inp):
      inp = inp.strip("\n")
+     punds = {',', '.', ';', '?', '!', '、', ',', '。', '?', '!', ';', ':', '…'}
+     mergeitems = []
+     items = []
+
+     for i, char in enumerate(inp):
+         if char in punds:
+             if char == '.' and i > 0 and i < len(inp) - 1 and inp[i - 1].isdigit() and inp[i + 1].isdigit():
+                 items.append(char)
+             else:
+                 items.append(char)
+                 mergeitems.append("".join(items))
+                 items = []
+         else:
+             items.append(char)
 
+     if items:
+         mergeitems.append("".join(items))
+
+     opt = [item for item in mergeitems if not set(item).issubset(punds)]
+     return "\n".join(opt)
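+ # note: the isdigit() guard in cut5 keeps decimal numbers such as "3.14" intact instead of splitting at the dot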
 
 
  def custom_sort_key(s):
      parts = [int(part) if part.isdigit() else part for part in parts]
      return parts
 
+ def process_text(texts):
+     _text = []
+     if all(text in [None, " ", "\n", ""] for text in texts):
+         raise ValueError(i18n("请输入有效文本"))
+     for text in texts:
+         if text in [None, " ", ""]:
+             pass
          else:
+             _text.append(text)
+     return _text
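+ # process_text drops empty or whitespace-only lines and raises only when nothing usable remains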
 
 
+ def replace_consecutive_punctuation(text):
+     punctuations = ''.join(re.escape(p) for p in punctuation)
+     pattern = f'([{punctuations}])([{punctuations}])+'
+     result = re.sub(pattern, r'\1', text)
+     return result
 
 
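+ # collapses runs such as "!!" or "?!" down to their first mark before the text is split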
+ def change_choices():
+     SoVITS_names, GPT_names = get_weights_names()
+     return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names, key=custom_sort_key), "__type__": "update"}
 
 
+ pretrained_sovits_name = "GPT_SoVITS/pretrained_models/s2G488k.pth"
+ pretrained_gpt_name = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+ SoVITS_weight_root = "SoVITS_weights"
+ GPT_weight_root = "GPT_weights"
+ os.makedirs(SoVITS_weight_root, exist_ok=True)
+ os.makedirs(GPT_weight_root, exist_ok=True)
+
+
+ def get_weights_names():
+     SoVITS_names = [pretrained_sovits_name]
+     for name in os.listdir(SoVITS_weight_root):
+         if name.endswith(".pth"): SoVITS_names.append("%s/%s" % (SoVITS_weight_root, name))
+     GPT_names = [pretrained_gpt_name]
+     for name in os.listdir(GPT_weight_root):
+         if name.endswith(".ckpt"): GPT_names.append("%s/%s" % (GPT_weight_root, name))
+     return SoVITS_names, GPT_names
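+ # the model dropdowns always list the pretrained defaults plus any user-trained weights found in the two folders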
+
+ SoVITS_names, GPT_names = get_weights_names()
+
+ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
+     gr.Markdown(
+         value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>LICENSE</b>.")
+     )
+     with gr.Group():
+         gr.Markdown(value=i18n("模型切换"))
+         with gr.Row():
+             GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True)
+             SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True)
+             refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
+             refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
+             SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown], [])
+             GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
+         gr.Markdown(value=i18n("*请上传并填写参考信息"))
+         with gr.Row():
+             inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath")
+             with gr.Column():
+                 ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True)
+                 gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。"))
+             prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="")
+             prompt_language = gr.Dropdown(
+                 label=i18n("参考音频的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文")
+             )
+         gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式"))
+         with gr.Row():
+             text = gr.Textbox(label=i18n("需要合成的文本"), value="")
+             text_language = gr.Dropdown(
+                 label=i18n("需要合成的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文")
+             )
+             how_to_cut = gr.Radio(
+                 label=i18n("怎么切"),
+                 choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ],
+                 value=i18n("凑四句一切"),
+                 interactive=True,
+             )
+             with gr.Row():
+                 gr.Markdown(value=i18n("gpt采样参数(无参考文本时不要太低):"))
+                 top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=5, interactive=True)
+                 top_p = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True)
+                 temperature = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True)
+             inference_button = gr.Button(i18n("合成语音"), variant="primary")
+             output = gr.Audio(label=i18n("输出的语音"))
+
+         inference_button.click(
+             get_tts_wav,
+             [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free],
+             [output],
+         )
+
+         gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
+         with gr.Row():
+             text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="")
+             button1 = gr.Button(i18n("凑四句一切"), variant="primary")
+             button2 = gr.Button(i18n("凑50字一切"), variant="primary")
+             button3 = gr.Button(i18n("按中文句号。切"), variant="primary")
+             button4 = gr.Button(i18n("按英文句号.切"), variant="primary")
+             button5 = gr.Button(i18n("按标点符号切"), variant="primary")
+             text_opt = gr.Textbox(label=i18n("切分后文本"), value="")
+             button1.click(cut1, [text_inp], [text_opt])
+             button2.click(cut2, [text_inp], [text_opt])
+             button3.click(cut3, [text_inp], [text_opt])
+             button4.click(cut4, [text_inp], [text_opt])
+             button5.click(cut5, [text_inp], [text_opt])
+         gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。"))
+
+ if __name__ == '__main__':
+     app.queue(concurrency_count=511, max_size=1022).launch(
+         server_name="0.0.0.0",
+         inbrowser=True,
+         share=is_share,
+         server_port=infer_ttswebui,
+         quiet=True,
+     )