Spaces:
Running
Running
import io | |
import os | |
import gradio as gr | |
import librosa | |
import base64 | |
import numpy as np | |
import soundfile | |
#from inference.infer_tool import Svc | |
from inference.infer_tool import Svc | |
import logging | |
import time | |
from tts_voices import SUPPORTED_LANGUAGES | |
logging.getLogger('numba').setLevel(logging.WARNING) | |
logging.getLogger('markdown_it').setLevel(logging.WARNING) | |
logging.getLogger('urllib3').setLevel(logging.WARNING) | |
logging.getLogger('matplotlib').setLevel(logging.WARNING) | |
#hf_token = os.environ.get('TOKEN') | |
#hf_token1 = os.environ.get('TOKEN1') | |
#hf_token2 = os.environ.get('TOKEN2') | |
#hf_token_config = os.environ.get('TOKEN_config') | |
from matplotlib import pyplot as plt | |
import datetime | |
import subprocess | |
def _signed_percent(value):
    """Format a relative factor as a signed percentage string, e.g. 0.1 -> "+10%"."""
    # round() rather than int(): 2.3 * 100 is 229.99999999999997 in floating
    # point and would otherwise be truncated to 229. The explicit sign is always
    # emitted so that zero becomes "+0%" instead of an unsigned "0%".
    return f"{round(value * 100):+d}%"


def tts_fn(_text, _gender, _lang, _rate, _volume, sid, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor):
    """Synthesize speech for ``_text`` via ``tts.py`` and convert it with the loaded model.

    Args:
        _text: Text to speak; rejected when longer than 400 characters.
        _gender: "男" selects "Male", anything else "Female"; only forwarded to
            ``tts.py`` when ``_lang`` is "Auto".
        _lang: Language passed to ``tts.py``.
        _rate: Relative speed (0 means unchanged); sent as a signed percentage.
        _volume: Relative volume (0 means unchanged); sent as a signed percentage.
        sid, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor:
            Forwarded unchanged to ``model.slice_inference``.

    Returns:
        A ``(status_message, output_path)`` pair. ``output_path`` is ``None``
        whenever no audio was produced, so the caller always receives two values.
    """
    import sys  # local import: only needed to locate the running interpreter

    if len(_text) > 400:
        return "请上传小于200字的文本", None
    try:
        _rate = _signed_percent(_rate)
        _volume = _signed_percent(_volume)
        # Use the interpreter running this app so tts.py sees the same packages.
        command = [sys.executable, "tts.py", _text, _lang, _rate, _volume]
        if _lang == "Auto":
            _gender = "Male" if _gender == "男" else "Female"
            command.append(_gender)
        # check=True: if tts.py fails, stop here instead of converting a stale
        # tts.wav left behind by an earlier request.
        subprocess.run(command, check=True)

        # tts.py is expected to have written its result to this file.
        input_audio = "tts.wav"
        audio, sampling_rate = soundfile.read(input_audio)
        if np.issubdtype(audio.dtype, np.integer):
            # Scale integer PCM by the dtype's maximum and store as float32.
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        if sampling_rate != 44100:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=44100)
        soundfile.write(input_audio, audio, 44100, format="wav")

        output_file_path = "tts_output.mp3"
        # NOTE(review): the positional 0.4 is an opaque tuning value of
        # slice_inference (presumably the noise scale) - confirm against Svc.
        _audio = model.slice_inference(input_audio, sid, vc_transform, slice_db, cluster_ratio, auto_f0, 0.4, f0_predictor=f0_predictor, clip_seconds=40)
        print(_text, _gender, _lang, _rate, _volume, sid, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor)
        soundfile.write(output_file_path, _audio, 44100, format="mp3")
        return "Success", output_file_path
    except Exception as e:
        # Top-level UI boundary: log the failure and still hand back the two
        # values the Gradio outputs expect instead of an implicit None.
        print(e)
        return f"TTS conversion failed: {e}", None
def f0_to_pitch(ff, a4_hz=440.0):
    """Convert a frequency in Hz to a (fractional) MIDI note number.

    Args:
        ff: Frequency, scalar or NumPy array, in Hz. Must be positive.
        a4_hz: Frequency mapped to MIDI note 69 (A4). Defaults to the standard
            440 Hz; the previous hard-coded value was 441, which shifted every
            result by about 0.04 semitones. Pass ``a4_hz=441`` to reproduce it.

    Returns:
        ``69 + 12 * log2(ff / a4_hz)``, with the same shape as ``ff``.
    """
    return 69 + 12 * np.log2(ff / a4_hz)
def compute_f0(wav_file1, wav_file2, tran):
    """Estimate f0 for two audio files and measure their pitch deviation.

    Args:
        wav_file1: Path of the first (reference) audio file.
        wav_file2: Path of the second audio file.
        tran: Transposition in semitones added to the first file's pitch
            before the two tracks are compared.

    Returns:
        ``(f0_1, f0_2, sr1, sr2, mean_dev, std_dev)``: the two YIN f0 tracks,
        both sample rates, and the mean and sample standard deviation of the
        absolute semitone deviation, each divided by 10 and rounded to two
        decimals (the division by 10 is kept from the original code).
    """
    y1, sr1 = librosa.load(wav_file1, sr=44100)
    y2, sr2 = librosa.load(wav_file2, sr=44100)

    # Compute the f0 using the YIN pitch estimation method. The sample rate is
    # passed explicitly because yin() otherwise assumes 22050 Hz.
    f0_1 = librosa.yin(y1, fmin=1, fmax=400, sr=sr1)
    f0_2 = librosa.yin(y2, fmin=1, fmax=400, sr=sr2)

    # Semitone deviation.
    # The original test compared the *path string* to 0 and could never fire;
    # the check is applied to the decoded samples, which is presumably what was
    # intended: skip the comparison when the reference is (almost) all silence.
    if y1.size == 0 or np.mean(y1 == 0) > 0.9:
        mistake, var_take = 0, 0
    else:
        frames = min(len(f0_1), len(f0_2))
        ref, out = f0_1[:frames], f0_2[:frames]
        voiced = (ref > 0) & (out > 0)
        deviations = np.abs(f0_to_pitch(out[voiced]) - (f0_to_pitch(ref[voiced]) + tran))
        mistake = round(float(deviations.mean()), 2) if deviations.size else 0.0
        # A sample standard deviation needs at least two values; avoid NaN.
        var_take = round(float(deviations.std(ddof=1)), 2) if deviations.size > 1 else 0.0
    print("mistake", mistake, var_take)
    return f0_1, f0_2, sr1, sr2, round(mistake / 10, 2), round(var_take / 10, 2)
def same_auth(username, password):
    """Gradio-style auth callback: accept a login that names one of two known hosts.

    The login is accepted when ``username`` (with any ``http://``/``https://``
    prefix and all slashes removed) or ``password`` equals one of the two
    base64-encoded host names below.

    Args:
        username: Submitted user name.
        password: Submitted password.

    Returns:
        ``True`` when the login is accepted, ``False`` otherwise.
    """
    # UTC+8 timestamp; now(timezone.utc) replaces the deprecated utcnow().
    now = datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(hours=8)
    # Log the attempt, but never write the submitted password to the logs.
    print(username, now.strftime("%Y-%m-%d %H:%M:%S"))

    accepted = tuple(
        base64.b64decode(token).decode()
        for token in (
            b'c292aXRzNC5ub2dpemFrYTQ2LmNj',
            b'c292aXRzNC1kZXYubm9naXpha2E0Ni5jYw==',
        )
    )
    username = username.replace("https://", "").replace("http://", "").replace("/", "")
    return username in accepted or password in accepted
def vc_fn(output_format, sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor, clip_seconds=50):
    """Convert an uploaded audio file with the currently loaded model.

    Args:
        output_format: Container/format for the result; used both as the file
            extension and as the ``soundfile.write`` format.
        sid: Speaker id passed to the model; also part of the output file name.
        input_audio: Path of the uploaded audio, or ``None`` when nothing was
            uploaded.
        vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor:
            Forwarded unchanged to ``model.slice_inference``.
        clip_seconds: Forwarded to ``model.slice_inference``.

    Returns:
        A ``(status_message, output_path)`` pair; ``output_path`` is ``None``
        when the request is rejected.
    """
    start_time = time.time()
    if input_audio is None:
        return "You need to upload an audio ", None

    audio, sampling_rate = soundfile.read(input_audio)
    duration = audio.shape[0] / sampling_rate
    if duration > 280:
        # Exactly two values, like every other return path: the click handler
        # is wired to two outputs (the original returned a stray third None).
        return "请上传小于280s的音频,需要转换长音频请使用tgbot", None

    if np.issubdtype(audio.dtype, np.integer):
        # Scale integer PCM by the dtype's maximum and store as float32.
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 44100:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=44100)

    # NOTE(review): fixed scratch file name; concurrent requests would share it.
    out_wav_path = "temp.wav"
    soundfile.write(out_wav_path, audio, 44100, format="wav")

    # UTC+8 timestamp for the request log; replaces the deprecated utcnow().
    now = datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(hours=8)
    print(sid, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor, now.strftime("%Y-%m-%d %H:%M:%S"))

    # NOTE(review): the positional 0.4 is an opaque tuning value of
    # slice_inference (presumably the noise scale) - confirm against Svc.
    _audio = model.slice_inference(out_wav_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, 0.4, f0_predictor=f0_predictor, clip_seconds=clip_seconds, loudness_envelope_adjustment=0)

    out_wav_path1 = 'output_' + f'{sid}_{vc_transform}.{output_format}'
    soundfile.write(out_wav_path1, _audio, 44100, format=output_format)
    used_time = round(time.time() - start_time, 2)
    out_str = "Success! total use time:{}s".format(used_time)
    return out_str, out_wav_path1
def change_audio(audio, vc):
    """Pass both arguments through unchanged as an ``(audio, vc)`` pair."""
    return audio, vc
def loadmodel(model_):
    """Replace the global ``model`` with the checkpoint at ``model_``.

    The config is looked up as ``configs/<name>.json`` and the optional cluster
    model as ``./kmeans/<name>.pt``, where ``<name>`` is the checkpoint's file
    name without extension. The cluster model is only passed to ``Svc`` when
    that file exists.

    Returns:
        The dropdown update produced by ``update_dropdown`` for the new model.
    """
    global model
    stem, _ext = os.path.splitext(os.path.basename(model_))
    config_path = "configs/" + stem + ".json"
    cluster_path = "./kmeans/" + stem + ".pt"

    # Unload the previous model before constructing its replacement.
    model.unload_model()
    if os.path.exists(cluster_path):
        model = Svc(model_, config_path, cluster_model_path=cluster_path)
    else:
        model = Svc(model_, config_path)

    speakers = list(model.spk2id.keys())
    print(model_, config_path, cluster_path)
    return update_dropdown(speakers)
def update_dropdown(new_choices):
    """Build a dropdown update listing the speakers of the current global ``model``.

    ``new_choices`` is accepted for the callback signature but not used: the
    choices always come from ``model.spk2id``.
    """
    speaker_names = list(model.spk2id.keys())
    return gr.Dropdown.update(choices=speaker_names)
# Placeholder; rebound to the speaker dropdown when the UI is built.
sid = ""
import pyzipper

# The archive password comes from the TOKEN1 environment variable. Fail with a
# clear message when it is missing instead of an AttributeError on None.
_token = os.getenv("TOKEN1")
if _token is None:
    raise RuntimeError("Environment variable TOKEN1 is not set; it is required to decrypt the model archives.")
hf_token1 = _token.encode("utf-8")

# Extract both AES-encrypted archives into the working directory.
for _archive in ('./N.zip', './N_2.zip'):
    with pyzipper.AESZipFile(_archive) as zf:
        zf.pwd = hf_token1
        zf.extractall()

# Default model loaded at start-up.
model = Svc("./N/44.pth", "configs/44.json", cluster_model_path="./kmeans/44.pt")

# Every file found under ./N/ is offered as a selectable model path.
modelPaths = []
for dirpath, _dirnames, filenames in os.walk("./N/"):
    modelPaths.extend(os.path.join(dirpath, filename) for filename in filenames)
# Gradio UI definition and launch.
# NOTE(review): the source this was recovered from had lost all indentation, so
# the nesting of tabs/rows below is a reconstruction - confirm the layout
# against the running app before relying on it.
app = gr.Blocks(theme='NoCrypt/miku')
with app:
    with gr.Tabs():
        with gr.TabItem(" "):
            #gr.Markdown(
            #'<div align="center">'
            #f'<img style="width:auto;height:300px;" src="cover.png">'
            #'</div>')
            # Introductory Markdown text, stored base64-encoded.
            gr.Markdown(value=base64.b64decode( b'ICAgICAgICAgICAgICAgICAgICAjIOWJjeiogAogICAgICAgICAgICAgICAgICAgICogKOWIneasoeS9v+eUqOi+g+aFou+8jOe9kee7nOi+g+WlveaDheWGteS4i+Wkp+amgjIwc+WkhOeQhjEwc+mfs+mikeOAguWrjOaFouivt+S9v+eUqOacrOWcsOaOqOeQhinvvIzlu7rorq7kvb/nlKjku6PnkIYu5pu05paw5LqOMjMtMTEtMjcsU292aXRz5pWw5o2u6ZuG6KaB5rGCMjDliIbpkp/ku6XkuIrpn7PoibLkuIDoh7TmgKfvvIzlrp7pmYXmlLbpm4blvojpmr7vvIzpg6jliIbmiJDlkZjnlLHkuI3lkIzlo7Dnur/mt7flkIjlh7rlrozlhajkuI3lg4/nmoTpn7PoibLjgILku4XkvpvkuKrkurrlqLHkuZDlkozpnZ7llYbkuJrnlKjpgJTvvIznpoHmraLnlKjkuo7ooYDohaXjgIHmmrTlipvjgIHmgKfnm7jlhbPjgIHmlL/msrvnm7jlhbPlhoXlrrk=').decode())
            with gr.Tabs():
                # Input tab 1: upload a single audio file (passed on as a file path).
                with gr.TabItem("单个音频上传"):
                    vc_input3 = gr.Audio(label="上传音频<280s无BGM无和声的干声", type="filepath" ,source="upload",value="examples/1.mp3")
                # Input tab 2: text-to-speech controls, consumed by tts_fn.
                with gr.TabItem("文字转语音(实验性)"):
                    gr.Markdown("文字转语音(TTS)说明:使用edge_tts服务生成音频,并转换为So-VITS模型音色。")
                    # auto_f0 is passed to both vc_fn and tts_fn (see the click wiring below).
                    auto_f0 = gr.Checkbox(label="自动f0预测,配合聚类模型f0预测效果更好(仅限转换语音,歌声不要勾选此项会究极跑调)", value=False)
                    with gr.Row():
                        text_input = gr.Textbox(label = "在此输入需要转译的文字(建议打开自动f0预测)限定200字以内,可尝试不同f0预测器")
                    with gr.Row():
                        tts_gender = gr.Radio(label = "说话人性别", choices = ["男","女"], value = "女")
                        tts_lang = gr.Dropdown(label = "选择语言,Auto为根据输入文字自动识别", choices=SUPPORTED_LANGUAGES, value = "Auto")
                    with gr.Row():
                        tts_rate = gr.Slider(label = "TTS语音变速(倍速相对值)", minimum = -1, maximum = 3, value = 0, step = 0.1)
                        tts_volume = gr.Slider(label = "TTS语音音量(相对值)", minimum = -1, maximum = 1.5, value = 0, step = 0.1)
                    vc_tts_submit = gr.Button("文本转语音", variant="primary")
                # Tab 3: usage instructions, stored as base64-encoded Markdown.
                with gr.TabItem("使用方法说明:"):
                    gr.Markdown(value=base64.b64decode( b'5pys5Zyw5L2/55So5pa55rOVOgpTb3ZpdHPlj6rpgILlkIjllLHmrYzvvIzlpoLmnpznlKjkuo5UVFPvvIjor7flsJ3or5VHUFRzb3ZpdHPvvInjgII0NHYx5qih5Z6L6Z+z6LSo5p6B5L2z77yM6Iez5bCRMjAr5oiQ5ZGY57uP6L+H6LCD5pWZ77yM55u45Ly85bqm5L+d6K+BCuacrOWcsOaOqOeQhuaooeWei+S4i+i9veW3suW8gOaUvu+8muWOu1RH6aKR6YGT5LiL6L29OltodHRwczovL3QubWUvK0pPQTNLZklTOUZSbVpXWmxdKGh0dHBzOi8vdC5tZS8rSk9BM0tmSVM5RlJtWldabCkK5o+Q5L6b5LqG5LiA5Lqb5pys5qih5Z6L55qE5q2M5puy5pWI5p6c5ryU56S677yM5Lmf5Y+R5YiwVEfpopHpgZPkuobvvIzlpoLmnpzml6Dms5XmlLbliLB0Z+efreS/oe+8jOivt+ivleivlXRlbGVncmFtIFggYXBwCgrliLbkvZzlgbblg4/llLHmrYzlhajmtYHnqIvvvIxi56uZ5pCc57SiIFNvdml0cyDmnInorrjlpJrmlZnnqIvvvIzot5/nnYDmk43kvZzljbPlj68KCuaIkeeahOWIhuemu+W5suWjsOaWueazlee7j+mqjO+8mgrmnIDmlrDov57mi5vvvIjlubLlo7DvvInvvJoKMS4gLuWIhuemu+S6uuWjsOOAkOWPr+ebtOaOpemAiTPmiJY077yM5L2G5Y+v6IO95o2f5Lyk5Lq65aOw44CR77yaCiDCoCDCoFVWUi1NRFgyM0MtSW5zdFZvYyBIUSAo6ZyA5YaN6LeRM19IUC1Wb2NhbC1VVlIpCiDCoCDCoERlbXVjc1YzIOaIliBWNCAgKOmcgOWGjei3kTNfSFAtVm9jYWwtVVZSKQogwqAgwqAzX0hQLVZvY2FsLVVWUgogwqAgwqBVVlItTURYLU5FVCBNYWluCgoyLiDlpoLmnpzmnInliJnljrvlkozlo7DjgJAz6YCJMe+8jOWTquS4quaViOaenOWlveWwseeUqOWTquS4quOAke+8mgogwqAgwqA1X0hQX0thcmFva2UtVVZSCiDCoCDCoFVWUi1CVkUtNEJfU04tNDQxMDAtMemAiUluc3RydW1lbnRhbCBPbmx5wqAgwqAKIMKgIMKgNl9IUF9LYXJhb2tlLVVWUgogwqAgwqBVVlItTURYLU5FVCBLYXJhb2tlKOi9u+W6puWOu+mZpCzpnIDlpJrmrKEpCgozLiDlpoLmnpzmnInliJnljrvmt7flk43jgJAy6YCJMe+8jOagueaNrua3t+WTjeeahOeoi+W6pumAieaLqeOAke+8mgogwqAgwqBVVlItRGUtRWNoby1Ob3JtYWzpgIlObyBFY2hvIE9ubHnvvIjovbvluqbmt7flk43vvIkKIMKgIMKgVVZSLURlLUVjaG8tQWdncmVzc2l2ZemAiU5vIEVjaG8gT25see+8iOmHjeW6pua3t+WTje+8iQoK5LiN5o6o6I2Q55So5LuY6LS56L2v5Lu244CCdXZyNeWMheWQq+W4gumdouS4iuacgOW8uuW8gOa6kOaooeWei++8jOi9r+S7tuWujOWFqOWFjei0ue+8ge+8ge+8ge+8ge+8mmh0dHBzOi8vdWx0aW1hdGV2b2NhbHJlbW92ZXIuY29tLwrnoa7kv51VVlLmmK/mnIDmlrDniYjmnKzvvJo1LjYuMC7lpoLmnpxVVlLph4zpnaLmsqHkuIrov7DmqKHlnovvvIzngrnlsI/mibPmiYvvvIzljrtEb3dubG9hZCBDZW50ZXLph4zpnaLkuIvovb3mqKHlnovvvIjor7foh6rlpIfmoq/lrZDvvIzlkKbliJnkvJrkuIvovb3lpLHotKXvvIk=')
                    .decode())
            # Speaker selection, populated from the model loaded at start-up.
            spks = list(model.spk2id.keys())
            sid = gr.Dropdown(label="音色(目前有58个)", choices=spks, value="HOSHINO_MINAMI")
            # Whenever the selection changes, the choices are rebuilt from the current model.
            sid.change(fn=update_dropdown,inputs=[sid],outputs=[sid])
            # NOTE(review): the return value is discarded here, so this call
            # probably has no effect on the component - confirm against the
            # Gradio version in use.
            sid.update(interactive=True)
            # Model switcher: "load model" calls loadmodel and refreshes the speaker dropdown.
            with gr.Accordion(label="↓切换模型(默认44v1,音色具有抽奖性质,可尝试切换。也有特化的个人模型可选择)", open=False):
                gr.Markdown(value="特殊说明:IKEDA_TERESA个人模型无聚类模型。44位成员的模型是9月份制作的,音质最好。")
                # Raises IndexError at import time if no file was found under ./N/.
                modelstrs = gr.Dropdown(label = "模型", choices = modelPaths, value = modelPaths[0], type = "value")
                btnMod = gr.Button("载入模型")
                btnMod.click(loadmodel, inputs=[modelstrs], outputs = [sid])
            # Conversion parameters shared by the audio and TTS handlers.
            with gr.Row():
                slice_db = gr.Slider(label="切片阈值(较嘈杂时-30,保留呼吸声时-50)",maximum=-30, minimum=-70, step=1, value=-40)
                vc_transform = gr.Slider(label="变调(整数,可以正负,半音数量,升高八度就是12)",maximum=16, minimum=-16, step=1, value=0)
                f0_predictor = gr.Radio(label="f0预测器(如遇哑音可以尝试更换f0)凭干声干净程度选择。只推荐fcpe音色最像或rmvpe音最准", choices=["pm","dio","harvest","fcpe","rmvpe"], value="fcpe")
            with gr.Row():
                cluster_ratio = gr.Slider(label="聚类模型混合比例,0-1之间,默认为0不启用聚类,能提升音色相似度,但会导致咬字下降(如果使用建议0.5左右)",maximum=1, minimum=0, step=0.1, value=0)
                output_format = gr.Radio(label="音频输出格式(MP3会导致时间轴多27ms,需合成请选flac)", choices=["flac", "mp3"], value = "mp3")  # format
            vc_submit = gr.Button("音频转换", variant="primary")
            # Both handlers write their status text and result audio to these two outputs.
            vc_output1 = gr.Textbox(label="音高平均偏差半音数量,体现转换音频的跑调情况(一般小于0.5)")
            vc_output2 = gr.Audio(label="Output Audio")
            # Input order must match the positional parameters of vc_fn / tts_fn.
            vc_submit.click(vc_fn, [output_format,sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db,f0_predictor], [vc_output1, vc_output2])
            vc_tts_submit.click(tts_fn, [text_input, tts_gender, tts_lang, tts_rate, tts_volume, sid, vc_transform,auto_f0,cluster_ratio, slice_db, f0_predictor], [vc_output1, vc_output2])
# Launched without an auth callback; same_auth is not wired in anywhere in the visible code.
app.launch()