import io
import os
# os.system("wget -P hubert/ https://huggingface.co/spaces/Nogizaka46/Nogizaka46-so/resolve/main/hubert/checkpoint_best_legacy_500.pt")
import gradio as gr
import librosa
import numpy as np
import soundfile
from inference.infer_tool import Svc
import logging
import time

logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
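# Svc wraps the so-vits-svc generator checkpoint and its config; the optional
# k-means cluster model is blended in at inference time via cluster_ratio.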
model = Svc("logs/44k/@github-NGZ-sovits-4.pth", "configs/config-65.json", cluster_model_path="logs/44k/kmeans_10000.pt")
# model = Svc("logs/44k/@github-NGZ-sovits-4.pth", "configs/config.json")
from matplotlib import pyplot as plt
def f0_to_pitch(ff):
    # Convert frequency (Hz) to a MIDI-like pitch number (12 semitones per octave).
    f0_pitch = 69 + 12 * np.log2(ff / 160)
    return f0_pitch
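# Worked example (for reference): the 160 Hz reference only shifts the scale and
# cancels when two pitches are subtracted, so differences are true semitone counts:
#   f0_to_pitch(320) - f0_to_pitch(160) == 12.0   (one octave = 12 semitones)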
def compute_f0(wav_file1, wav_file2, tran):
    y1, sr1 = librosa.load(wav_file1, sr=16000)
    y2, sr2 = librosa.load(wav_file2, sr=16000)
    # Compute the f0 of both files with the YIN pitch estimator. Pass the actual
    # sample rate: otherwise librosa assumes 22050 Hz and the estimates are scaled.
    f0_1 = librosa.yin(y1, fmin=70, fmax=600, sr=sr1)
    f0_2 = librosa.yin(y2, fmin=70, fmax=600, sr=sr2)
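    # With librosa's defaults (frame_length=2048, hop = frame_length // 4 = 512),
    # yin yields one f0 estimate per 512 samples at 16 kHz, i.e. 16000 / 512 =
    # 31.25 f0 frames per second; this is the x-axis unit of the plot in vc_fn.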
    # Semitone deviation between the input and the (pitch-shifted) output.
    sum_y = []
    if np.sum(y1 == 0) / len(y1) > 0.9:
        # Input is almost entirely silence: skip the comparison.
        mistake, var_take = 0, 0
    else:
        for i in range(min(len(f0_1), len(f0_2))):
            if f0_1[i] > 0 and f0_2[i] > 0:
                sum_y.append(
                    abs(f0_to_pitch(f0_2[i]) - (f0_to_pitch(f0_1[i]) + tran)))
        num_y = 0
        for x in sum_y:
            num_y += x
        len_y = len(sum_y) if len(sum_y) else 1
        mistake = round(float(num_y / len_y), 2)
        # np.std with ddof=1 needs at least two samples, otherwise it returns NaN.
        var_take = round(float(np.std(sum_y, ddof=1)), 2) if len(sum_y) > 1 else 0
    print("mistake", mistake, var_take)
    return f0_1, f0_2, sr1, sr2, round(mistake / 10, 2), round(var_take / 10, 2)
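# Usage sketch (illustrative paths): compare a source take against its conversion,
# shifted up 2 semitones:
#   f0_in, f0_out, _, _, mean_dev, dev_std = compute_f0("in.wav", "out.wav", 2)
# mean_dev / dev_std are the mean and std of the per-frame semitone error, scaled by 1/10.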
def vc_fn(sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, F0_mean_pooling):
    # cluster_ratio = 0
    start_time = time.time()
    if input_audio is None:
        return "You need to upload an audio", None, None
    sampling_rate, audio = input_audio
    duration = audio.shape[0] / sampling_rate
    if duration > 70:
        return "Please upload audio shorter than 70 s; convert long audio locally", None, None
    # Convert integer PCM to float32 in [-1, 1], downmix to mono, resample to 16 kHz.
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
    # print(audio.shape)
    out_wav_path = "temp.wav"
    soundfile.write(out_wav_path, audio, 16000, format="wav")
    # print(slice_db, cluster_ratio, auto_f0, noise_scale, sid)
    print(out_wav_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale, F0_mean_pooling)
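    # slice_inference (from so-vits-svc's infer_tool, imported above) roughly:
    # split the input on silence below slice_db, convert each chunk to speaker
    # `sid` with the given pitch shift and cluster ratio, and concatenate the
    # results; 44100 below matches the 44k model config used by this checkpoint.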
    _audio = model.slice_inference(out_wav_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale, F0_mean_pooling=F0_mean_pooling)
    soundfile.write("output.wav", _audio, 44100, format="wav")
    f01, f02, sr1, sr2, mistake, var = compute_f0('temp.wav', 'output.wav', vc_transform)
    # Plot input vs. output f0 over the frame index, relabeling the x axis in seconds.
    time_step_1 = np.arange(0, len(f01))
    time_step_2 = np.arange(0, len(f02))
    plt.figure(figsize=[8, 3])
    plt.plot(time_step_1, f01, label='Input')
    plt.plot(time_step_2, f02, label='Output')
    # plt.title("F0 of Input and Output")
    # plt.ylabel("F0")
    # plt.xlabel("Time step")
    tick_labels = np.round(np.linspace(0, duration, 10), 1)
    plt.xticks(np.linspace(0, len(f01), len(tick_labels)), tick_labels)
    plt.legend()
    plt.savefig('temp.svg')
    plt.close()
    used_time = round(time.time() - start_time, 2)
    out_str = "Success! Total time used: {}s\nSemitone deviation (mean): {}\nSemitone deviation (std): {}".format(
        used_time, mistake, var)
    return out_str, (44100, _audio), gr.Image.update("temp.svg")
app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
gr.Markdown(value=""" | |
# 前言 | |
* 此模型为sovits4.0原版(抗混响强),如果音色不像可以试试另一个模型:[https://huggingface.co/spaces/Nogizaka46/Nogizaka46-so-dev](https://huggingface.co/spaces/Nogizaka46/Nogizaka46-so-dev) | |
* 23-05-29修复池化功能,有bug记得反馈下。模型更新日期23-04-26.新模型使用65小时语音训练63位成员。仅供个人娱乐和非商业用途,禁止用于血腥、暴力、性相关、政治相关内容,转换长音频请本地进行转换 | |
* 扒干声教程:[BV1sb411o7xF](https://www.bilibili.com/video/BV1sb411o7xF) [cv23095265](https://www.bilibili.com/read/cv23095265) b站传播的Ai翻唱大多数是他人翻唱或原曲混响和声少的,不是所有歌都能扒干净的,如果声音不像都是因为混响与和声扒不干净,结合自己的时间学会放弃。更多相关教程,翻唱,本地整合包在Tg群:[t.me/+vP8NK1NMLiYzMDJl](https://t.me/+vP8NK1NMLiYzMDJl) | |
* [Ripx,Au,UVR工具下载](https://pan.baidu.com/s/1Ne55iKqoacjKE-moK_YtGg?pwd=qsfd) 总有问制作流程,这说一下。。以冬之花为例,1.用UVR-4_HP-Vocal模型提取人声 或 vocalremover.org(这个网站处理不会损伤人声,方便二次处理,推荐),UVR-5_HP-Karaoke去除和声,2.合成,对比干声听听有几处哑音 如果有,使用RipX去除干声里造成哑音的和声 4.合成再听听,再不行就使用池化 5.使用Au调音,按喜好,添加混响,和声,回声等,这步可以增强音色,效果是很明显的。通过冬之花的练习,你已经具备处理干声的能力,轻松一天量产10首。 | |
# 声明 | |
* 如用此模型制作音频请标注来源:github.com/3110asuka/Nogizaka46-so 或 huggingface.co/spaces/Nogizaka46/Nogizaka46-so""") | |
gr.Markdown(value="""秋元真夏 AKIMOTO_MANATSU| 生田絵梨花 IKUTA_ERIKA| 生駒里奈 IKOMA_RINA| 伊藤純奈 ITO_JUNNA| 井上小百合 INOUE_SAYURI| 衛藤美彩 ETO_MISA| 川後陽菜 KAWAGO_HINA|北野日奈子 KITANO_HINAKO|齋藤飛鳥 SAITO_ASUKA|斉藤優里 SATO_YUURI|相楽伊織 SAGARA_IORI|桜井玲香 SAKURAI_REIKA|佐々木琴子 SASAKI_KOTOKO|白石麻衣 SHIRAISHI_MAI|新内眞衣 SHINUCHI_MAI|鈴木絢音 SUZUKI_AYANE|高山一実 TAKAYAMA_KAZUMI|寺田蘭世 TERADA_RANZE|西野七瀬 NISHINO_NANASE|能條愛未 NOUJO_AMI|樋口日奈 HIGUCHI_HINA|星野みなみ HOSHINO_MINAMI|堀未央奈 HORI_MIONA|松村沙友理 MATSUMURA_SAYURI|山崎怜奈 YAMAZAKI_RENA|若月佑美 WAKATSUKI_YUMI|渡辺みり愛 WATANABE_MIRIA|和田まあや WADA_MAAYA|伊藤理々杏 ITO_RIRIA|岩本蓮加 IWAMOTO_RENKA|梅澤美波 UMEZAWA_MINAMI|大園桃子 OZONO_MOMOKO|久保史緒里 KUBO_SHIORI|阪口珠美 SAKAGUCHI_TAMAMI|佐藤楓 SATO_KAEDE|中村麗乃 NAKAMURA_RENO|向井葉月 MUKAI_HAZUKI|山下美月 YAMASHITA_MIZUKI|与田祐希 YODA_YUUKI|遠藤さくら ENDO_SAKURA|賀喜遥香 KAKI_HARUKA|掛橋沙耶香 KAKEHASHI_SAYAKA|金川紗耶 KANAGAWA_SAYA|北川悠理 KITAGAWA_YURI|柴田柚菜 SHIBATA_YUNA|田村真佑 TAMURA_MAYU|筒井あやめ TSUTSUI_AYAME|早川聖来 HAYAKAWA_SEIRA|矢久保美緒 YAKUBO_MIO|黒見明香 HARUKA_KUROMI|佐藤璃果 RIKA_SATO|林瑠奈 RUNA_HAYASHI|松尾美佑 MIYU_MATSUO|弓木奈於 NAO_YUMIKI|五百城茉央 IOKI_MAO|池田瑛紗 IKEDA_TERESA|一ノ瀬美空 ICHINOSE_MIKU|井上和 INOUE_NAGI|小川彩 OGAWA_AYA|奥田いろは OKUDA_IROHA|川﨑桜 KAWASAKI_SAKURA|菅原咲月 SUGAWARA_SATSUKI|冨里奈央 TOMISATO_NAO|中西アルノ NAKANISHI_ARUNO""") | |
            spks = list(model.spk2id.keys())
            sid = gr.Dropdown(label="Voice", choices=spks, value="IKUTA_ERIKA")
            vc_input3 = gr.Audio(label="Upload audio: a dry vocal < 70 s, no BGM, no reverb", value="没什么「你的名字。」干声素材12s.mp3")
            # vc_transform = gr.Number(label="Pitch shift (integer semitones, positive or negative; +12 = one octave up), usually within +/-6", value=0)
            vc_transform = gr.Slider(label="Pitch shift (integer semitones, positive or negative; +12 = one octave up), usually within +/-6",
                                     maximum=16, minimum=-16, step=1, value=0)
            cluster_ratio = gr.Number(label="Cluster model mix ratio, 0-1; 0 (default) disables clustering. Raises timbre similarity but hurts articulation (about 0.5 recommended if used)", value=0)
            auto_f0 = gr.Checkbox(label="Automatic f0 prediction; works better together with the cluster model, and disables pitch shifting (speech conversion only; do not check for singing or it will go wildly off-key)", value=False)
            slice_db = gr.Slider(label="Slicing threshold (-30 for noisier audio, -50 to keep breaths, default -40)",
                                 maximum=-30, minimum=-70, step=1, value=-40)
            noise_scale = gr.Number(label="noise_scale: best left untouched; affects audio quality (a black-box parameter)", value=0.4)
            F0_mean_pooling = gr.Checkbox(label="Apply a mean filter (pooling) to F0; improves some muted notes (not those caused by harmony or reverb). Note: slows inference down. Off by default", value=False)
            vc_submit = gr.Button("Convert", variant="primary")
            vc_output1 = gr.Textbox(label="Mean pitch deviation in semitones; shows how off-key the converted audio is (normally < 0.5)")
            vc_output2 = gr.Audio(label="Output Audio")
            f0_image = gr.Image(label="f0 curves: blue = input pitch, orange = synthesized pitch (the plot code has some error)")
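            # The lists below must line up positionally: inputs in the order of
            # vc_fn's parameters, outputs in the order of its returned tuple
            # (status text, converted audio, f0 plot).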
            vc_submit.click(vc_fn, [sid, vc_input3, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, F0_mean_pooling],
                            [vc_output1, vc_output2, f0_image])
app.launch()