import io
import os
# os.system("wget -P hubert/ https://huggingface.co/spaces/Nogizaka46/Nogizaka46-so/resolve/main/hubert/checkpoint_best_legacy_500.pt")
import logging
import time

import gradio as gr
import librosa
import numpy as np
import soundfile
from matplotlib import pyplot as plt

from inference.infer_tool import Svc

logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)

model = Svc("logs/44k/@github-NGZ-sovits-4.pth", "configs/config-65.json",
            cluster_model_path="logs/44k/kmeans_10000.pt")
# model = Svc("logs/44k/@github-NGZ-sovits-4.pth", "configs/config.json")


def f0_to_pitch(ff):
    # Map frequency (Hz) to a MIDI-style pitch number. The 160 Hz reference only
    # shifts the scale, so pitch *differences* are true semitones.
    f0_pitch = 69 + 12 * np.log2(ff / 160)
    return f0_pitch


def compute_f0(wav_file1, wav_file2, tran):
    y1, sr1 = librosa.load(wav_file1, sr=16000)
    y2, sr2 = librosa.load(wav_file2, sr=16000)

    # Estimate f0 with the YIN pitch tracker.
    f0_1 = librosa.core.yin(y1, fmin=70, fmax=600)
    f0_2 = librosa.core.yin(y2, fmin=70, fmax=600)

    # Per-frame semitone deviation of the output from the (transposed) input.
    sum_y = []
    # Guard on the loaded samples (the original tested the path string, which
    # always fell through to the else branch).
    if np.sum(y1 == 0) / len(y1) > 0.9:
        # Input is almost entirely silence; skip the comparison.
        mistake, var_take = 0, 0
    else:
        for i in range(min(len(f0_1), len(f0_2))):
            if f0_1[i] > 0 and f0_2[i] > 0:
                sum_y.append(
                    abs(f0_to_pitch(f0_2[i]) - (f0_to_pitch(f0_1[i]) + tran)))
        num_y = 0
        for x in sum_y:
            num_y += x
        len_y = len(sum_y) if len(sum_y) else 1
        mistake = round(float(num_y / len_y), 2)
        # Sample standard deviation (ddof=1) needs at least two voiced frames.
        var_take = round(float(np.std(sum_y, ddof=1)), 2) if len(sum_y) > 1 else 0
    print("mistake", mistake, var_take)
    return f0_1, f0_2, sr1, sr2, round(mistake / 10, 2), round(var_take / 10, 2)
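# Worked numeric example (comment only, nothing here is executed): differences
# of f0_to_pitch values are semitones, so the 160 Hz reference cancels out.
# With tran = 0, an input frame at 220 Hz and an output frame at
# 220 * 2**(1/12) ≈ 233.08 Hz give
# abs(f0_to_pitch(233.08) - f0_to_pitch(220)) ≈ 1.0, i.e. one semitone sharp
# on that frame; `mistake` is the mean of these per-frame errors over voiced
# frames, and `var_take` their sample standard deviation.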
def vc_fn(sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db,
          noise_scale, F0_mean_pooling):
    # cluster_ratio = 0
    start_time = time.time()
    if input_audio is None:
        # Three outputs are wired to this function, so return three values.
        return "You need to upload an audio", None, None
    sampling_rate, audio = input_audio
    duration = audio.shape[0] / sampling_rate
    if duration > 70:
        return "Please upload audio shorter than 70 s; convert longer audio locally.", None, None
    # Normalize integer PCM to float32 in [-1, 1], downmix, resample to 16 kHz.
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
    # print(audio.shape)
    out_wav_path = "temp.wav"
    soundfile.write(out_wav_path, audio, 16000, format="wav")
    # print(slice_db, cluster_ratio, auto_f0, noise_scale, sid)
    print(out_wav_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0,
          noise_scale, F0_mean_pooling)
    _audio = model.slice_inference(out_wav_path, sid, vc_transform, slice_db,
                                   cluster_ratio, auto_f0, noise_scale,
                                   F0_mean_pooling=F0_mean_pooling)
    soundfile.write("output.wav", _audio, 44100, format="wav")

    # Plot the input and output f0 curves for the UI.
    f01, f02, sr1, sr2, mistake, var = compute_f0('temp.wav', 'output.wav', vc_transform)
    time_step_1 = np.arange(0, len(f01))
    time_step_2 = np.arange(0, len(f02))
    plt.figure(figsize=[8, 3])
    plt.plot(time_step_1, f01, label='Input')
    plt.plot(time_step_2, f02, label='Output')
    # plt.title("T0 of Input and Output")
    # plt.ylabel("T0")
    # plt.xlabel("Time step")
    length = np.arange(0, int(duration * 10), int(duration))
    plt.xticks(np.linspace(0, len(f01), len(length)), length)
    plt.legend()
    plt.savefig('temp.svg')
    plt.close()

    used_time = round(time.time() - start_time, 2)
    out_str = ("Success! total use time: {}s\nmean semitone deviation: {}\n"
               "semitone std: {}".format(used_time, mistake, var))
    return out_str, (44100, _audio), gr.Image.update(value="temp.svg")


app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            gr.Markdown(value="""
                # Preface
                * This is the original so-vits 4.0 model (robust to reverb). If the timbre is off, try the other model: [https://huggingface.co/spaces/Nogizaka46/Nogizaka46-so-dev](https://huggingface.co/spaces/Nogizaka46/Nogizaka46-so-dev)
                * 2023-05-29: the mean-pooling feature was fixed; please report any remaining bugs. Model last updated 2023-04-26: the new model was trained on 65 hours of speech from 63 members. For personal entertainment and non-commercial use only; gory, violent, sexual, or political content is forbidden. Convert long audio locally.
                * Dry-vocal extraction tutorials: [BV1sb411o7xF](https://www.bilibili.com/video/BV1sb411o7xF) [cv23095265](https://www.bilibili.com/read/cv23095265). Most AI covers circulating on bilibili start from other people's covers, or from originals with little reverb and few harmonies; not every song can be cleaned up. If the voice is off, it is because the reverb and harmonies could not be removed cleanly; weigh your time and learn to let go. More tutorials, covers, and the local all-in-one package are in the Telegram group: [t.me/+vP8NK1NMLiYzMDJl](https://t.me/+vP8NK1NMLiYzMDJl)
                * [RipX / Audition / UVR downloads](https://pan.baidu.com/s/1Ne55iKqoacjKE-moK_YtGg?pwd=qsfd) Since the workflow comes up constantly, here it is, using 冬之花 as the example: 1. extract vocals with UVR's 4_HP-Vocal model or with vocalremover.org (the site does not damage the vocal, which makes further processing easier; recommended), then remove harmonies with UVR's 5_HP-Karaoke; 2. convert, then compare against the dry vocal and listen for muted spots; 3. if there are any, use RipX to remove the harmonies in the dry vocal that cause them; 4. convert again and listen; if it still fails, enable mean pooling; 5. tune in Audition to taste, adding reverb, harmonies, echo, etc. - this step can noticeably enhance the timbre. After practicing on 冬之花 you will be able to process dry vocals yourself and easily turn out ten songs a day.
                # Notice
                * If you publish audio made with this model, credit the source: github.com/3110asuka/Nogizaka46-so or huggingface.co/spaces/Nogizaka46/Nogizaka46-so""")
            gr.Markdown(value="""秋元真夏 AKIMOTO_MANATSU| 生田絵梨花 IKUTA_ERIKA| 生駒里奈 IKOMA_RINA| 伊藤純奈 ITO_JUNNA| 井上小百合 INOUE_SAYURI| 衛藤美彩 ETO_MISA| 川後陽菜 KAWAGO_HINA|北野日奈子 KITANO_HINAKO|齋藤飛鳥 SAITO_ASUKA|斉藤優里 SATO_YUURI|相楽伊織 SAGARA_IORI|桜井玲香 SAKURAI_REIKA|佐々木琴子 SASAKI_KOTOKO|白石麻衣 SHIRAISHI_MAI|新内眞衣 SHINUCHI_MAI|鈴木絢音 SUZUKI_AYANE|高山一実 TAKAYAMA_KAZUMI|寺田蘭世 TERADA_RANZE|西野七瀬 NISHINO_NANASE|能條愛未 NOUJO_AMI|樋口日奈 HIGUCHI_HINA|星野みなみ HOSHINO_MINAMI|堀未央奈 HORI_MIONA|松村沙友理 MATSUMURA_SAYURI|山崎怜奈 YAMAZAKI_RENA|若月佑美 WAKATSUKI_YUMI|渡辺みり愛 WATANABE_MIRIA|和田まあや WADA_MAAYA|伊藤理々杏 ITO_RIRIA|岩本蓮加 IWAMOTO_RENKA|梅澤美波 UMEZAWA_MINAMI|大園桃子 OZONO_MOMOKO|久保史緒里 KUBO_SHIORI|阪口珠美 SAKAGUCHI_TAMAMI|佐藤楓 SATO_KAEDE|中村麗乃 NAKAMURA_RENO|向井葉月 MUKAI_HAZUKI|山下美月 YAMASHITA_MIZUKI|与田祐希 YODA_YUUKI|遠藤さくら ENDO_SAKURA|賀喜遥香 KAKI_HARUKA|掛橋沙耶香 KAKEHASHI_SAYAKA|金川紗耶 KANAGAWA_SAYA|北川悠理 KITAGAWA_YURI|柴田柚菜 SHIBATA_YUNA|田村真佑 TAMURA_MAYU|筒井あやめ TSUTSUI_AYAME|早川聖来 HAYAKAWA_SEIRA|矢久保美緒 YAKUBO_MIO|黒見明香 HARUKA_KUROMI|佐藤璃果 RIKA_SATO|林瑠奈 RUNA_HAYASHI|松尾美佑 MIYU_MATSUO|弓木奈於 NAO_YUMIKI|五百城茉央 IOKI_MAO|池田瑛紗 IKEDA_TERESA|一ノ瀬美空 ICHINOSE_MIKU|井上和 INOUE_NAGI|小川彩 OGAWA_AYA|奥田いろは OKUDA_IROHA|川﨑桜 KAWASAKI_SAKURA|菅原咲月 SUGAWARA_SATSUKI|冨里奈央 TOMISATO_NAO|中西アルノ NAKANISHI_ARUNO""")
            spks = list(model.spk2id.keys())
            sid = gr.Dropdown(label="Speaker", choices=spks, value="IKUTA_ERIKA")
            vc_input3 = gr.Audio(label="Upload dry vocals: under 70 s, no BGM, no reverb", value="没什么「你的名字。」干声素材12s.mp3")
            # vc_transform = gr.Number(label="Transpose (integer semitones, positive or negative; +12 is one octave up; usually within ±6)", value=0)
            vc_transform = gr.Slider(label="Transpose (integer semitones, positive or negative; +12 is one octave up; usually within ±6)", maximum=16, minimum=-16, step=1, value=0)
            cluster_ratio = gr.Number(label="Cluster-model mix ratio, 0-1; the default 0 disables clustering. Raises timbre similarity but blurs articulation (about 0.5 suggested if used)", value=0)
            auto_f0 = gr.Checkbox(label="Automatic f0 prediction (works better with the cluster model); disables transposition. Speech only - do not check this for singing or the pitch will drift wildly", value=False)
            slice_db = gr.Slider(label="Slicing threshold (-30 for noisy audio, -50 to keep breath sounds, -40 by default)", maximum=-30, minimum=-70, step=1, value=-40)
            noise_scale = gr.Number(label="noise_scale - best left alone; affects audio quality unpredictably", value=0.4)
            F0_mean_pooling = gr.Checkbox(label="Apply a mean filter (pooling) to F0; helps with some muted notes (not those caused by harmonies or reverb). Slows inference; off by default", value=False)
            vc_submit = gr.Button("Convert", variant="primary")
            vc_output1 = gr.Textbox(label="Mean pitch deviation in semitones; shows how off-pitch the conversion is (usually under 0.5)")
            vc_output2 = gr.Audio(label="Output Audio")
            f0_image = gr.Image(label="f0 curves: blue is the input pitch, orange the synthesized pitch (the plot has some error)")
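            # The input list passed to .click() below must follow vc_fn's
            # parameter order; the outputs map to its three return values
            # (status text, converted audio, f0 plot).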
            vc_submit.click(vc_fn,
                            [sid, vc_input3, vc_transform, auto_f0, cluster_ratio,
                             slice_db, noise_scale, F0_mean_pooling],
                            [vc_output1, vc_output2, f0_image])

app.launch()
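# Minimal local-use sketch (commented out; assumes the same checkpoints on disk
# and mirrors the slice_inference call in vc_fn with the UI's default parameter
# values; "dry_vocal.wav" is a hypothetical input path). The Space caps uploads
# at 70 s; for longer audio the notes above suggest converting locally, roughly:
#
#     svc = Svc("logs/44k/@github-NGZ-sovits-4.pth", "configs/config-65.json",
#               cluster_model_path="logs/44k/kmeans_10000.pt")
#     wav = svc.slice_inference("dry_vocal.wav", "IKUTA_ERIKA", 0, -40,
#                               0, False, 0.4, F0_mean_pooling=False)
#     soundfile.write("converted.wav", wav, 44100, format="wav")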