"""Gradio demo: an AI singing-voice cloner built on so-vits-svc-fork.

Downloads a pretrained generator from the Hugging Face Hub, separates
vocals from a track with Demucs, and converts them to the target voice.
"""
import json
import os
import subprocess
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
import torch
from demucs.apply import apply_model
from demucs.pretrained import DEFAULT_MODEL, get_model
from huggingface_hub import hf_hub_download, list_repo_files

from so_vits_svc_fork.hparams import HParams
from so_vits_svc_fork.inference.core import Svc

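# --- Configuration ---------------------------------------------------------
# repo_id points at the voice-model repo on the Hugging Face Hub. ckpt_name
# and cluster_model_name are resolved automatically below when left as None.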
repo_id = "kevinwang676/talktalkai-qing"
ckpt_name = None
cluster_model_name = None
default_f0_method = "crepe"
default_cluster_infer_ratio = 0.5
# Cap input length; MAX_DURATION_SECONDS overrides (default: effectively unlimited).
duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))

if ckpt_name is None:
    # Pick the generator checkpoint with the highest training step, e.g. G_30000.pth.
    latest_id = sorted(
        int(Path(x).stem.split("_")[1])
        for x in list_repo_files(repo_id)
        if x.startswith("G_") and x.endswith(".pth")
    )[-1]
    ckpt_name = f"G_{latest_id}.pth"

cluster_model_name = cluster_model_name or "kmeans.pt"
if cluster_model_name in list_repo_files(repo_id):
    print(f"Found cluster model - downloading {cluster_model_name} from {repo_id}")
    cluster_model_path = hf_hub_download(repo_id, cluster_model_name)
else:
    print(f"Could not find {cluster_model_name} in {repo_id}; disabling cluster inference.")
    cluster_model_path = None
# Without a cluster model, force the mix ratio to 0.
default_cluster_infer_ratio = default_cluster_infer_ratio if cluster_model_path else 0

generator_path = hf_hub_download(repo_id, ckpt_name)
config_path = hf_hub_download(repo_id, "config.json")
hparams = HParams(**json.loads(Path(config_path).read_text()))
speakers = list(hparams.spk.keys())
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Svc(
    net_g_path=generator_path,
    config_path=config_path,
    device=device,
    cluster_model_path=cluster_model_path,
)
demucs_model = get_model(DEFAULT_MODEL)


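# Demucs separates a mix into four stems (drums, bass, other, vocals);
# extract_vocal_demucs returns mono vocals plus the summed instrumental.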
def extract_vocal_demucs(model, filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
    """Split an audio file into (vocal, instrumental) waveforms with Demucs."""
    wav, sr = librosa.load(filename, mono=False, sr=sr)
    wav = torch.tensor(wav)
    ref = wav.mean(0)
    # Normalize for separation, then undo the normalization afterwards.
    wav = (wav - ref.mean()) / ref.std()
    sources = apply_model(
        model, wav[None], device=device, shifts=shifts, split=split, overlap=overlap, progress=True, num_workers=jobs
    )[0]
    sources = sources * ref.std() + ref.mean()
    # Vocals are the last source; peak-normalize with 1% headroom to avoid clipping.
    vocal_wav = sources[-1]
    vocal_wav = vocal_wav / max(1.01 * vocal_wav.abs().max(), 1)
    vocal_wav = librosa.to_mono(vocal_wav.numpy())
    # Sum the remaining stems into a stereo (samples, channels) accompaniment.
    instrumental_wav = sources[:-1].sum(0).numpy().T
    return vocal_wav, instrumental_wav


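# Despite the name, yt-dlp also resolves Bilibili URLs, which is what the
# "Upload from a Bilibili video" tab relies on: url_base is blanked whenever
# the caller passes a full URL instead of a YouTube video id.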
def download_youtube_clip(
    video_identifier,
    start_time,
    end_time,
    output_filename,
    num_attempts=5,
    url_base="https://www.youtube.com/watch?v=",
    quiet=False,
    force=False,
):
    """Download [start_time, end_time] of a video as WAV, retrying up to num_attempts times."""
    output_path = Path(output_filename)
    if output_path.exists():
        if not force:
            return output_path
        else:
            output_path.unlink()

    quiet = "--quiet --no-warnings" if quiet else ""
    command = (
        f'yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" '
        f'--download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}"'
    )

    attempts = 0
    while True:
        try:
            subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            attempts += 1
            if attempts == num_attempts:
                return None
        else:
            # Download succeeded; stop retrying.
            break

    return output_path if output_path.exists() else None


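# predict() converts an already-isolated vocal take; it backs the microphone
# and file-upload tabs below.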
def predict(
    speaker,
    audio,
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: str = "crepe",
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
):
    audio, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
    audio = model.infer_silence(
        audio.astype(np.float32),
        speaker=speaker,
        transpose=transpose,
        auto_predict_f0=auto_predict_f0,
        cluster_infer_ratio=cluster_infer_ratio,
        noise_scale=noise_scale,
        f0_method=f0_method,
        db_thresh=db_thresh,
        pad_seconds=pad_seconds,
        chunk_seconds=chunk_seconds,
        absolute_thresh=absolute_thresh,
    )
    return model.target_sample, audio


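# predict_song_from_yt() is the full pipeline: download the clip, split it
# with Demucs, clone the vocals, and remix them over the accompaniment.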
def predict_song_from_yt(
    ytid_or_url,
    start,
    end,
    speaker=speakers[0],
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: str = "dio",
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
):
    end = min(start + duration_limit, end)
    original_track_filepath = download_youtube_clip(
        ytid_or_url,
        start,
        end,
        "track.wav",
        force=True,
        url_base="" if ytid_or_url.startswith("http") else "https://www.youtube.com/watch?v=",
    )
    if original_track_filepath is None:
        # Fail loudly instead of letting librosa choke on a missing file.
        raise ValueError(f"Could not download a clip for {ytid_or_url!r}")
    vox_wav, inst_wav = extract_vocal_demucs(demucs_model, original_track_filepath)
    if transpose != 0:
        # Pitch-shift the accompaniment to match the transposed vocals.
        inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=model.target_sample, n_steps=transpose).T
    cloned_vox = model.infer_silence(
        vox_wav.astype(np.float32),
        speaker=speaker,
        transpose=transpose,
        auto_predict_f0=auto_predict_f0,
        cluster_infer_ratio=cluster_infer_ratio,
        noise_scale=noise_scale,
        f0_method=f0_method,
        db_thresh=db_thresh,
        pad_seconds=pad_seconds,
        chunk_seconds=chunk_seconds,
        absolute_thresh=absolute_thresh,
    )
    # Broadcast the mono cloned vocals across the stereo accompaniment.
    full_song = inst_wav + np.expand_dims(cloned_vox, 1)
    return (model.target_sample, full_song), (model.target_sample, cloned_vox), (model.target_sample, inst_wav)


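# --- Gradio UI: three tabs sharing the same conversion controls ------------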
image_markdown = """
<h1 align="center"><a href="http://www.talktalkai.com"><img src="https://y.qq.com/music/photo_new/T001R300x300M0000025Gr0r2OXvrn_2.jpg" alt="talktalkai" border="0" style="margin: 0 auto; height: 200px;" /></a></h1>
"""

with gr.Blocks() as demo:
    gr.HTML(
        "<center>"
        "<h1>🌊💕🎶 - TalkTalkAI + Music: upload material straight from Bilibili, no background-music separation needed</h1>"
        "</center>"
    )
    with gr.Accordion("📒 About this app (collapsible)", open=True):
        gr.Markdown("## <center>🏞️ - TalkTalkAI provides AI voice services for every scenario (voice mimicry, AI singers, voice conversion, and more)</center>")
        gr.Markdown("### <center>🥳 - TalkTalkAI partner musician: [一清清清](https://space.bilibili.com/22960772?spm_id_from=333.337.0.0); an AI singer that sings what you want!</center>")
        gr.Markdown("### <center>🎡 - Find more at [TalkTalkAI](http://www.talktalkai.com): TalkTalkAI, surging with love! 💕</center>")
        gr.Markdown(
            "<center>💡 - How to use: open the 'Upload from a Bilibili video' tab, fill in the video URL "
            "and the clip's start/end times, then click 'Let the AI singer sing!'. "
            "You can also click the example at the bottom of the page for a quick preview.</center>"
        )
        gr.Markdown(image_markdown)

with gr.Tab("📺 - 从B站视频上传"): |
|
with gr.Row(): |
|
with gr.Column(): |
|
inp1=gr.Textbox( |
|
label="Bilibili网址", info="请填写含有您喜欢歌曲的Bilibili网址,可直接填写相应的BV号", value="https://www.bilibili.com/video/BV..." |
|
) |
|
inp2=gr.Number(value=0, label="起始时间 (秒)") |
|
inp3=gr.Number(value=15, label="结束时间 (秒)") |
|
inp4=gr.Dropdown(speakers, value=speakers[0], label="🎤AI歌手🎶 - 🌟一清清清🌟") |
|
inp5=gr.Slider(-12, 12, value=0, step=1, label="变调 (默认为0;有正负值,+2为升高两个key)") |
|
inp6=gr.Checkbox(False, label="是否开启自动f0预测", info="勾选即为开启;配合聚类模型f0预测效果更好,仅限语音转换时使用", visible=False) |
|
inp7=gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="聚类模型混合比例", info="0-1之间,0即不启用聚类。使用聚类模型能提升音色相似度,但会导致咬字下降") |
|
inp8=gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (建议保持不变)", visible=False) |
|
inp9=gr.Dropdown( |
|
choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"], |
|
value=default_f0_method, |
|
label="模型推理方法 (crepe推理效果最好)", visible=False |
|
) |
|
btn1=gr.Button("让AI歌手开始演唱吧", variant="primary") |
|
with gr.Column(): |
|
out1=gr.Audio(label="AI歌手+伴奏🎶") |
|
out2=gr.Audio(label="人声部分🎤") |
|
out3=gr.Audio(label="伴奏部分🎵") |
|
|
|
btn1.click(fn=predict_song_from_yt, inputs=[inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8, inp9], outputs=[out1, out2, out3]) |
|
|
|
gr.Examples(examples=[["https://www.bilibili.com/video/BV1ip4y1p7Pn", 87, 103, speakers[0], 0, False, default_cluster_infer_ratio, 0.4, default_f0_method]], |
|
inputs=[inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8, inp9], outputs=[out1, out2, out3], fn=predict_song_from_yt, cache_examples=True) |
|
|
|
with gr.Tab("🎙️ - 从麦克风上传"): |
|
with gr.Row(): |
|
with gr.Column(): |
|
inp10=gr.Dropdown(speakers, value=speakers[0], label="🎤AI歌手🎶 - 🌟一清清清🌟") |
|
inp11=gr.Audio(type="filepath", source="microphone", label="请用麦克风上传您想转换的歌曲") |
|
inp12=gr.Slider(-12, 12, value=0, step=1, label="变调 (默认为0;有正负值,+2为升高两个key)") |
|
inp13=gr.Checkbox(False, label="是否开启自动f0预测", info="勾选即为开启;配合聚类模型f0预测效果更好,仅限语音转换时使用", visible=False) |
|
inp14=gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="聚类模型混合比例", info="0-1之间,0即不启用聚类。使用聚类模型能提升音色相似度,但会导致咬字下降 (如果使用,建议0.5左右)") |
|
inp15=gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (建议保持不变)", visible=False) |
|
inp16=gr.Dropdown( |
|
choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"], |
|
value=default_f0_method, |
|
label="模型推理方法 (crepe推理效果最好)", visible=False |
|
) |
|
btn2=gr.Button("让AI歌手开始演唱吧", variant="primary") |
|
with gr.Column(): |
|
out4=gr.Audio(label="AI歌手演唱🎶") |
|
|
|
btn2.click(fn=predict, inputs=[inp10, inp11, inp12, inp13, inp14, inp15, inp16], outputs=[out4]) |
|
|
|
|
|
with gr.Tab("🎵 - 从文件上传"): |
|
with gr.Row(): |
|
with gr.Column(): |
|
inp17=gr.Dropdown(speakers, value=speakers[0], label="🎤AI歌手🎶 - 🌟一清清清🌟") |
|
inp18=gr.Audio(type="filepath", source="upload", label="请上传您想转换的歌曲 (仅人声部分)") |
|
inp19=gr.Slider(-12, 12, value=0, step=1, label="变调 (默认为0;有正负值,+2为升高两个key)") |
|
inp20=gr.Checkbox(False, label="是否开启自动f0预测", info="勾选即为开启;配合聚类模型f0预测效果更好,仅限语音转换时使用", visible=False) |
|
inp21=gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="聚类模型混合比例", info="0-1之间,0即不启用聚类。使用聚类模型能提升音色相似度,但会导致咬字下降 (如果使用,建议0.5左右)") |
|
inp22=gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (建议保持不变)", visible=False) |
|
inp23=gr.Dropdown( |
|
choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"], |
|
value=default_f0_method, |
|
label="模型推理方法 (crepe推理效果最好)", visible=False |
|
) |
|
btn3=gr.Button("让AI歌手开始演唱吧", variant="primary") |
|
with gr.Column(): |
|
out5=gr.Audio(label="AI歌手演唱🎶") |
|
|
|
btn3.click(fn=predict, inputs=[inp17, inp18, inp19, inp20, inp21, inp22, inp23], outputs=[out5]) |
|
|
|
|
|
gr.Markdown("### <center>注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。</center>") |
|
gr.HTML(''' |
|
<div class="footer"> |
|
<p>🌊🏞️🎶 - 江水东流急,滔滔无尽声。 明·顾璘 |
|
</p> |
|
</div> |
|
''') |
|
|
|
|
|
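# When hosting on a shared GPU (e.g. a Hugging Face Space), calling
# demo.queue() before launch() is a common way to serialize requests;
# this demo launches directly.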
demo.launch(show_error=True)